/** * \file comp_w.c * * \brief Word-based compression. * * Table-lookup (word) compress, written by Nick Gammon. 21/Oct/95. * * This method maintains a table of 32768 words, where a "word" is defined * as a sequence of alpha or numeric characters (such as "dog123"). * * Compression * ----------- * * The text to be compressed is broken up into "words" and other symbols * (e.g. commas, brackets, spaces and so on). * * Words are looked up in the word table by using a hash-table lookup. If * found, the "index" into the table is emitted with the high order bit set. * If not found, the word is added to the table, and is then emitted as * described above. * * For example: "The dog and the other dog sat on the mat (eating fish)." * would compress as: * * 0x8001 The * 0x8002 dog * 0x8003 and * 0x8004 the * 0x8005 other * 0x8002 dog * 0x8007 sat * 0x8008 on * 0x8004 the * 0x8009 mat * 0x28 ( * 0x800A eating * 0x800B fish) * 0x2E . * * In the above example, the uncompressed text is 55 bytes, and the compressed * text is 26 bytes. * * For simplicity, the above example assumes that the words hash to consecutive * table positions. * * Note that the trailing punctuation character (space, period or whatever) is * considered _part_ of the word. This is to save having to store multiple * spaces between each word, and is relying on the fact that a certain word is * usually followed by the same punctuation. For example, the word "and" would * normally be followed by a space. * * In the above example, the space following "mat" is stored in the table, and * thus the "(" character had to be output separately. The ")" following the * word "fish" is considered part of the word and is stored in the table, * however the last period was output as a separate character. * * Note how the high-order bit is turned on for words in the table, this is so * the decompression routine can detect table lookup characters from ordinary * ones. Also, any table index with a low-order byte of zero cannot be used, * as this would cause the resulting "string" to be prematurely truncated (by * strcpy, strcmp etc.). Thus the lookup routine skips any positions that hash * to the value 0xXX00. (There are only 256 such indices). * * Also, to prevent characters which the user might type in with the high order * bit set causing decompression confusion, all text is stripped of its high * order bit before being added to the table or emitted. * * In the event of two words hashing to the same value, the compression routine * scans forwards for COLLISION_LIMIT entries, looking for a match or a spare * table entry. If none is found, the word is output "as is" (i.e. * uncompressed). You might speed up compression slightly by lowering * COLLISION_LIMIT, at the cost of slightly lower compression ratios. * * The collision limit does not affect _decompression_ as that merely involves * a direct table lookup. * * Decompression * ------------- * * Decompression involves a simple loop, in which each byte of the compressed * text is examined. If it has the high-order bit clear, it is just emitted. * If the high order bit is set it, along with the next byte in the compressed * text, are used to index into the word lookup table. The word found there is * then emitted. * * Speed * ----- * * Tests conducted on (hopefully typical) text show that decompression is about * 4 times as fast as Huffman decompression. * * Compression however is about 3.5 times as _slow_ as Huffman compression. * * The slower compression time is considered acceptable on the grounds that * text is much more often _decompressed_ in a MUSH than compressed. Compression * mainly takes part at database load time (say, once a week) whereas * decompression take part every hour, as the database is dumped to disk, and * whenever an object description is displayed, or an attribute searched for, * * Compression ratio * ----------------- * * The "table compression" method has a poor compression ratio for small amounts * of text, because of the overhead of the 32768 pointers, and the data stored * in them. However, as the database size increases, the ratio improves because * the table overhead becomes progressively less significant. * * The break-even points is with about 1.5 Mb of text, where both the table * compression and Huffman compress to about 63% of the size of the original. * * After that, the compression ratio gradually improves until reaching somewhere * between 40% and 50% of the size of the original, as the amount of text to * compress reaches 10 Mb. * * The nature of Huffman compression however is such that it will always be * fixed at about 63% regardless of the amount of data compressed. */ #include "copyrite.h" #include "config.h" #include <string.h> #include <stdlib.h> #include <ctype.h> #include "conf.h" #include "externs.h" #include "mushdb.h" #include "mymalloc.h" #include "confmagic.h" #define MAXTABLE 32768 /**< Maximum words in the table */ #define MAXWORDS 100 /**< Maximum size of a word */ #define COLLISION_LIMIT 20 /**< Maximum allowed collisions */ #define COMPRESS_HASH_MASK 0x7FFF /**< 32767 in hex */ #define TABLE_FLAG 0x80 /**< Distinguish a table */ #define TABLE_MASK 0x7F /**< Mask out words within a table */ /* Table of words */ static char *words[MAXTABLE]; static size_t words_len[MAXTABLE]; /* The word we are currently compressing */ static char word[MAXWORDS + 2]; static size_t wordpos = 0; /* Stats */ #ifdef COMP_STATS static long total_mallocs = 0; static long total_uncomp = 0; static long total_comp = 0; static long total_entries = 0; #endif /* Work pointer for compression */ static unsigned char *b; static void output_previous_word(void); int init_compress(FILE * f); #ifdef COMP_STATS void compress_stats(long *entries, long *mem_used, long *total_uncompressed, long *total_compressed); #endif static unsigned int hash_fn(const char *s, int hashtab_mask); static void output_previous_word(void) { char *p; int i, j; word[wordpos++] = 0; /* word's trailing null */ /* Don't bother putting single or double letter words in the table */ if (wordpos <= 3) { p = word; while (*p) *b++ = *p++; return; } /* search table to see if word is already in it; */ for (i = hash_fn(word, COMPRESS_HASH_MASK), j = 0; i < MAXTABLE && (words[i] || (i & 0xFF) == 0) && j < COLLISION_LIMIT; i++, j++) if (words[i]) if (strcmp(word, words[i]) == 0) { *b++ = (i >> 8) | TABLE_FLAG; *b++ = i & 0xFF; return; } /* not in table, add to it */ if ((i & 0xFF) == 0) { i++; /* make sure we don't have a null in the message */ j++; } /* Can't add to table if full */ if (i >= MAXTABLE || j >= COLLISION_LIMIT) { p = word; while (*p) *b++ = *p++; return; } words[i] = malloc(wordpos); if (!words[i]) mush_panic("Out of memory in string compression routine"); #ifdef COMP_STATS total_mallocs += wordpos; total_entries++; #endif strncpy(words[i], word, wordpos); words_len[i] = wordpos; *b++ = (i >> 8) | TABLE_FLAG; *b++ = i & 0xFF; } /* end of output_previous_word */ /** Word-compress a string. * * Important notes: * This function mallocs memory that should be freed by the caller! * The caller is also currently responsible for adding mem checks * Don't use it to compress strings longer than BUFFER_LEN or the * later uncompression will not go well. * * \param s string to be compressed. * \return newly allocated compressed string. */ unsigned char * compress(char const *s) { const unsigned char *p; static unsigned char buf[BUFFER_LEN]; p = (unsigned char *) s; b = buf; wordpos = 0; /* break up input into words */ while (*p) { if (!(isdigit(*p) || isalpha(*p)) || wordpos >= MAXWORDS) { if (wordpos) { word[wordpos++] = *p & 0x7F; /* add trailing punctuation */ output_previous_word(); wordpos = 0; } else *b++ = *p & 0x7F; } else word[wordpos++] = *p & 0x7F; p++; } if (wordpos) output_previous_word(); *b = 0; /* trailing null */ #ifdef COMP_STATS total_comp += u_strlen(buf); /* calculate size of compressed text */ total_uncomp += strlen(s); /* calculate size of uncompressed text */ #endif return u_strdup(buf); } /* end of compress; */ /** Word-uncompress a string. * To avoid generating memory problems, this function should be * used with something of the format * \verbatim * char tbuf1[BUFFER_LEN]; * strcpy(tbuf1, uncompress(a->value)); * \endverbatim * if you are using something of type char *buff, use the * safe_uncompress function instead. * * \param s a compressed string. * \return a pointer to a static buffer containing the uncompressed string. */ char * uncompress(unsigned char const *s) { const unsigned char *p; char c; int i; static char buf[BUFFER_LEN]; buf[0] = '\0'; if (!s || !*s) return buf; p = (unsigned char *) s; b = buf; while (*p) { c = *p; if (c & TABLE_FLAG) { i = ((c & TABLE_MASK) << 8) | *(++p); if (i >= MAXTABLE || words[i] == NULL) { static int panicking = 0; if (panicking) { do_rawlog(LT_ERR, "Error in string decompression occurred during panic dump."); exit(1); } else { panicking = 1; /* don't panic from within panic */ do_rawlog(LT_ERR, "Error in string decompression, i = %i", i); mush_panic("Fatal error in decompression"); } } strncpy((char *) b, words[i], words_len[i]); b += words_len[i] - 1; } else *b++ = c; p++; } *b++ = 0; /* trailing null */ return buf; } /* end of uncompress; */ /** Word-uncompress a string, allocating memory. * this function should be used when you're doing something like * \verbatim * char *attrib = safe_uncompress(a->value); * * NEVER use it with something like * * char tbuf1[BUFFER_LEN]; * strcpy(tbuf1, safe_uncompress(a->value)); * \endverbatim * or you will create a horrendous memory leak. * * \param s compressed string to uncompress. * \return pointer to newly allocated string containing uncompressed text. */ char * safe_uncompress(unsigned char const *s) { return (char *) strdup((char *) uncompress(s)); } /** Initialize the word compression. * This function clears the words table the first time through. * \param f (unused). */ int init_compress(FILE * f __attribute__ ((__unused__))) { memset(words, 0, sizeof words); memset(words_len, 0, sizeof words_len); return 0; } #ifdef COMP_STATS /** Return word-compression statistics. */ void compress_stats(long *entries, long *mem_used, long *total_uncompressed, long *total_compressed) { *entries = total_entries; *mem_used = total_mallocs; *total_uncompressed = total_uncomp; *total_compressed = total_comp; } #endif static unsigned hash_fn(const char *s, int hashtab_mask) { /* hash function, using masks (based on TinyMUSH 2.0) */ unsigned hashval; const char *p; for (hashval = 0, p = s; *p; p++) hashval = (hashval << 5) + hashval + *p; return (hashval & hashtab_mask); }