/* comp_b.c */
/* Compression routines */
#include "copyrite.h"
#include "config.h"
#ifdef I_STRING
#include <string.h>
#else
#include <strings.h>
#endif
#include <ctype.h>
#include "conf.h"
#include "mushdb.h"
#include "intrface.h"
#include "externs.h"
#include "confmagic.h"
/* These use a pathetically simple encoding that takes advantage of the */
/* eighth bit on a char; if you are using an international character set, */
/* they may need substantial patching.
* This is basically the old bigram compression from pl9, with
* automatic table tuning at startup.
*/
#define TOKEN_BIT 0x80 /* if on, it's a token */
#define TOKEN_MASK 0x7f /* for stripping out token value */
#define NUM_TOKENS 128
#define MAX_CHAR 128
#ifndef SAMPLE_SIZE
#define SAMPLE_SIZE 0
#endif
static unsigned char tokens[NUM_TOKENS][3];
static unsigned char token_table[MAX_CHAR][MAX_CHAR];
static int table_initialized = 0;
int init_compress _((FILE * f));
static void list_to_table _((void));
static int list_insert _((unsigned char c1, unsigned char c2, int count));
static int compressed _((const char *s));
/* Initialize the bigram table
* 1. Read indb (up to SAMPLE_SIZE chars, if defined) and count
* the frequency of bigrams
* 2. Cheat the relative frequency of some known special chars
* and upper-case letters
* 3. Construct a bigram table
*/
int
init_compress(f)
FILE *f;
{
int counts[256][256];
unsigned char last;
unsigned char c, d, highc, highd;
int i;
int highest;
int total;
/* Initialize the table */
for (c = 0; c < 255; c++)
for (last = 0; last < 255; last++)
if (!isprint(c) || !isprint(last))
counts[c][last] = -1;
else
counts[c][last] = 0;
/* Scan the db */
total = 0;
last = 255;
while (!feof(f) && (!SAMPLE_SIZE || (total++ < SAMPLE_SIZE))) {
c = fgetc(f);
if (c == '\n')
continue;
if (last == 255) {
last = c;
continue;
}
if (counts[last][c] != -1)
counts[last][c]++;
last = c;
}
/* The ']' character is artificially raised by being the
* start-of-attribute marker in indb. Set it back to '[',
* which it should be balancing...
*/
for (c = 0; c < 255; c++) {
counts[']'][c] = counts['['][c];
counts[c][']'] = counts[c]['['];
}
/* Now run through the counts table and build the tokens table
* by finding the 128 highest frequency bigrams from counts
* When we add a bigram to the table, we also change its
* frequency to -1 so it won't get picked again.
*/
highc = highd = 0;
for (i = 0; i < NUM_TOKENS; i++) {
highest = 0;
for (c = 0; c < 255; c++) {
for (d = 0; d < 255; d++) {
if (counts[c][d] >= highest) {
highc = c;
highd = d;
highest = counts[c][d];
}
}
}
tokens[i][0] = highc;
tokens[i][1] = highd;
token_table[tokens[i][0]][tokens[i][1]] = i | TOKEN_BIT;
counts[highc][highd] = -1;
}
table_initialized = 1;
return 0;
}
static int
compressed(s)
const char *s;
{
while (*s) {
if (*s++ & TOKEN_BIT)
return 1;
}
return 0;
}
unsigned char *
compress(s)
char const *s;
{
static unsigned char buf[BUFFER_LEN];
unsigned char *to;
unsigned char token;
unsigned char *p;
p = (unsigned char *) s;
if (compressed(s))
return p; /* already compressed */
/* tokenize the first characters */
for (to = buf; p[0] && p[1]; to++) {
token = token_table[p[0]][p[1]];
if (token) {
*to = token;
p += 2;
} else {
*to = p[0];
p++;
}
}
/* copy the last character (if any) and null */
while ((*to++ = *p++) != '\0') ;
return buf;
}
char *
uncompress(s)
unsigned char const *s;
{
/* to avoid generating memory problems, this function should be
* used with something of the format
* char tbuf1[BUFFER_LEN];
* strcpy(tbuf1, uncompress(a->value));
* if you are using something of type char *buff, use the
* safe_uncompress function instead.
*/
static char buf[BUFFER_LEN];
char *to;
char *token;
unsigned char *p = (unsigned char *) s;
for (to = buf; *p; p++) {
if (*p & TOKEN_BIT) {
token = (char *) tokens[*p & TOKEN_MASK];
*to++ = *token++;
*to++ = *token;
} else {
*to++ = *p;
}
}
*to++ = *p;
return buf;
}
char *
safe_uncompress(s)
unsigned char const *s;
{
/* this function should be used when you're doing something like
* char *attrib = safe_uncompress(a->value);
* NEVER use it with something like
* char tbuf1[BUFFER_LEN]; strcpy(tbuf1, safe_uncompress(a->value));
* or you will create a horrendous memory leak.
*/
return strdup(uncompress(s));
}