pennmush/game/
pennmush/game/data/
pennmush/game/log/
pennmush/game/save/
pennmush/game/txt/evt/
pennmush/game/txt/nws/
pennmush/os2/
/* comp_b.c */

/* Compression routines */
#include "copyrite.h"
#include "config.h"

#ifdef I_STRING
#include <string.h>
#else
#include <strings.h>
#endif
#include <ctype.h>
#include "conf.h"
#include "mushdb.h"
#include "intrface.h"
#include "externs.h"
#include "confmagic.h"

/* These routines use a pathetically simple encoding that takes advantage
 * of the eighth bit of a char: a byte with that bit set is a token that
 * stands for a pair of characters (a bigram).  If you are using an
 * international character set, they may need substantial patching.
 * This is basically the old bigram compression from pl9, with
 * automatic table tuning at startup.
 */

#define TOKEN_BIT 0x80		/* if on, it's a token */
#define TOKEN_MASK 0x7f		/* for stripping out token value */
/* Number of bigram tokens; token i is stored as the byte (i | TOKEN_BIT) */
#define NUM_TOKENS 128
/* Size of each dimension of token_table, which is indexed by character value */
#define MAX_CHAR 128
/* Maximum number of characters init_compress() reads from its input.
 * 0 means no limit: the whole stream is scanned.
 */
#ifndef SAMPLE_SIZE
#define SAMPLE_SIZE	0
#endif


/* tokens[i][0] and tokens[i][1] are the two characters that token i
 * expands to.  The third byte is never written by the code in this view.
 */
static unsigned char tokens[NUM_TOKENS][3];
/* token_table[c1][c2] is the token byte (TOKEN_BIT set) for the bigram
 * c1,c2, or 0 if that bigram has no token.
 */
static unsigned char token_table[MAX_CHAR][MAX_CHAR];
/* Set to 1 at the end of init_compress(); not read by the code in this view */
static int table_initialized = 0;


int init_compress _((FILE * f));
/* NOTE(review): no definition of list_to_table() or list_insert() is
 * visible in this view -- confirm whether these prototypes are still needed.
 */
static void list_to_table _((void));
static int list_insert _((unsigned char c1, unsigned char c2, int count));
static int compressed _((const char *s));

/* Initialize the bigram table
 * 1. Read indb (up to SAMPLE_SIZE chars, if defined) and count
 *    the frequency of bigrams
 * 2. Cheat the relative frequency of some known special chars
 *    and upper-case letters
 * 3. Construct a bigram table
 */
int
init_compress(f)
    FILE *f;
{
  int counts[256][256];
  unsigned char last;
  unsigned char c, d, highc, highd;
  int i;
  int highest;
  int total;

  /* Initialize the table */
  for (c = 0; c < 255; c++)
    for (last = 0; last < 255; last++)
      if (!isprint(c) || !isprint(last))
	counts[c][last] = -1;
      else
	counts[c][last] = 0;


  /* Scan the db */
  total = 0;
  last = 255;
  while (!feof(f) && (!SAMPLE_SIZE || (total++ < SAMPLE_SIZE))) {
    c = fgetc(f);
    if (c == '\n')
      continue;
    if (last == 255) {
      last = c;
      continue;
    }
    if (counts[last][c] != -1)
      counts[last][c]++;
    last = c;
  }

  /* The ']' character is artificially raised by being the
   * start-of-attribute marker in indb.  Set it back to '[',
   * which it should be balancing...
   */
  for (c = 0; c < 255; c++) {
    counts[']'][c] = counts['['][c];
    counts[c][']'] = counts[c]['['];
  }

  /* Now run through the counts table and build the tokens table
   * by finding the 128 highest frequency bigrams from counts
   * When we add a bigram to the table, we also change its
   * frequency to -1 so it won't get picked again.
   */
  highc = highd = 0;
  for (i = 0; i < NUM_TOKENS; i++) {
    highest = 0;
    for (c = 0; c < 255; c++) {
      for (d = 0; d < 255; d++) {
	if (counts[c][d] >= highest) {
	  highc = c;
	  highd = d;
	  highest = counts[c][d];
	}
      }
    }
    tokens[i][0] = highc;
    tokens[i][1] = highd;
    token_table[tokens[i][0]][tokens[i][1]] = i | TOKEN_BIT;
    counts[highc][highd] = -1;
  }

  table_initialized = 1;
  return 0;
}


/* Report whether the string s contains any byte with TOKEN_BIT set.
 * Returns 1 if such a byte is found before the terminating NUL,
 * 0 otherwise (including for the empty string).
 */
static int
compressed(s)
    const char *s;
{
  const char *scan;

  for (scan = s; *scan != '\0'; scan++) {
    if ((*scan & TOKEN_BIT) != 0)
      return 1;
  }
  return 0;
}

/* Compress the string s by replacing known bigrams with one-byte tokens.
 *
 * If s already contains a byte with TOKEN_BIT set, it is taken to be
 * compressed and s itself is returned (cast to unsigned char *).
 * Otherwise the result is built in a static buffer that is overwritten
 * by the next call.  No length check is made against BUFFER_LEN; the
 * output is never longer than the input.
 */
unsigned char *
compress(s)
    char const *s;
{
  static unsigned char buf[BUFFER_LEN];
  unsigned char *src = (unsigned char *) s;
  unsigned char *dst = buf;
  unsigned char tok;

  if (compressed(s))
    return src;			/* already compressed */

  /* As long as two characters remain, try to emit them as one token */
  while (src[0] != '\0' && src[1] != '\0') {
    tok = token_table[src[0]][src[1]];
    if (tok != 0) {
      *dst++ = tok;
      src += 2;
    } else {
      *dst++ = *src++;
    }
  }

  /* At most one character is left over; copy it, then terminate */
  if (*src != '\0')
    *dst++ = *src;
  *dst = '\0';

  return buf;
}

/* Expand the compressed string s: every byte with TOKEN_BIT set is
 * replaced by the two characters of its bigram, every other byte is
 * copied unchanged.
 *
 * Returns a pointer to a static buffer that is overwritten by the next
 * call.  The result is always NUL-terminated and never longer than
 * BUFFER_LEN - 1 characters; input that would expand beyond that is
 * truncated rather than written past the end of the buffer.
 */
char *
uncompress(s)
    unsigned char const *s;
{
  /* to avoid generating memory problems, this function should be
   * used with something of the format
   * char tbuf1[BUFFER_LEN];
   * strcpy(tbuf1, uncompress(a->value));
   * if you are using something of type char *buff, use the
   * safe_uncompress function instead.
   */

  static char buf[BUFFER_LEN];
  char *to = buf;
  char *limit = buf + BUFFER_LEN - 1;	/* last slot, kept for the NUL */
  unsigned char const *token;
  unsigned char const *p;

  for (p = s; *p; p++) {
    if (*p & TOKEN_BIT) {
      /* A token needs room for both of its characters */
      if (limit - to < 2)
        break;
      token = tokens[*p & TOKEN_MASK];
      *to++ = (char) token[0];
      *to++ = (char) token[1];
    } else {
      if (to >= limit)
        break;
      *to++ = (char) *p;
    }
  }

  *to = '\0';

  return buf;
}

/* Uncompress s and return a copy of the result made with strdup().
 *
 * Use this when the result is kept through a pointer, e.g.
 *   char *attrib = safe_uncompress(a->value);
 * NEVER use it for something like
 *   char tbuf1[BUFFER_LEN]; strcpy(tbuf1, safe_uncompress(a->value));
 * because the pointer to the copy is then lost and the memory leaks.
 */
char *
safe_uncompress(s)
    unsigned char const *s;
{
  char *expanded;

  expanded = uncompress(s);
  return strdup(expanded);
}