Listing 1: compress.c Byte pair encoding compression
/* compress.c -- Byte Pair Encoding compression */ /* Copyright 1996 Philip Gage */ #include <stdio.h> #include <stdlib.h> #define MAXSIZE 65535L /* Input file buffer size */ #define HASHSIZE 8192 /* Hash table size, power of 2 */ #define THRESHOLD 3 /* Increase for speed, min 3 */ void compress (FILE *input, FILE *output) { unsigned char *buffer, *left, *right, *count; unsigned char a, b, bestcount=0, pairtable[128][2]; int i, j, index, bestindex, code=128; size_t size; /* Dynamically allocate buffers and check for errors */ buffer = (unsigned char *)malloc(MAXSIZE); left = (unsigned char *)malloc(HASHSIZE); right = (unsigned char *)malloc(HASHSIZE); count = (unsigned char *)malloc(HASHSIZE); if (buffer==NULL || left==NULL || right==NULL || count==NULL) { printf("Error allocating memory\n"); exit(1); } /* Read input file into buffer and check for errors */ size = fread(buffer,1,MAXSIZE,input); if (size == MAXSIZE) { printf("File too big\n"); exit(1); } for (i=0; i<size; i++) if (buffer[i] > 127) { printf("This program works only on text files\n"); exit(1); } do { /* Replace frequent pairs with bytes 128..255 */ /* Enter counts of all byte pairs into hash table */ memset(count,0,HASHSIZE); for (i=0; i<size-1; i++) { a = buffer[i]; b = buffer[i+1]; index = (a ^ (b << 6)) & (HASHSIZE-1); while ((left[index] != a || right[index] != b) && count[index] != 0) index = (index + 1) & (HASHSIZE-1); left[index] = a; right[index] = b; if (count[index] < 255) ++count[index]; } /* Search hash table for most frequent pair */ bestcount = THRESHOLD - 1; for (i=0; i<HASHSIZE; i++) { if (count[i] > bestcount) { bestcount = count[i]; bestindex = i; } } /* Compress if enough occurrences of pair */ if (bestcount >= THRESHOLD) { /* Add pair to table using code as index */ a = pairtable[code-128][0] = left[bestindex]; b = pairtable[code-128][1] = right[bestindex]; /* Replace all pair occurrences with unused byte */ for (i=0, j=0; i<size; i++, j++) if (a == buffer[i] && b == buffer[i+1]) { buffer[j] = code; ++i; } else buffer[j] = buffer[i]; size = j; } else break; } while (++code < 255); /* Write pair count, pair table and compressed data */ putc(code,output); fwrite(pairtable,2,code-128,output); fwrite(buffer,1,size,output); free(buffer); free(left); free(right); free(count); } int main (int argc, char **argv) { FILE *in, *out; if (argc != 3) printf("Usage: compress inputfile outputfile\n"); else if ((in=fopen(argv[1],"rb"))==NULL) printf("Error opening input %s\n",argv[1]); else if ((out=fopen(argv[2],"wb"))==NULL) printf("Error opening output %s\n",argv[2]); else { compress(in,out); fclose(out); fclose(in); } return 0; }/* End of File */