/************************************************************************** * * mgpp_fast_comp_dict.cpp -- Program to generate a fast compression dictionary * Copyright (C) 1994 Neil Sharman * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * **************************************************************************/ #define _XOPEN_SOURCE 1 #define _XOPEN_SOURCE_EXTENDED 1 // need this to avoid bizarre compiler problems under VC++ 6.0 #if defined (__WIN32__) && !defined (GSDL_USE_IOS_H) # include #endif /* getopt is in posix.2, so cygwin should have it in unistd, but doesn't */ #if defined (__WIN32__) || defined (__CYGWIN__) # include "getopt_old.h" #else # include #endif #include "sysfuncs.h" #include "huffman.h" #include "messages.h" #include "memlib.h" #include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */ #include "local_strings.h" #include "mg.h" #include "text.h" #include "invf.h" #include "mg_files.h" #include "locallib.h" #include "words.h" #define ALIGN_SIZE(p,t) (((p) + (sizeof(t)-1)) & (~(sizeof(t)-1))) #define WORDNO(p, base) ((((char*)(p))-((char*)(base)))/sizeof(u_char*)) #define FIXUP(p) (fixup[WORDNO(p,buffer)/8] |= (1<<(WORDNO(p, buffer) & 7))) #define IS_FIXUP(p) ((fixup[WORDNO(p,buffer)/8] & (1<<(WORDNO(p, buffer) & 7))) != 0) #define FIXUP_VALS(vals) do { \ int i; \ for (i=0; i < MAX_HUFFCODE_LEN+1; ++i) \ FIXUP(&vals[i]); \ } while(0) static u_long mem_for_comp_dict (char *filename); static void load_comp_dict (char *filename); static void save_fast_dict (char *filename); static void unfixup_buffer (void); static void *buffer; static void *cur; static u_char *fixup; static u_long mem, fixup_mem; int main (int argc, char **argv) { int ch; char *filename = ""; opterr = 0; msg_prefix = argv[0]; while ((ch = getopt (argc, argv, "f:d:h")) != -1) switch (ch) { case 'f': /* input file */ filename = optarg; break; case 'd': set_basepath (optarg); break; case 'h': case '?': fprintf (stderr, "usage: %s [-f input_file] [-d data directory] [-h]\n", argv[0]); exit (1); } mem = mem_for_comp_dict (filename); fixup_mem = (ALIGN_SIZE (mem, u_char *) / sizeof (u_char *) + 7) / 8; cur = buffer = Xmalloc (mem); memset (buffer, 0, mem); fixup = (u_char *) Xmalloc (fixup_mem); memset (fixup, 0, fixup_mem); #ifndef SILENT Message ("Estimated memory for fast_dict %u", mem); Message ("Estimated memory for fixups %u", fixup_mem); #endif load_comp_dict (filename); #ifndef SILENT Message ("Actual memory for fast_dict %u", (char *) cur - (char *) buffer); #endif if ((u_long) cur > (u_long) buffer + mem) FatalError (1, "The buffer was not big enough for the dictionary"); { /* [RPAP - Jan 97: Endian Ordering] */ compression_dict *cd = (compression_dict *) buffer; int i, which; /* cfh */ for (which = 0; which <= 1; ++which) { int j; HTONSI(cd->cfh[which]->hd.num_codes); HTONSI(cd->cfh[which]->hd.mincodelen); HTONSI(cd->cfh[which]->hd.maxcodelen); for (j = 0; j < MAX_HUFFCODE_LEN + 1; ++j) { HTONSI(cd->cfh[which]->hd.lencount[j]); HTONUL(cd->cfh[which]->hd.min_code[j]); } HTONUL(cd->cfh[which]->uncompressed_size); for (j = 0; j < MAX_HUFFCODE_LEN + 1; ++j) HTONUL(cd->cfh[which]->huff_words_size[j]); } HTONUL(cd->MemForCompDict); /* ad */ if (cd->cdh.novel_method == MG_NOVEL_DELTA || cd->cdh.novel_method == MG_NOVEL_HYBRID) for (which = 0; which <= 1; ++which) { int j; HTONUL(cd->ad->afh[which].num_frags); HTONUL(cd->ad->afh[which].mem_for_frags); for (j = 0; j < 33; ++j) { HTONSI(cd->ad->blk_start[which][j]); HTONSI(cd->ad->blk_end[which][j]); } } /* cdh */ HTONUL(cd->cdh.dict_type); HTONUL(cd->cdh.novel_method); for (i = 0; i < TEXT_PARAMS; ++i) HTONUL(cd->cdh.params[which]); HTONUL(cd->cdh.num_words[0]); HTONUL(cd->cdh.num_words[1]); HTONUL(cd->cdh.num_word_chars[0]); HTONUL(cd->cdh.num_word_chars[1]); HTONUL(cd->cdh.lookback); HTONSI(cd->fast_loaded); } unfixup_buffer (); save_fast_dict (filename); return (0); } static void unfixup_buffer () { u_long *p; for (p = (u_long *) buffer; (u_long) p < (u_long) cur; ++p) { if (IS_FIXUP (p)) *p = *p - (u_long) buffer; } } static u_long mem_for_aux_dict (compression_dict_header * /*cdh*/, char *filename) { int i; u_long mem = sizeof (auxiliary_dict); FILE *aux; aux = open_file (filename, TEXT_DICT_AUX_SUFFIX, "rb", MAGIC_AUX_DICT, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */ for (i = 0; i <= 1; ++i) { aux_frags_header afh; fread (&afh, sizeof (afh), 1, aux); NTOHUL(afh.num_frags); /* [RPAP - Jan 97: Endian Ordering] */ NTOHUL(afh.mem_for_frags); /* [RPAP - Jan 97: Endian Ordering] */ mem += afh.num_frags * sizeof (u_char *); mem = ALIGN_SIZE (mem + afh.mem_for_frags, u_char *); fseek (aux, afh.mem_for_frags, SEEK_CUR); } fclose (aux); return mem; } static u_long mem_for_words (FILE * dict, compression_dict_header * cdh, comp_frags_header * cfh) { u_long mem = 0; long i, lookback; int ptrs_reqd = 0; int mem_reqd = 0; lookback = cdh->lookback; for (i = cfh->hd.mincodelen; i <= cfh->hd.maxcodelen; ++i) { ptrs_reqd += (cfh->hd.lencount[i] + ((1 << lookback) - 1)) >> lookback; mem_reqd += cfh->huff_words_size[i]; } mem += ptrs_reqd * sizeof (u_char *); mem += (MAX_HUFFCODE_LEN + 1) * sizeof (u_char **); mem += mem_reqd; for (i = 0; i < cfh->hd.num_codes; ++i) { register int val; for (val = getc (dict) & 0xf; val; --val) getc (dict); } return ALIGN_SIZE (mem, u_char *); } static u_long mem_skip_hd(FILE *dict, u_long mem) { huff_data hd; mem += sizeof (hd); Read_Huffman_Data (dict, &hd, NULL, NULL); if (hd.clens) delete []hd.clens; mem += hd.num_codes * sizeof (unsigned long); mem += (MAX_HUFFCODE_LEN + 1) * sizeof (unsigned long *); return mem; } static u_long mem_read_cfh(FILE *dict, compression_dict_header *cdh, comp_frags_header *cfh, u_long mem) { mem += sizeof (comp_frags_header); Read_cfh (dict, cfh, NULL, NULL); /* Don't need to count the space for the clens of the huffman data */ mem += mem_for_words (dict, cdh, cfh); if (cfh->hd.clens) delete []cfh->hd.clens; return mem; } static u_long mem_for_comp_dict (char *filename) { u_long mem = sizeof (compression_dict); compression_dict_header cdh; comp_frags_header cfh; FILE *dict; int which; int i; /* [RPAP - Jan 97: Endian Ordering] */ dict = open_file (filename, TEXT_DICT_SUFFIX, "rb", MAGIC_DICT, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */ fread (&cdh, sizeof (cdh), 1, dict); /* [RPAP - Jan 97: Endian Ordering] */ NTOHUL(cdh.dict_type); NTOHUL(cdh.novel_method); for (i = 0; i < TEXT_PARAMS; ++i) NTOHUL(cdh.params[i]); NTOHUL(cdh.num_words[0]); NTOHUL(cdh.num_words[1]); NTOHUL(cdh.num_word_chars[0]); NTOHUL(cdh.num_word_chars[1]); NTOHUL(cdh.lookback); for (which = 0; which < 2; ++which) switch (cdh.dict_type) { case MG_COMPLETE_DICTIONARY: { mem = mem_read_cfh(dict, &cdh, &cfh, mem); } break; case MG_PARTIAL_DICTIONARY: { // huff_data hd; if (cdh.num_words[which]) { mem = mem_read_cfh(dict, &cdh, &cfh, mem); } mem = mem_skip_hd(dict, mem); mem = mem_skip_hd(dict, mem); } break; case MG_SEED_DICTIONARY: { // huff_data hd; if (cdh.num_words[which]) { mem = mem_read_cfh(dict, &cdh, &cfh, mem); } switch (cdh.novel_method) { case MG_NOVEL_HUFFMAN_CHARS: mem = mem_skip_hd(dict, mem); mem = mem_skip_hd(dict, mem); break; case MG_NOVEL_DELTA: break; case MG_NOVEL_HYBRID: break; } break; } } fclose (dict); if (cdh.novel_method == MG_NOVEL_DELTA || cdh.novel_method == MG_NOVEL_HYBRID) mem += mem_for_aux_dict (&cdh, filename); return ALIGN_SIZE (mem, u_char *); } void * getmem (u_long size, int align) { void *res; cur = (void *) (((u_long) cur + (align - 1)) & (~(align - 1))); res = cur; cur = (char *) cur + size; if ((u_long) cur > (u_long)buffer + mem) FatalError (1, "The buffer was not big enough for the dictionary"); return res; } static auxiliary_dict * LoadAuxDict (compression_dict_header * cdh, char *filename) { auxiliary_dict *ad; int i; FILE *aux; aux = open_file (filename, TEXT_DICT_AUX_SUFFIX, "rb", MAGIC_AUX_DICT, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */ ad = (auxiliary_dict *) getmem (sizeof (auxiliary_dict), sizeof (u_char *)); for (i = 0; i <= 1; ++i) { unsigned int j; u_char *pos; fread (&ad->afh[i], sizeof (aux_frags_header), 1, aux); /* [RPAP - Jan 97: Endian Ordering] */ NTOHUL(ad->afh[i].num_frags); NTOHUL(ad->afh[i].mem_for_frags); ad->word_data[i] = (u_char *) getmem (ad->afh[i].mem_for_frags, 1); FIXUP (&ad->word_data[i]); ad->words[i] = (u_char **) getmem (ad->afh[i].num_frags * sizeof (u_char *), sizeof (u_char *)); FIXUP (&ad->words[i]); fread (ad->word_data[i], ad->afh[i].mem_for_frags, sizeof (u_char), aux); pos = ad->word_data[i]; for (j = 0; j < ad->afh[i].num_frags; ++j) { ad->words[i][j] = pos; FIXUP (&ad->words[i][j]); pos += *pos + 1; } if (cdh->novel_method == MG_NOVEL_HYBRID) { int num; num = 1; ad->blk_start[i][0] = 0; ad->blk_end[i][0] = cdh->num_words[i] - 1; while (num < 33) { ad->blk_start[i][num] = ad->blk_end[i][num - 1] + 1; ad->blk_end[i][num] = ad->blk_start[i][num] + (ad->blk_end[i][num - 1] - ad->blk_start[i][num - 1]) * 2; ++num; } } } fclose (aux); return (ad); } static u_char *** ReadInWords (FILE * dict, compression_dict * cd, comp_frags_header * cfh, u_char ** escape) { int i, lookback; int ptrs_reqd = 0; int mem_reqd = 0; int num_set[MAX_HUFFCODE_LEN + 1]; u_char *next_word[MAX_HUFFCODE_LEN + 1]; u_char **vals; u_char ***values; u_char word[MAXWORDLEN + 1]; u_char last_word[MAX_HUFFCODE_LEN + 1][MAXWORDLEN + 1]; lookback = cd->cdh.lookback; for (i = cfh->hd.mincodelen; i <= cfh->hd.maxcodelen; ++i) { ptrs_reqd += (cfh->hd.lencount[i] + ((1 << lookback) - 1)) >> lookback; mem_reqd += cfh->huff_words_size[i]; } vals = (u_char **) getmem (ptrs_reqd * sizeof (*vals), sizeof (u_char *)); values = (u_char ***) getmem ((MAX_HUFFCODE_LEN + 1) * sizeof (u_char **), sizeof (u_char **)); next_word[0] = (u_char *) getmem (mem_reqd, sizeof (char)); cd->MemForCompDict += ptrs_reqd * sizeof (*vals) + (MAX_HUFFCODE_LEN + 1) * sizeof (u_char **) + mem_reqd; values[0] = vals; FIXUP (&values[0]); values[0][0] = next_word[0]; FIXUP (&values[0][0]); for (i = 1; i <= cfh->hd.maxcodelen; ++i) { int next_start = (values[i - 1] - vals) + ((cfh->hd.lencount[i - 1] + ((1 << lookback) - 1)) >> lookback); values[i] = &vals[next_start]; FIXUP (&values[i]); next_word[i] = next_word[i - 1] + cfh->huff_words_size[i - 1]; values[i][0] = next_word[i]; FIXUP (&values[i][0]); } memset (num_set, '\0', sizeof (num_set)); for (i = 0; i < cfh->hd.num_codes; ++i) { register int val, copy; register int len = cfh->hd.clens[i]; val = getc (dict); copy = (val >> 4) & 0xf; val &= 0xf; fread (word + copy + 1, sizeof (u_char), val, dict); *word = val + copy; if ((num_set[len] & ((1 << lookback) - 1)) == 0) { values[len][num_set[len] >> lookback] = next_word[len]; FIXUP (&values[len][num_set[len] >> lookback]); memcpy (next_word[len], word, *word + 1); if (escape && i == cfh->hd.num_codes - 1) { *escape = next_word[len]; FIXUP (escape); } next_word[len] += *word + 1; } else { copy = prefixlen (last_word[len], word); memcpy (next_word[len] + 1, word + copy + 1, *word - copy); *next_word[len] = (copy << 4) + (*word - copy); if (escape && i == cfh->hd.num_codes - 1) { *escape = next_word[len]; FIXUP (escape); } next_word[len] += (*word - copy) + 1; } memcpy (last_word[len], word, *word + 1); ++num_set[len]; } if (cfh->hd.clens) delete []cfh->hd.clens; cfh->hd.clens = NULL; return values; } static unsigned long ** Generate_Fast_Huffman_Vals (huff_data * data, u_long * mem) { int i; unsigned long *fcode[MAX_HUFFCODE_LEN + 1]; unsigned long **values; unsigned long *vals; if (!data) return (NULL); vals = (unsigned long *) getmem (data->num_codes * sizeof (*vals), sizeof (long *)); values = (unsigned long **) getmem ((MAX_HUFFCODE_LEN + 1) * sizeof (unsigned long *), sizeof (long *)); memset (values, '\0', (MAX_HUFFCODE_LEN + 1) * sizeof (unsigned long *)); if (mem) *mem += data->num_codes * sizeof (*vals) + (MAX_HUFFCODE_LEN + 1) * sizeof (unsigned long *); fcode[0] = values[0] = &vals[0]; FIXUP (&values[0]); for (i = 1; i <= data->maxcodelen; ++i) { fcode[i] = values[i] = &vals[(values[i - 1] - vals) + data->lencount[i - 1]]; FIXUP (&values[i]); } for (i = 0; i < data->num_codes; ++i) if (data->clens[i]) *fcode[(int) (data->clens[i])]++ = i; return (values); } static void load_read_huffman(FILE *dict, compression_dict *cd, int which) { huff_data *hd; u_long **vals; hd = (huff_data *) getmem (sizeof (huff_data), sizeof (char *)); cd->MemForCompDict += sizeof (huff_data); Read_Huffman_Data (dict, hd, &cd->MemForCompDict, NULL); vals = Generate_Fast_Huffman_Vals (hd, &cd->MemForCompDict); cd->chars_huff[which] = hd; FIXUP (&cd->chars_huff[which]); cd->chars_vals[which] = vals; FIXUP (&cd->chars_vals[which]); if (hd->clens) delete []hd->clens; hd->clens = NULL; } static void load_read_words(FILE *dict, compression_dict *cd, int which, int fixescape) { cd->cfh[which] = (comp_frags_header *) getmem (sizeof (*cd->cfh[which]), sizeof (u_char *)); cd->MemForCompDict += sizeof (*cd->cfh[which]); Read_cfh (dict, cd->cfh[which], &cd->MemForCompDict, NULL); cd->values[which] = ReadInWords (dict, cd, cd->cfh[which], NULL); FIXUP (&cd->cfh[which]); FIXUP (&cd->values[which]); if (fixescape) { FIXUP(&cd->escape[which]); } else { cd->escape[which] = NULL; } } static void load_comp_dict (char *filename) { FILE *dict; int which; compression_dict *cd; cd = (compression_dict *) getmem (sizeof (compression_dict), sizeof (u_char *)); cd->fast_loaded = 1; dict = open_file (filename, TEXT_DICT_SUFFIX, "rb", MAGIC_DICT, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */ Read_cdh (dict, &cd->cdh, NULL, NULL); for (which = 0; which < 2; ++which) switch (cd->cdh.dict_type) { case MG_COMPLETE_DICTIONARY: { load_read_words(dict, cd, which, 0); } break; case MG_PARTIAL_DICTIONARY: { if (cd->cdh.num_words[which]) { load_read_words(dict, cd, which, 1); } load_read_huffman(dict, cd, which); load_read_huffman(dict, cd, which); } break; case MG_SEED_DICTIONARY: { if (cd->cdh.num_words[which]) { load_read_words(dict, cd, which, 1); } switch (cd->cdh.novel_method) { case MG_NOVEL_HUFFMAN_CHARS: load_read_huffman(dict, cd, which); load_read_huffman(dict, cd, which); break; case MG_NOVEL_DELTA: break; case MG_NOVEL_HYBRID: break; } break; } } fclose (dict); if (cd->cdh.novel_method == MG_NOVEL_DELTA || cd->cdh.novel_method == MG_NOVEL_HYBRID) { cd->ad = LoadAuxDict (&cd->cdh, filename); FIXUP (&cd->ad); } } static void save_fast_dict (char *filename) { FILE *fdict; fdict = create_file (filename, TEXT_DICT_FAST_SUFFIX, "wb", MAGIC_FAST_DICT, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */ { u_long *p; for (p = (u_long *) buffer; (u_long) p < (u_long) cur; ++p) { if (IS_FIXUP (p)) HTONUL(*p); } } /* [RPAP - Jan 97: Endian Ordering] */ HTONUL(mem); HTONUL(fixup_mem); fwrite (&mem, sizeof (mem), 1, fdict); fwrite (&fixup_mem, sizeof (fixup_mem), 1, fdict); /* [RPAP - Jan 97: Endian Ordering] */ NTOHUL(mem); NTOHUL(fixup_mem); fwrite (buffer, sizeof (u_char), mem, fdict); fwrite (fixup, sizeof (u_char), fixup_mem, fdict); fclose (fdict); }