/************************************************************************** * * mg_fast_comp_dict.cpp -- Program to generate a fast compression dictionary * Copyright (C) 1994 Neil Sharman * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * * $Id: mg_fast_comp_dict.cpp 860 2000-01-18 03:53:24Z rjmcnab $ * **************************************************************************/ /* $Log$ Revision 1.2 2000/01/18 03:53:23 rjmcnab Fixed a couple of bugs and made building silent if needed. Revision 1.1 2000/01/14 02:26:13 sjboddie Rodgers new C++ mg Revision 1.2 1999/10/17 23:43:26 cs025 Changes to eradicate Xmalloc Revision 1.1 1999/10/11 02:57:50 cs025 Base install of MG-PP Revision 1.1 1999/08/10 21:18:06 sjboddie renamed mg-1.3d directory mg Revision 1.2 1998/11/25 07:55:44 rjmcnab Modified mg to that you can specify the stemmer you want to use via a command line option. You specify it to mg_passes during the build process. The number of the stemmer that you used is stored within the inverted dictionary header and the stemmed dictionary header so the correct stemmer is used in later stages of building and querying. Revision 1.1 1998/11/17 09:34:57 rjmcnab *** empty log message *** * Revision 1.3 1994/10/20 03:56:55 tes * I have rewritten the boolean query optimiser and abstracted out the * components of the boolean query. * * Revision 1.2 1994/09/20 04:41:47 tes * For version 1.1 * */ #include "sysfuncs.h" #include "huffman.h" #include "messages.h" #include "memlib.h" #include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */ #include "local_strings.h" #include "mg.h" #include "text.h" #include "invf.h" #include "mg_files.h" #include "locallib.h" #include "words.h" #define ALIGN_SIZE(p,t) (((p) + (sizeof(t)-1)) & (~(sizeof(t)-1))) #define WORDNO(p, base) ((((char*)(p))-((char*)(base)))/sizeof(u_char*)) #define FIXUP(p) (fixup[WORDNO(p,buffer)/8] |= (1<<(WORDNO(p, buffer) & 7))) #define IS_FIXUP(p) ((fixup[WORDNO(p,buffer)/8] & (1<<(WORDNO(p, buffer) & 7))) != 0) #define FIXUP_VALS(vals) do { \ int i; \ for (i=0; i < MAX_HUFFCODE_LEN+1; i++) \ FIXUP(&vals[i]); \ } while(0) static u_long mem_for_comp_dict (char *filename); static void load_comp_dict (char *filename); static void save_fast_dict (char *filename); static void unfixup_buffer (void); static void *buffer; static void *cur; static u_char *fixup; static u_long mem, fixup_mem; int main (int argc, char **argv) { int ch; char *filename = ""; opterr = 0; msg_prefix = argv[0]; while ((ch = getopt (argc, argv, "f:d:h")) != -1) switch (ch) { case 'f': /* input file */ filename = optarg; break; case 'd': set_basepath (optarg); break; case 'h': case '?': fprintf (stderr, "usage: %s [-f input_file] [-d data directory] [-h]\n", argv[0]); exit (1); } mem = mem_for_comp_dict (filename); fixup_mem = (ALIGN_SIZE (mem, u_char *) / sizeof (u_char *) + 7) / 8; cur = buffer = Xmalloc (mem); bzero (buffer, mem); fixup = (u_char *) Xmalloc (fixup_mem); bzero (fixup, fixup_mem); #ifndef SILENT Message ("Estimated memory for fast_dict %u", mem); Message ("Estimated memory for fixups %u", fixup_mem); #endif load_comp_dict (filename); #ifndef SILENT Message ("Actual memory for fast_dict %u", (char *) cur - (char *) buffer); #endif if ((u_long) cur > (u_long) buffer + mem) FatalError (1, "The buffer was not big enough for the dictionary"); { /* [RPAP - Jan 97: Endian Ordering] */ compression_dict *cd = (compression_dict *) buffer; int i, which; /* cfh */ for (which = 0; which <= 1; which++) { int j; HTONSI(cd->cfh[which]->hd.num_codes); HTONSI(cd->cfh[which]->hd.mincodelen); HTONSI(cd->cfh[which]->hd.maxcodelen); for (j = 0; j < MAX_HUFFCODE_LEN + 1; j++) { HTONSI(cd->cfh[which]->hd.lencount[j]); HTONUL(cd->cfh[which]->hd.min_code[j]); } HTONUL(cd->cfh[which]->uncompressed_size); for (j = 0; j < MAX_HUFFCODE_LEN + 1; j++) HTONUL(cd->cfh[which]->huff_words_size[j]); } HTONUL(cd->MemForCompDict); /* ad */ if (cd->cdh.novel_method == MG_NOVEL_DELTA || cd->cdh.novel_method == MG_NOVEL_HYBRID) for (which = 0; which <= 1; which++) { int j; HTONUL(cd->ad->afh[which].num_frags); HTONUL(cd->ad->afh[which].mem_for_frags); for (j = 0; j < 33; j++) { HTONSI(cd->ad->blk_start[which][j]); HTONSI(cd->ad->blk_end[which][j]); } } /* cdh */ HTONUL(cd->cdh.dict_type); HTONUL(cd->cdh.novel_method); for (i = 0; i < TEXT_PARAMS; i++) HTONUL(cd->cdh.params[which]); HTONUL(cd->cdh.num_words[0]); HTONUL(cd->cdh.num_words[1]); HTONUL(cd->cdh.num_word_chars[0]); HTONUL(cd->cdh.num_word_chars[1]); HTONUL(cd->cdh.lookback); HTONSI(cd->fast_loaded); } unfixup_buffer (); save_fast_dict (filename); exit (0); } static void unfixup_buffer () { u_long *p; for (p = (u_long *) buffer; (u_long) p < (u_long) cur; p++) { if (IS_FIXUP (p)) *p = *p - (u_long) buffer; } } static u_long mem_for_aux_dict (compression_dict_header * /*cdh*/, char *filename) { int i; u_long mem = sizeof (auxiliary_dict); FILE *aux; aux = open_file (filename, TEXT_DICT_AUX_SUFFIX, "rb", MAGIC_AUX_DICT, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */ for (i = 0; i <= 1; i++) { aux_frags_header afh; fread (&afh, sizeof (afh), 1, aux); NTOHUL(afh.num_frags); /* [RPAP - Jan 97: Endian Ordering] */ NTOHUL(afh.mem_for_frags); /* [RPAP - Jan 97: Endian Ordering] */ mem += afh.num_frags * sizeof (u_char *); mem = ALIGN_SIZE (mem + afh.mem_for_frags, u_char *); fseek (aux, afh.mem_for_frags, SEEK_CUR); } fclose (aux); return mem; } static u_long mem_for_words (FILE * dict, compression_dict_header * cdh, comp_frags_header * cfh) { u_long mem = 0; long i, lookback; int ptrs_reqd = 0; int mem_reqd = 0; lookback = cdh->lookback; for (i = cfh->hd.mincodelen; i <= cfh->hd.maxcodelen; i++) { ptrs_reqd += (cfh->hd.lencount[i] + ((1 << lookback) - 1)) >> lookback; mem_reqd += cfh->huff_words_size[i]; } mem += ptrs_reqd * sizeof (u_char *); mem += (MAX_HUFFCODE_LEN + 1) * sizeof (u_char **); mem += mem_reqd; for (i = 0; i < cfh->hd.num_codes; i++) { register int val; for (val = getc (dict) & 0xf; val; val--) getc (dict); } return ALIGN_SIZE (mem, u_char *); } static u_long mem_skip_hd(FILE *dict, u_long mem) { huff_data hd; mem += sizeof (hd); Read_Huffman_Data (dict, &hd, NULL, NULL); if (hd.clens) delete hd.clens; mem += hd.num_codes * sizeof (unsigned long); mem += (MAX_HUFFCODE_LEN + 1) * sizeof (unsigned long *); return mem; } static u_long mem_read_cfh(FILE *dict, compression_dict_header *cdh, comp_frags_header *cfh, u_long mem) { mem += sizeof (comp_frags_header); Read_cfh (dict, cfh, NULL, NULL); /* Don't need to count the space for the clens of the huffman data */ mem += mem_for_words (dict, cdh, cfh); if (cfh->hd.clens) delete cfh->hd.clens; return mem; } static u_long mem_for_comp_dict (char *filename) { u_long mem = sizeof (compression_dict); compression_dict_header cdh; comp_frags_header cfh; FILE *dict; int which; int i; /* [RPAP - Jan 97: Endian Ordering] */ dict = open_file (filename, TEXT_DICT_SUFFIX, "rb", MAGIC_DICT, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */ fread (&cdh, sizeof (cdh), 1, dict); /* [RPAP - Jan 97: Endian Ordering] */ NTOHUL(cdh.dict_type); NTOHUL(cdh.novel_method); for (i = 0; i < TEXT_PARAMS; i++) NTOHUL(cdh.params[i]); NTOHUL(cdh.num_words[0]); NTOHUL(cdh.num_words[1]); NTOHUL(cdh.num_word_chars[0]); NTOHUL(cdh.num_word_chars[1]); NTOHUL(cdh.lookback); for (which = 0; which < 2; which++) switch (cdh.dict_type) { case MG_COMPLETE_DICTIONARY: { mem = mem_read_cfh(dict, &cdh, &cfh, mem); } break; case MG_PARTIAL_DICTIONARY: { // huff_data hd; if (cdh.num_words[which]) { mem = mem_read_cfh(dict, &cdh, &cfh, mem); } mem = mem_skip_hd(dict, mem); mem = mem_skip_hd(dict, mem); } break; case MG_SEED_DICTIONARY: { // huff_data hd; if (cdh.num_words[which]) { mem = mem_read_cfh(dict, &cdh, &cfh, mem); } switch (cdh.novel_method) { case MG_NOVEL_HUFFMAN_CHARS: mem = mem_skip_hd(dict, mem); mem = mem_skip_hd(dict, mem); break; case MG_NOVEL_DELTA: break; case MG_NOVEL_HYBRID: break; } break; } } fclose (dict); if (cdh.novel_method == MG_NOVEL_DELTA || cdh.novel_method == MG_NOVEL_HYBRID) mem += mem_for_aux_dict (&cdh, filename); return ALIGN_SIZE (mem, u_char *); } void * getmem (u_long size, int align) { void *res; cur = (void *) (((u_long) cur + (align - 1)) & (~(align - 1))); res = cur; cur = (char *) cur + size; if ((u_long) cur > (u_long)buffer + mem) FatalError (1, "The buffer was not big enough for the dictionary"); return res; } static auxiliary_dict * LoadAuxDict (compression_dict_header * cdh, char *filename) { auxiliary_dict *ad; int i; FILE *aux; aux = open_file (filename, TEXT_DICT_AUX_SUFFIX, "rb", MAGIC_AUX_DICT, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */ ad = (auxiliary_dict *) getmem (sizeof (auxiliary_dict), sizeof (u_char *)); for (i = 0; i <= 1; i++) { unsigned int j; u_char *pos; fread (&ad->afh[i], sizeof (aux_frags_header), 1, aux); /* [RPAP - Jan 97: Endian Ordering] */ NTOHUL(ad->afh[i].num_frags); NTOHUL(ad->afh[i].mem_for_frags); ad->word_data[i] = (u_char *) getmem (ad->afh[i].mem_for_frags, 1); FIXUP (&ad->word_data[i]); ad->words[i] = (u_char **) getmem (ad->afh[i].num_frags * sizeof (u_char *), sizeof (u_char *)); FIXUP (&ad->words[i]); fread (ad->word_data[i], ad->afh[i].mem_for_frags, sizeof (u_char), aux); pos = ad->word_data[i]; for (j = 0; j < ad->afh[i].num_frags; j++) { ad->words[i][j] = pos; FIXUP (&ad->words[i][j]); pos += *pos + 1; } if (cdh->novel_method == MG_NOVEL_HYBRID) { int num; num = 1; ad->blk_start[i][0] = 0; ad->blk_end[i][0] = cdh->num_words[i] - 1; while (num < 33) { ad->blk_start[i][num] = ad->blk_end[i][num - 1] + 1; ad->blk_end[i][num] = ad->blk_start[i][num] + (ad->blk_end[i][num - 1] - ad->blk_start[i][num - 1]) * 2; num++; } } } fclose (aux); return (ad); } static u_char *** ReadInWords (FILE * dict, compression_dict * cd, comp_frags_header * cfh, u_char ** escape) { int i, lookback; int ptrs_reqd = 0; int mem_reqd = 0; int num_set[MAX_HUFFCODE_LEN + 1]; u_char *next_word[MAX_HUFFCODE_LEN + 1]; u_char **vals; u_char ***values; u_char word[MAXWORDLEN + 1]; u_char last_word[MAX_HUFFCODE_LEN + 1][MAXWORDLEN + 1]; lookback = cd->cdh.lookback; for (i = cfh->hd.mincodelen; i <= cfh->hd.maxcodelen; i++) { ptrs_reqd += (cfh->hd.lencount[i] + ((1 << lookback) - 1)) >> lookback; mem_reqd += cfh->huff_words_size[i]; } vals = (u_char **) getmem (ptrs_reqd * sizeof (*vals), sizeof (u_char *)); values = (u_char ***) getmem ((MAX_HUFFCODE_LEN + 1) * sizeof (u_char **), sizeof (u_char **)); next_word[0] = (u_char *) getmem (mem_reqd, sizeof (char)); cd->MemForCompDict += ptrs_reqd * sizeof (*vals) + (MAX_HUFFCODE_LEN + 1) * sizeof (u_char **) + mem_reqd; values[0] = vals; FIXUP (&values[0]); values[0][0] = next_word[0]; FIXUP (&values[0][0]); for (i = 1; i <= cfh->hd.maxcodelen; i++) { int next_start = (values[i - 1] - vals) + ((cfh->hd.lencount[i - 1] + ((1 << lookback) - 1)) >> lookback); values[i] = &vals[next_start]; FIXUP (&values[i]); next_word[i] = next_word[i - 1] + cfh->huff_words_size[i - 1]; values[i][0] = next_word[i]; FIXUP (&values[i][0]); } bzero ((char *) num_set, sizeof (num_set)); for (i = 0; i < cfh->hd.num_codes; i++) { register int val, copy; register int len = cfh->hd.clens[i]; val = getc (dict); copy = (val >> 4) & 0xf; val &= 0xf; fread (word + copy + 1, sizeof (u_char), val, dict); *word = val + copy; if ((num_set[len] & ((1 << lookback) - 1)) == 0) { values[len][num_set[len] >> lookback] = next_word[len]; FIXUP (&values[len][num_set[len] >> lookback]); memcpy (next_word[len], word, *word + 1); if (escape && i == cfh->hd.num_codes - 1) { *escape = next_word[len]; FIXUP (escape); } next_word[len] += *word + 1; } else { copy = prefixlen (last_word[len], word); memcpy (next_word[len] + 1, word + copy + 1, *word - copy); *next_word[len] = (copy << 4) + (*word - copy); if (escape && i == cfh->hd.num_codes - 1) { *escape = next_word[len]; FIXUP (escape); } next_word[len] += (*word - copy) + 1; } memcpy (last_word[len], word, *word + 1); num_set[len]++; } if (cfh->hd.clens) delete cfh->hd.clens; cfh->hd.clens = NULL; return values; } static unsigned long ** Generate_Fast_Huffman_Vals (huff_data * data, u_long * mem) { int i; unsigned long *fcode[MAX_HUFFCODE_LEN + 1]; unsigned long **values; unsigned long *vals; if (!data) return (NULL); vals = (unsigned long *) getmem (data->num_codes * sizeof (*vals), sizeof (long *)); values = (unsigned long **) getmem ((MAX_HUFFCODE_LEN + 1) * sizeof (unsigned long *), sizeof (long *)); bzero ((char *) values, (MAX_HUFFCODE_LEN + 1) * sizeof (unsigned long *)); if (mem) *mem += data->num_codes * sizeof (*vals) + (MAX_HUFFCODE_LEN + 1) * sizeof (unsigned long *); fcode[0] = values[0] = &vals[0]; FIXUP (&values[0]); for (i = 1; i <= data->maxcodelen; i++) { fcode[i] = values[i] = &vals[(values[i - 1] - vals) + data->lencount[i - 1]]; FIXUP (&values[i]); } for (i = 0; i < data->num_codes; i++) if (data->clens[i]) *fcode[(int) (data->clens[i])]++ = i; return (values); } static void load_read_huffman(FILE *dict, compression_dict *cd, int which) { huff_data *hd; u_long **vals; hd = (huff_data *) getmem (sizeof (huff_data), sizeof (char *)); cd->MemForCompDict += sizeof (huff_data); Read_Huffman_Data (dict, hd, &cd->MemForCompDict, NULL); vals = Generate_Fast_Huffman_Vals (hd, &cd->MemForCompDict); cd->chars_huff[which] = hd; FIXUP (&cd->chars_huff[which]); cd->chars_vals[which] = vals; FIXUP (&cd->chars_vals[which]); if (hd->clens) delete hd->clens; hd->clens = NULL; } static void load_read_words(FILE *dict, compression_dict *cd, int which, int fixescape) { cd->cfh[which] = (comp_frags_header *) getmem (sizeof (*cd->cfh[which]), sizeof (u_char *)); cd->MemForCompDict += sizeof (*cd->cfh[which]); Read_cfh (dict, cd->cfh[which], &cd->MemForCompDict, NULL); cd->values[which] = ReadInWords (dict, cd, cd->cfh[which], NULL); FIXUP (&cd->cfh[which]); FIXUP (&cd->values[which]); if (fixescape) { FIXUP(&cd->escape[which]); } else { cd->escape[which] = NULL; } } static void load_comp_dict (char *filename) { FILE *dict; int which; compression_dict *cd; cd = (compression_dict *) getmem (sizeof (compression_dict), sizeof (u_char *)); cd->fast_loaded = 1; dict = open_file (filename, TEXT_DICT_SUFFIX, "rb", MAGIC_DICT, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */ Read_cdh (dict, &cd->cdh, NULL, NULL); for (which = 0; which < 2; which++) switch (cd->cdh.dict_type) { case MG_COMPLETE_DICTIONARY: { load_read_words(dict, cd, which, 0); } break; case MG_PARTIAL_DICTIONARY: { if (cd->cdh.num_words[which]) { load_read_words(dict, cd, which, 1); } load_read_huffman(dict, cd, which); load_read_huffman(dict, cd, which); } break; case MG_SEED_DICTIONARY: { if (cd->cdh.num_words[which]) { load_read_words(dict, cd, which, 1); } switch (cd->cdh.novel_method) { case MG_NOVEL_HUFFMAN_CHARS: load_read_huffman(dict, cd, which); load_read_huffman(dict, cd, which); break; case MG_NOVEL_DELTA: break; case MG_NOVEL_HYBRID: break; } break; } } fclose (dict); if (cd->cdh.novel_method == MG_NOVEL_DELTA || cd->cdh.novel_method == MG_NOVEL_HYBRID) { cd->ad = LoadAuxDict (&cd->cdh, filename); FIXUP (&cd->ad); } } static void save_fast_dict (char *filename) { FILE *fdict; fdict = create_file (filename, TEXT_DICT_FAST_SUFFIX, "wb", MAGIC_FAST_DICT, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */ { u_long *p; for (p = (u_long *) buffer; (u_long) p < (u_long) cur; p++) { if (IS_FIXUP (p)) HTONUL(*p); } } /* [RPAP - Jan 97: Endian Ordering] */ HTONUL(mem); HTONUL(fixup_mem); fwrite (&mem, sizeof (mem), 1, fdict); fwrite (&fixup_mem, sizeof (fixup_mem), 1, fdict); /* [RPAP - Jan 97: Endian Ordering] */ NTOHUL(mem); NTOHUL(fixup_mem); fwrite (buffer, sizeof (u_char), mem, fdict); fwrite (fixup, sizeof (u_char), fixup_mem, fdict); fclose (fdict); }