/************************************************************************** * * mg_compression_dict.cpp -- Routines for creating compression dictionary * Copyright (C) 1994 Neil Sharman * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * * $Id: mg_compression_dict.cpp 856 2000-01-14 02:26:25Z sjboddie $ * **************************************************************************/ #include "sysfuncs.h" #include "memlib.h" #include "messages.h" #include "local_strings.h" #include "bitio_gen.h" #include "bitio_m_stdio.h" #include "mgheap.h" #include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */ #include "mg_files.h" #include "locallib.h" #include "invf.h" #include "text.h" #include "words.h" #include "mg.h" #include "WordData.h" #define MAXBITS (sizeof(unsigned long) * 8) /* $Log$ Revision 1.1 2000/01/14 02:26:11 sjboddie Rodgers new C++ mg Revision 1.3 1999/10/17 23:43:25 cs025 Changes to eradicate Xmalloc Revision 1.2 1999/10/11 22:03:03 cs025 Changed mistaken use of NTOHUL on a double to NTOHD as per report from Craig Neville-Manning of problems installing mg/gsdl. Revision 1.1 1999/10/11 02:57:46 cs025 Base install of MG-PP Revision 1.1 1999/08/10 21:18:04 sjboddie renamed mg-1.3d directory mg Revision 1.4 1999/01/15 03:05:51 rjmcnab Renamed lib/heap to be lib/mgheap (it was causing some problems with some versions of the STL libraries which contained a heap.h) Revision 1.3 1998/12/17 09:12:52 rjmcnab Altered mg to process utf-8 encoded Unicode. The main changes are in the parsing of the input, the casefolding, and the stemming. Revision 1.2 1998/11/25 07:55:44 rjmcnab Modified mg to that you can specify the stemmer you want to use via a command line option. You specify it to mg_passes during the build process. The number of the stemmer that you used is stored within the inverted dictionary header and the stemmed dictionary header so the correct stemmer is used in later stages of building and querying. Revision 1.1 1998/11/17 09:34:52 rjmcnab *** empty log message *** * Revision 1.3 1994/10/20 03:56:54 tes * I have rewritten the boolean query optimiser and abstracted out the * components of the boolean query. * * Revision 1.2 1994/09/20 04:41:45 tes * For version 1.1 * */ /* #define DEBUG1 */ /* #define DEBUG */ #define is_power_of_two(a) ((a) != 0 && (((a) & ((a)-1)) == 0)) #define MAX_RECALCULATIONS 100 typedef struct DictWordData : public WordData { float saving; float char_bit_cost; u_long num_trans; u_char *word; } DictWordData; static DictWordData *Words[2]; static u_long Num[2]; static u_long chars[2]; #define KIND(p) (((p) >= Words[0] && (p) < Words[0] + Num[0]) ? 0 : 1) #define IsWord(p) (((p) >= Words[1] && (p) < Words[1] + Num[1]) ? 1 : 0) #define IsNonWord(p) (((p) >= Words[0] && (p) < Words[0] + Num[0]) ? 1 : 0) typedef struct DictData { DictWordData **wd; u_long num_wds; u_long chars; } DictData; typedef DictData DictInfo[2]; static DictInfo keep, discard, all; static compression_stats_header csh; static char *file_name = ""; static u_long novel_method = MG_NOVEL_HUFFMAN_CHARS; static void ReadInWords (char *); static void Select_on (int k, heap_comp hc); static void Method3 (int k); static void Select_all (void); static u_long WriteOutWords (char *, u_long, int); static int DecFreqIncWL (void *a, void *b); static int OccuranceOrder (void *a, void *b); int main (int argc, char **argv) { int ch; char type = 'C'; char mode = '0'; int lookback = 2; double k = 0; u_long mem_reqd; opterr = 0; msg_prefix = argv[0]; while ((ch = getopt (argc, argv, "0123CPSf:d:l:hk:HDY")) != -1) switch (ch) { case 'H': novel_method = MG_NOVEL_HUFFMAN_CHARS; break; case 'D': novel_method = MG_NOVEL_DELTA; break; case 'Y': novel_method = MG_NOVEL_HYBRID; break; case 'f': /* input file */ file_name = optarg; break; case 'd': set_basepath (optarg); break; case 'C': case 'P': case 'S': type = ch; break; case '0': case '1': case '2': case '3': mode = ch; break; case 'l': lookback = atoi (optarg); if (!is_power_of_two (lookback)) FatalError (1, "The lookback value must be a power of 2"); lookback = floorlog_2 (lookback); break; case 'k': k = atof (optarg) * 1024; break; case 'h': case '?': fprintf (stderr, "usage: %s [-l lookback] [-f input_file]" "[-d data directory] [-h] [-0|-1|-2|-3] [-C|-P|-S] [-k mem (Kb)]\n", argv[0]); exit (1); } ReadInWords (file_name); if (type == 'C') { Select_all (); mem_reqd = WriteOutWords (file_name, MG_COMPLETE_DICTIONARY, lookback); } else { switch (mode) { case '0': Select_all (); break; case '1': Message ("Dictionary limit of %.2f Kb", k / 1024); Select_on ((int) k, OccuranceOrder); break; case '2': Message ("Dictionary limit of %.2f Kb", k / 1024); Select_on ((int) k, DecFreqIncWL); break; case '3': Message ("Dictionary limit of %.2f Kb", k / 1024); Method3 ((int) k); break; } if (type == 'P') { mem_reqd = WriteOutWords (file_name, MG_PARTIAL_DICTIONARY, lookback); } else { mem_reqd = WriteOutWords (file_name, MG_SEED_DICTIONARY, lookback); } } Message ("Num words : %8u -> %8u\n", Num[1], keep[1].num_wds); Message ("Num non-words : %8u -> %8u\n", Num[0], keep[0].num_wds); Message ("Chars of words : %8u -> %8u\n", chars[1], keep[1].chars); Message ("Chars of non-words : %8u -> %8u\n", chars[0], keep[0].chars); Message ("Mem usage : %8u -> %8u\n", (Num[0] + Num[1]) * sizeof (char *) + chars[0] + chars[1], (keep[0].num_wds + keep[1].num_wds) * sizeof (char *) + keep[0].chars + keep[1].chars); Message ("Actual mem required : %8u\n", mem_reqd); exit (0); } static void ReadInWords (char *filename) { FILE *f; unsigned long i; f = open_file (filename, TEXT_STATS_DICT_SUFFIX, "rb", MAGIC_STATS_DICT, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */ fread (&csh, sizeof (csh), 1, f); /* [RPAP - Jan 97: Endian Ordering] */ NTOHUL(csh.num_docs); NTOHD(csh.num_bytes); for (i = 0; i < 2; i++) { frags_stats_header fsh; char *buf; DictWordData *wd; chars[i] = 0; fread (&fsh, sizeof (fsh), 1, f); /* [RPAP - Jan 97: Endian Ordering] */ NTOHUL(fsh.num_frags); NTOHUL(fsh.mem_for_frags); Num[i] = all[i].num_wds = fsh.num_frags; /* The +1 on the following line is to leave room for the esc code. */ all[i].wd = (DictWordData **) Xmalloc (sizeof (DictWordData *) * Num[i] + 1); buf = (char *) Xmalloc (fsh.mem_for_frags); wd = Words[i] = (DictWordData *) Xmalloc (sizeof (DictWordData) * Num[i]); unsigned int j; for (j = 0; j < Num[i]; j++, wd++) { int len; // read docCount and wordCount wd->read(f); // read an mgString len = fgetc (f); wd->word = (u_char *) buf; *buf++ = len; fread (buf, len, 1, f); buf += len; all[i].wd[j] = wd; } chars[i] = fsh.mem_for_frags - fsh.num_frags; } fclose (f); } static void Alloc_keep_discard (void) { keep[0].num_wds = 0; keep[0].wd = (DictWordData **) Xmalloc ((Num[0] + 1) * sizeof (DictWordData *)); keep[1].num_wds = 0; keep[1].wd = (DictWordData **) Xmalloc ((Num[1] + 1) * sizeof (DictWordData *)); discard[0].num_wds = 0; discard[0].wd = (DictWordData **) Xmalloc ((Num[0] + 1) * sizeof (DictWordData *)); discard[1].num_wds = 0; discard[1].wd = (DictWordData **) Xmalloc ((Num[1] + 1) * sizeof (DictWordData *)); } static int sort_comp (const void *a, const void *b) { DictWordData *aa = *(DictWordData **) a; DictWordData *bb = *(DictWordData **) b; return casecompare (aa->word, bb->word); /* [RPAP - Jan 97: Stem Index Change] */ } static void SortAndCount_DictData (DictData * dd) { unsigned int i; DictWordData **wd; qsort (dd->wd, dd->num_wds, sizeof (DictWordData *), sort_comp); dd->chars = 0; wd = dd->wd; for (i = 0; i < dd->num_wds; i++, wd++) dd->chars += (*wd)->word[0]; } static void Select_all (void) { unsigned int i; Alloc_keep_discard (); keep[0].num_wds = Num[0]; for (i = 0; i < Num[0]; i++) keep[0].wd[i] = Words[0] + i; keep[1].num_wds = Num[1]; for (i = 0; i < Num[1]; i++) keep[1].wd[i] = Words[1] + i; SortAndCount_DictData (&keep[0]); SortAndCount_DictData (&keep[1]); } static int DecFreqIncWL (void *a, void *b) { DictWordData *aa = *(DictWordData **) a; DictWordData *bb = *(DictWordData **) b; if (aa->docCount > bb->docCount) return -1; else if (aa->docCount < bb->docCount) return 1; else return bb->word[0] - aa->word[0]; } static int OccuranceOrder (void *a, void *b) { DictWordData *aa = *(DictWordData **) a; DictWordData *bb = *(DictWordData **) b; if (aa->words() > bb->words()) return 1; else if (aa->words() < bb->words()) return -1; else return 0; } static void Select_on (int k, heap_comp hc) { int i, num, mem; DictWordData **wd; Alloc_keep_discard (); num = Num[0] + Num[1]; wd = (DictWordData **) Xmalloc (num * sizeof (DictWordData *)); for (i = 0; (unsigned int)i < Num[0]; i++) wd[i] = Words[0] + i; for (i = 0; (unsigned int)i < Num[1]; i++) wd[i + Num[0]] = Words[1] + i; heap_build (wd, sizeof (*wd), num, hc); mem = 0; while (k > mem && num) { int idx; DictWordData *word = wd[0]; #ifdef DEBUG fprintf (stderr, "%4d:%4d:%8d :%8d :%8d : \"%s\"\n", keep[0].num_wds, keep[1].num_wds, mem, word->documents(), word->word, word2str (word->word)); #endif mem += sizeof (u_char *) + word->word[0]; heap_deletehead (wd, sizeof (*wd), &num, hc); idx = KIND (word); keep[idx].wd[keep[idx].num_wds++] = word; } for (i = 0; i < num; i++) { DictWordData *word = wd[i]; int idx = KIND (word); discard[idx].wd[discard[idx].num_wds++] = word; } SortAndCount_DictData (&keep[0]); SortAndCount_DictData (&keep[1]); SortAndCount_DictData (&discard[0]); SortAndCount_DictData (&discard[1]); assert (keep[0].num_wds + discard[0].num_wds == Num[0]); assert (keep[1].num_wds + discard[1].num_wds == Num[1]); } static int BigSaving (void *a, void *b) { DictWordData *aa = *(DictWordData **) a; DictWordData *bb = *(DictWordData **) b; return (aa->saving > bb->saving) ? -1 : (aa->saving < bb->saving); } static int SmallSaving (void *a, void *b) { DictWordData *aa = *(DictWordData **) a; DictWordData *bb = *(DictWordData **) b; return (aa->saving < bb->saving) ? -1 : (aa->saving > bb->saving); } static void CalcCharCounts (DictWordData ** wd, int num, char *char_lens[2], char *len_lens[2], u_long escape[2]) { long char_freqs[2][256]; long len_freqs[2][16]; huff_data hd; int i; escape[0] = 0; escape[1] = 0; bzero ((char *) char_freqs, sizeof (char_freqs)); bzero ((char *) len_freqs, sizeof (len_freqs)); for (i = 0; i < num; i++, wd++) { u_long freq = (*wd)->documents(); u_char *buf = (*wd)->word; int len = *buf++; int idx = KIND (*wd); len_freqs[idx][len] += freq; escape[idx] += freq; for (; len; len--, buf++) char_freqs[idx][(u_long) (*buf)] += freq; } Generate_Huffman_Data (256, char_freqs[0], &hd, NULL); char_lens[0] = hd.clens; Generate_Huffman_Data (256, char_freqs[1], &hd, NULL); char_lens[1] = hd.clens; Generate_Huffman_Data (16, len_freqs[0], &hd, NULL); len_lens[0] = hd.clens; Generate_Huffman_Data (16, len_freqs[1], &hd, NULL); len_lens[1] = hd.clens; } inline void CalcBitCostForWordData(DictWordData **word, int num, double freqs_trans[2], u_long escape[2], char * char_lens[2], char *len_lens[2], double esc[2], int num_trans) { int j; for (j = 0; j < num; j++, word++) { float cbc, wbc; u_char *buf = (*word)->word; int len = *buf++; u_long freq = (*word)->documents(); int idx = KIND (*word); cbc = len_lens[idx][len]; for (; len; len--, buf++) cbc += char_lens[idx][(u_long) (*buf)]; (*word)->char_bit_cost = (cbc + esc[idx]) * freq; wbc = -log2 (freq / (freqs_trans[idx] + escape[idx])) * freq; (*word)->saving = ((*word)->char_bit_cost - wbc) / (sizeof (char *) + (*word)->word[0]); (*word)->num_trans = num_trans; } } void CalcBitCost (DictWordData ** discard_word, int discard_num, DictWordData ** keep_word, int keep_num, double freqs_trans[2], u_long escape[2], int num_trans) { char *char_lens[2]; char *len_lens[2]; double esc[2]; CalcCharCounts (discard_word, discard_num, char_lens, len_lens, escape); esc[0] = -log2 (escape[0] / (freqs_trans[0] + escape[0])); esc[1] = -log2 (escape[1] / (freqs_trans[1] + escape[1])); CalcBitCostForWordData(discard_word, discard_num, freqs_trans, escape, char_lens, len_lens, esc, num_trans); CalcBitCostForWordData(keep_word, keep_num, freqs_trans, escape, char_lens, len_lens, esc, num_trans); Xfree (char_lens[0]); Xfree (char_lens[1]); Xfree (len_lens[0]); Xfree (len_lens[1]); } inline void m3_transferWord(DictWordData **toHeap, int *toNum, heap_comp toSaving, DictWordData **fromHeap, int *fromNum, heap_comp fromSaving, int *num_trans, double freqs_trans[], int *mem, double *total, int *count) { /* Transfer the word at the top of the keep heap to the discard heap. */ DictWordData *word = fromHeap[0]; int idx = KIND (word); heap_deletehead (fromHeap, sizeof (word), fromNum, fromSaving); toHeap[*toNum] = word; heap_additem (toHeap, sizeof (word), toNum, toSaving); freqs_trans[idx] -= word->documents(); *mem = (*mem) - sizeof (u_char *) + word->word[0]; *num_trans += 1; *total += word->saving; *count += 1; #ifdef DEBUG fprintf (stderr, "KEEP -> DISCARD %8d :%8d :%8.0f :%8.0f : \"%s\"\n", *mem, word->documents(), word->char_bit_cost, word->saving, word2str (word->word)); #endif } inline void m3_storeWord(DictWordData **wordHeap, int wordNum, int num_trans, double freqs_trans[], u_long escape[], heap_comp hc) { DictWordData *word = wordHeap[0]; int idx = KIND (word); float wbc; #ifdef DEBUG1 fprintf (stderr, "KEEP \"%s\" %.2f ->", word2str (word->word), word->saving); #endif wbc = -log2 (word->documents() / (freqs_trans[idx] + escape[idx])) * word->documents(); word->saving = (word->char_bit_cost - wbc) / (sizeof (char *) + word->word[0]); #ifdef DEBUG1 fprintf (stderr, " %.2f\n", word->saving); #endif word->num_trans = num_trans; heap_changedhead (wordHeap, sizeof (word), wordNum, hc); } static void Method3 (int k) { int i, keep_num, discard_num, mem, num_trans, recalc_reqd; int keep_to_discard = 0; int discard_to_keep = 0; int recalcs = 0; double freqs_trans[2], total; u_long escape[2]; DictWordData **keep_heap, **discard_heap; freqs_trans[0] = freqs_trans[1] = 0; num_trans = 0; discard_num = Num[0] + Num[1]; discard_heap = (DictWordData **) Xmalloc (discard_num * sizeof (DictWordData *)); keep_num = 0; keep_heap = (DictWordData **) Xmalloc (discard_num * sizeof (DictWordData *)); for (i = 0; (unsigned int)i < Num[0]; i++) discard_heap[i] = Words[0] + i; for (i = 0; (unsigned int)i < Num[1]; i++) discard_heap[i + Num[0]] = Words[1] + i; heap_build (discard_heap, sizeof (*discard_heap), discard_num, DecFreqIncWL); mem = 0; while (k > mem && discard_num) { DictWordData *word = discard_heap[0]; mem += sizeof (u_char *) + word->word[0]; heap_deletehead (discard_heap, sizeof (word), &discard_num, DecFreqIncWL); keep_heap[keep_num++] = word; freqs_trans[KIND (word)] += word->documents(); num_trans++; } CalcBitCost (discard_heap, discard_num, keep_heap, keep_num, freqs_trans, escape, num_trans); heap_build (discard_heap, sizeof (*discard_heap), discard_num, BigSaving); heap_build (keep_heap, sizeof (*keep_heap), keep_num, SmallSaving); total = 0; recalc_reqd = 0; while (keep_num && discard_num) { if ((keep_num && keep_heap[0]->num_trans < (unsigned)num_trans) || (discard_num && discard_heap[0]->num_trans < (unsigned)num_trans)) { if (keep_num && keep_heap[0]->num_trans < (unsigned)num_trans) { m3_storeWord(keep_heap, keep_num, num_trans, freqs_trans, escape, SmallSaving); } if (discard_num && discard_heap[0]->num_trans < (unsigned)num_trans) { m3_storeWord(discard_heap, discard_num, num_trans, freqs_trans, escape, BigSaving); } } else if (keep_heap[0]->saving < discard_heap[0]->saving) { assert (keep_num && discard_num); if (keep_num && mem + sizeof (char *) + discard_heap[0]->word[0] > (unsigned)k) { /* Transfer the word at the top of the keep heap to the discard heap. */ m3_transferWord(discard_heap, &discard_num, BigSaving, keep_heap, &keep_num, SmallSaving, &num_trans, freqs_trans, &mem, &total, &keep_to_discard); } else { /* Transfer the word at the top of the discard heap to the keep heap. */ m3_transferWord(keep_heap, &keep_num, SmallSaving, discard_heap, &discard_num, BigSaving, &num_trans, freqs_trans, &mem, &total, &discard_to_keep); } recalc_reqd = 1; } else { if (recalc_reqd == 0) break; #ifdef DEBUG1 fprintf (stderr, "--------------\n"); #endif if (recalcs == MAX_RECALCULATIONS) break; CalcBitCost (discard_heap, discard_num, keep_heap, keep_num, freqs_trans, escape, num_trans); heap_build (discard_heap, sizeof (*discard_heap), discard_num, BigSaving); heap_build (keep_heap, sizeof (keep_heap), keep_num, SmallSaving); recalc_reqd = 0; recalcs++; } } Alloc_keep_discard (); for (i = 0; i < discard_num; i++) { DictWordData *word = discard_heap[i]; int idx = KIND (word); assert (IsWord (word) || IsNonWord (word)); discard[idx].wd[discard[idx].num_wds++] = word; } for (i = 0; i < keep_num; i++) { DictWordData *word = keep_heap[i]; int idx = KIND (word); assert (IsWord (word) || IsNonWord (word)); keep[idx].wd[keep[idx].num_wds++] = word; } SortAndCount_DictData (&keep[0]); SortAndCount_DictData (&keep[1]); SortAndCount_DictData (&discard[0]); SortAndCount_DictData (&discard[1]); assert (keep[0].num_wds + discard[0].num_wds == Num[0]); assert (keep[1].num_wds + discard[1].num_wds == Num[1]); Message ("Keep -> Discard : %8d", keep_to_discard); Message ("Discard -> Keep : %8d", discard_to_keep); Message ("Huffman Recalculations : %8d", recalcs); if (recalcs == MAX_RECALCULATIONS) Message ("WARNING: The number of recalculations == %d", MAX_RECALCULATIONS); } /**************************************************************************** ***** ***** ***** Dictionary saving code ***** ***** ***** ****************************************************************************/ static void Write_cdh (FILE * f, compression_dict_header * cdh) { /* [RPAP - Jan 97: Endian Ordering] */ int i; compression_dict_header tmp = *cdh; HTONUL(tmp.dict_type); HTONUL(tmp.novel_method); for (i = 0; i < TEXT_PARAMS; i++) HTONUL(tmp.params[i]); HTONUL(tmp.num_words[0]); HTONUL(tmp.num_words[1]); HTONUL(tmp.num_word_chars[0]); HTONUL(tmp.num_word_chars[1]); HTONUL(tmp.lookback); fwrite (&tmp, sizeof (tmp), 1, f); } static void Write_words (FILE * f, DictData * dd) { unsigned int i; u_char *curr, *prev = NULL; for (i = 0; i < dd->num_wds; i++) { int len; curr = dd->wd[i]->word; if (prev) /* look for prefix match with prev string */ len = prefixlen (prev, curr); else len = 0; fputc ((len << 4) + (curr[0] - len), f); fwrite (curr + len + 1, sizeof (u_char), curr[0] - len, f); prev = curr; } } static int Uncompressed_size (DictData * dd) { unsigned int i, us; for (us = i = 0; i < dd->num_wds; i++) us += dd->wd[i]->word[0]; return us; } static u_long Write_data (FILE * f, DictData * dd, int lookback) { u_long mem_reqd; huff_data *hd; int i; u_long us = dd->num_wds; long *freqs; u_long huff_words_size[MAX_HUFFCODE_LEN + 1]; u_long lencounts[MAX_HUFFCODE_LEN + 1]; u_char *lastword[MAX_HUFFCODE_LEN + 1]; if (!(freqs = new long [dd->num_wds])) FatalError (1, "Unable to allocate memory for freqs"); for (i = 0; (unsigned)i < dd->num_wds; i++) { freqs[i] = dd->wd[i]->documents(); us += dd->wd[i]->word[0]; } if (!(hd = Generate_Huffman_Data (dd->num_wds, freqs, NULL, NULL))) FatalError (1, "Unable to allocate memory for huffman data"); delete (freqs); freqs = NULL; if (Write_Huffman_Data (f, hd) == -1) FatalError (1, "Unable to write huffman data"); HTONUL(us); /* [RPAP - Jan 97: Endian Ordering] */ fwrite (&us, sizeof (us), 1, f); NTOHUL(us); /* [RPAP - Jan 97: Endian Ordering] */ /* Calculate the amount of memory that will be required to store the text for each different huffman code len. Every 1<num_wds; i++) { int codelen = hd->clens[i]; u_char *word = dd->wd[i]->word; if (!codelen) FatalError (1, "The length of a code for a word was zero"); huff_words_size[codelen] += word[0] + 1; mem_reqd += word[0] + (lookback != 0); #if 0 if ((lencounts[codelen] & ((1 << lookback) - 1)) == 0) lastword[codelen] = word; else huff_words_size[codelen] -= prefixlen (lastword[codelen], word); #else if ((lencounts[codelen] & ((1 << lookback) - 1)) != 0) { int save = prefixlen (lastword[codelen], word); mem_reqd -= save; huff_words_size[codelen] -= save; } else { mem_reqd += sizeof (u_char *); } lastword[codelen] = word; #endif lencounts[codelen]++; } /* [RPAP - Jan 97: Endian Ordering] */ for (i = hd->mincodelen; i < hd->maxcodelen + 1; i++) HTONUL(huff_words_size[i]); fwrite (huff_words_size + hd->mincodelen, sizeof (*huff_words_size), hd->maxcodelen - hd->mincodelen + 1, f); /* [RPAP - Jan 97: Endian Ordering] */ for (i = hd->mincodelen; i < hd->maxcodelen + 1; i++) NTOHUL(huff_words_size[i]); Write_words (f, dd); delete hd->clens; delete hd; return mem_reqd; } static void Write_charfreqs (FILE * f, DictData * dd, int words, int zero_freq_permitted) { unsigned int j; long freqs[256]; DictWordData **wd = dd->wd; huff_data *hd; bzero ((char *) freqs, sizeof (freqs)); for (j = 0; j < dd->num_wds; j++, wd++) { u_char *buf = (*wd)->word; int len = *buf++; for (; len; len--, buf++) freqs[(u_long) (*buf)] += (*wd)->documents(); } if (!zero_freq_permitted) for (j = 0; j < 256; j++) if (!freqs[j] && PESINAWORD (j) == words) freqs[j] = 1; if (!(hd = Generate_Huffman_Data (256, freqs, NULL, NULL))) FatalError (1, "Unable to allocate memory for huffman data"); if (Write_Huffman_Data (f, hd) == -1) FatalError (1, "Unable to write huffman data"); delete hd->clens; delete hd; } static void Write_wordlens (FILE * f, DictData * dd, int zero_freq_permitted) { unsigned int j; long freqs[16]; DictWordData **wd = dd->wd; huff_data *hd; bzero ((char *) freqs, sizeof (freqs)); for (j = 0; j < dd->num_wds; j++, wd++) freqs[(*wd)->word[0]] += (*wd)->documents(); if (!zero_freq_permitted) for (j = 0; j < 16; j++) if (!freqs[j]) freqs[j] = 1; if (!(hd = Generate_Huffman_Data (16, freqs, NULL, NULL))) FatalError (1, "Unable to allocate memory for huffman data"); if (Write_Huffman_Data (f, hd) == -1) FatalError (1, "Unable to write huffman data"); delete hd->clens; delete hd; } static u_long WriteOutWords (char *file_name, u_long type, int lookback) { FILE *f; int i; u_long mem_reqd = 0; compression_dict_header cdh; f = create_file (file_name, TEXT_DICT_SUFFIX, "w+b", MAGIC_DICT, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */ bzero((char *) &cdh, sizeof(cdh)); cdh.dict_type = type; cdh.novel_method = (type == MG_SEED_DICTIONARY) ? novel_method : MG_NOVEL_HUFFMAN_CHARS; cdh.num_words[1] = keep[1].num_wds; cdh.num_words[0] = keep[0].num_wds; cdh.num_word_chars[1] = Uncompressed_size (&keep[1]); cdh.num_word_chars[0] = Uncompressed_size (&keep[0]); cdh.lookback = lookback; Write_cdh (f, &cdh); for (i = 0; i < 2; i++) switch (type) { case MG_COMPLETE_DICTIONARY: { mem_reqd += Write_data (f, &keep[i], lookback); } break; case MG_PARTIAL_DICTIONARY: { if (keep[i].num_wds) { int j; DictWordData esc; esc.docCount = 0; esc.word = (u_char *) ""; keep[i].wd[keep[i].num_wds++] = &esc; for (j = 0; (unsigned)j < discard[i].num_wds; j++) esc.docCount += discard[i].wd[j]->documents(); if (!esc.docCount) esc.docCount++; mem_reqd += Write_data (f, &keep[i], lookback); } Write_charfreqs (f, &discard[i], i, 1); Write_wordlens (f, &discard[i], 1); } break; case MG_SEED_DICTIONARY: { if (keep[i].num_wds) { int j; DictWordData esc; esc.docCount = 0; esc.word = (u_char *) ""; keep[i].wd[keep[i].num_wds++] = &esc; for (j = 0; (unsigned)j < all[i].num_wds; j++) if (all[i].wd[j]->documents() == 1) esc.docCount++; if (!esc.docCount) esc.docCount++; mem_reqd += Write_data (f, &keep[i], lookback); } switch (novel_method) { case MG_NOVEL_HUFFMAN_CHARS: Write_charfreqs (f, &all[i], i, 0); Write_wordlens (f, &all[i], 0); break; case MG_NOVEL_DELTA: break; case MG_NOVEL_HYBRID: break; default: FatalError (1, "Bad novel method"); } } break; } fclose (f); return mem_reqd; }