source: trunk/gsdl/src/mgpp/text/mg_compression_dict.cpp@ 879

Last change on this file since 879 was 860, checked in by rjmcnab, 24 years ago

Fixed a couple of bugs and made building silent if needed.

  • Property svn:keywords set to Author Date Id Revision
File size: 26.7 KB
Line 
1/**************************************************************************
2 *
3 * mg_compression_dict.cpp -- Routines for creating compression dictionary
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: mg_compression_dict.cpp 860 2000-01-18 03:53:24Z rjmcnab $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25
26#include "memlib.h"
27#include "messages.h"
28#include "local_strings.h"
29#include "bitio_gen.h"
30#include "bitio_m_stdio.h"
31#include "mgheap.h"
32#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
33
34#include "mg_files.h"
35#include "locallib.h"
36#include "invf.h"
37#include "text.h"
38#include "words.h"
39#include "mg.h"
40#include "WordData.h"
41
42#define MAXBITS (sizeof(unsigned long) * 8)
43
44/*
45 $Log$
46 Revision 1.2 2000/01/18 03:53:22 rjmcnab
47 Fixed a couple of bugs and made building silent if needed.
48
49 Revision 1.1 2000/01/14 02:26:11 sjboddie
50 Rodgers new C++ mg
51
52 Revision 1.3 1999/10/17 23:43:25 cs025
53 Changes to eradicate Xmalloc
54
55 Revision 1.2 1999/10/11 22:03:03 cs025
56 Changed mistaken use of NTOHUL on a double to NTOHD as per report
57 from Craig Neville-Manning of problems installing mg/gsdl.
58
59 Revision 1.1 1999/10/11 02:57:46 cs025
60 Base install of MG-PP
61
62 Revision 1.1 1999/08/10 21:18:04 sjboddie
63 renamed mg-1.3d directory mg
64
65 Revision 1.4 1999/01/15 03:05:51 rjmcnab
66
67 Renamed lib/heap to be lib/mgheap (it was causing some problems with
68 some versions of the STL libraries which contained a heap.h)
69
70 Revision 1.3 1998/12/17 09:12:52 rjmcnab
71
72 Altered mg to process utf-8 encoded Unicode. The main changes
73 are in the parsing of the input, the casefolding, and the stemming.
74
75 Revision 1.2 1998/11/25 07:55:44 rjmcnab
76
77 Modified mg to that you can specify the stemmer you want
78 to use via a command line option. You specify it to
79 mg_passes during the build process. The number of the
80 stemmer that you used is stored within the inverted
81 dictionary header and the stemmed dictionary header so
82 the correct stemmer is used in later stages of building
83 and querying.
84
85 Revision 1.1 1998/11/17 09:34:52 rjmcnab
86 *** empty log message ***
87
88 * Revision 1.3 1994/10/20 03:56:54 tes
89 * I have rewritten the boolean query optimiser and abstracted out the
90 * components of the boolean query.
91 *
92 * Revision 1.2 1994/09/20 04:41:45 tes
93 * For version 1.1
94 *
95 */
96
97/* #define DEBUG1 */
98
99/* #define DEBUG */
100
101#define is_power_of_two(a) ((a) != 0 && (((a) & ((a)-1)) == 0))
102
103#define MAX_RECALCULATIONS 100
104
105typedef struct DictWordData : public WordData
106 {
107 float saving;
108 float char_bit_cost;
109 u_long num_trans;
110 u_char *word;
111 }
112DictWordData;
113
114static DictWordData *Words[2];
115static u_long Num[2];
116static u_long chars[2];
117
118#define KIND(p) (((p) >= Words[0] && (p) < Words[0] + Num[0]) ? 0 : 1)
119#define IsWord(p) (((p) >= Words[1] && (p) < Words[1] + Num[1]) ? 1 : 0)
120#define IsNonWord(p) (((p) >= Words[0] && (p) < Words[0] + Num[0]) ? 1 : 0)
121
122
123typedef struct DictData
124 {
125 DictWordData **wd;
126 u_long num_wds;
127 u_long chars;
128 }
129DictData;
130
131typedef DictData DictInfo[2];
132
133static DictInfo keep, discard, all;
134static compression_stats_header csh;
135
136static char *file_name = "";
137static u_long novel_method = MG_NOVEL_HUFFMAN_CHARS;
138
139
140static void ReadInWords (char *);
141static void Select_on (int k, heap_comp hc);
142static void Method3 (int k);
143static void Select_all (void);
144static u_long WriteOutWords (char *, u_long, int);
145static int DecFreqIncWL (void *a, void *b);
146static int OccuranceOrder (void *a, void *b);
147
148
149
150int main (int argc, char **argv)
151{
152 int ch;
153 char type = 'C';
154 char mode = '0';
155 int lookback = 2;
156 double k = 0;
157 u_long mem_reqd;
158 opterr = 0;
159 msg_prefix = argv[0];
160 while ((ch = getopt (argc, argv, "0123CPSf:d:l:hk:HDY")) != -1)
161 switch (ch)
162 {
163 case 'H':
164 novel_method = MG_NOVEL_HUFFMAN_CHARS;
165 break;
166 case 'D':
167 novel_method = MG_NOVEL_DELTA;
168 break;
169 case 'Y':
170 novel_method = MG_NOVEL_HYBRID;
171 break;
172 case 'f': /* input file */
173 file_name = optarg;
174 break;
175 case 'd':
176 set_basepath (optarg);
177 break;
178 case 'C':
179 case 'P':
180 case 'S':
181 type = ch;
182 break;
183 case '0':
184 case '1':
185 case '2':
186 case '3':
187 mode = ch;
188 break;
189 case 'l':
190 lookback = atoi (optarg);
191 if (!is_power_of_two (lookback))
192 FatalError (1, "The lookback value must be a power of 2");
193 lookback = floorlog_2 (lookback);
194 break;
195 case 'k':
196 k = atof (optarg) * 1024;
197 break;
198 case 'h':
199 case '?':
200 fprintf (stderr, "usage: %s [-l lookback] [-f input_file]"
201 "[-d data directory] [-h] [-0|-1|-2|-3] [-C|-P|-S] [-k mem (Kb)]\n",
202 argv[0]);
203 exit (1);
204 }
205
206 ReadInWords (file_name);
207
208 if (type == 'C')
209 {
210 Select_all ();
211 mem_reqd = WriteOutWords (file_name, MG_COMPLETE_DICTIONARY, lookback);
212 }
213 else
214 {
215 switch (mode)
216 {
217 case '0':
218 Select_all ();
219 break;
220 case '1':
221#ifndef SILENT
222 Message ("Dictionary limit of %.2f Kb", k / 1024);
223#endif
224 Select_on ((int) k, OccuranceOrder);
225 break;
226 case '2':
227#ifndef SILENT
228 Message ("Dictionary limit of %.2f Kb", k / 1024);
229#endif
230 Select_on ((int) k, DecFreqIncWL);
231 break;
232 case '3':
233#ifndef SILENT
234 Message ("Dictionary limit of %.2f Kb", k / 1024);
235#endif
236 Method3 ((int) k);
237 break;
238 }
239 if (type == 'P')
240 {
241 mem_reqd = WriteOutWords (file_name, MG_PARTIAL_DICTIONARY, lookback);
242 }
243 else
244 {
245 mem_reqd = WriteOutWords (file_name, MG_SEED_DICTIONARY, lookback);
246 }
247 }
248
249#ifndef SILENT
250 Message ("Num words : %8u -> %8u\n", Num[1], keep[1].num_wds);
251 Message ("Num non-words : %8u -> %8u\n", Num[0], keep[0].num_wds);
252 Message ("Chars of words : %8u -> %8u\n", chars[1], keep[1].chars);
253 Message ("Chars of non-words : %8u -> %8u\n", chars[0], keep[0].chars);
254 Message ("Mem usage : %8u -> %8u\n",
255 (Num[0] + Num[1]) * sizeof (char *) + chars[0] + chars[1],
256 (keep[0].num_wds + keep[1].num_wds) * sizeof (char *) +
257 keep[0].chars + keep[1].chars);
258 Message ("Actual mem required : %8u\n", mem_reqd);
259#endif
260 exit (0);
261}
262
263
264
265
266static void
267ReadInWords (char *filename)
268{
269 FILE *f;
270 unsigned long i;
271 f = open_file (filename, TEXT_STATS_DICT_SUFFIX, "rb",
272 MAGIC_STATS_DICT, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
273
274 fread (&csh, sizeof (csh), 1, f);
275
276 /* [RPAP - Jan 97: Endian Ordering] */
277 NTOHUL(csh.num_docs);
278 NTOHD(csh.num_bytes);
279
280 for (i = 0; i < 2; i++)
281 {
282 frags_stats_header fsh;
283 char *buf;
284 DictWordData *wd;
285 chars[i] = 0;
286
287 fread (&fsh, sizeof (fsh), 1, f);
288 /* [RPAP - Jan 97: Endian Ordering] */
289 NTOHUL(fsh.num_frags);
290 NTOHUL(fsh.mem_for_frags);
291
292 Num[i] = all[i].num_wds = fsh.num_frags;
293
294 /* The +1 on the following line is to leave room for the esc code. */
295 all[i].wd = (DictWordData **) Xmalloc (sizeof (DictWordData *) * Num[i] + 1);
296
297 buf = (char *) Xmalloc (fsh.mem_for_frags);
298 wd = Words[i] = (DictWordData *) Xmalloc (sizeof (DictWordData) * Num[i]);
299 unsigned int j;
300 for (j = 0; j < Num[i]; j++, wd++)
301 {
302 int len;
303
304 // read docCount and wordCount
305 wd->read(f);
306
307 // read an mgString
308 len = fgetc (f);
309 wd->word = (u_char *) buf;
310 *buf++ = len;
311 fread (buf, len, 1, f);
312 buf += len;
313 all[i].wd[j] = wd;
314 }
315 chars[i] = fsh.mem_for_frags - fsh.num_frags;
316 }
317 fclose (f);
318}
319
320
321static void
322Alloc_keep_discard (void)
323{
324 keep[0].num_wds = 0;
325 keep[0].wd = (DictWordData **) Xmalloc ((Num[0] + 1) * sizeof (DictWordData *));
326 keep[1].num_wds = 0;
327 keep[1].wd = (DictWordData **) Xmalloc ((Num[1] + 1) * sizeof (DictWordData *));
328 discard[0].num_wds = 0;
329 discard[0].wd = (DictWordData **) Xmalloc ((Num[0] + 1) * sizeof (DictWordData *));
330 discard[1].num_wds = 0;
331 discard[1].wd = (DictWordData **) Xmalloc ((Num[1] + 1) * sizeof (DictWordData *));
332}
333
334
335static int
336sort_comp (const void *a, const void *b)
337{
338 DictWordData *aa = *(DictWordData **) a;
339 DictWordData *bb = *(DictWordData **) b;
340 return casecompare (aa->word, bb->word); /* [RPAP - Jan 97: Stem Index Change] */
341}
342
343
344
345static void
346SortAndCount_DictData (DictData * dd)
347{
348 unsigned int i;
349 DictWordData **wd;
350 qsort (dd->wd, dd->num_wds, sizeof (DictWordData *), sort_comp);
351 dd->chars = 0;
352 wd = dd->wd;
353 for (i = 0; i < dd->num_wds; i++, wd++)
354 dd->chars += (*wd)->word[0];
355}
356
357
358static void
359Select_all (void)
360{
361 unsigned int i;
362 Alloc_keep_discard ();
363 keep[0].num_wds = Num[0];
364 for (i = 0; i < Num[0]; i++)
365 keep[0].wd[i] = Words[0] + i;
366 keep[1].num_wds = Num[1];
367 for (i = 0; i < Num[1]; i++)
368 keep[1].wd[i] = Words[1] + i;
369 SortAndCount_DictData (&keep[0]);
370 SortAndCount_DictData (&keep[1]);
371}
372
373
374
375
376static int
377DecFreqIncWL (void *a, void *b)
378{
379 DictWordData *aa = *(DictWordData **) a;
380 DictWordData *bb = *(DictWordData **) b;
381 if (aa->docCount > bb->docCount)
382 return -1;
383 else if (aa->docCount < bb->docCount)
384 return 1;
385 else
386 return bb->word[0] - aa->word[0];
387}
388
389
390static int
391OccuranceOrder (void *a, void *b)
392{
393 DictWordData *aa = *(DictWordData **) a;
394 DictWordData *bb = *(DictWordData **) b;
395 if (aa->words() > bb->words())
396 return 1;
397 else if (aa->words() < bb->words())
398 return -1;
399 else
400 return 0;
401}
402
403
404static void
405Select_on (int k, heap_comp hc)
406{
407 int i, num, mem;
408 DictWordData **wd;
409
410 Alloc_keep_discard ();
411
412 num = Num[0] + Num[1];
413 wd = (DictWordData **) Xmalloc (num * sizeof (DictWordData *));
414 for (i = 0; (unsigned int)i < Num[0]; i++)
415 wd[i] = Words[0] + i;
416 for (i = 0; (unsigned int)i < Num[1]; i++)
417 wd[i + Num[0]] = Words[1] + i;
418
419 heap_build (wd, sizeof (*wd), num, hc);
420
421 mem = 0;
422 while (k > mem && num)
423 {
424 int idx;
425 DictWordData *word = wd[0];
426#ifdef DEBUG
427 fprintf (stderr, "%4d:%4d:%8d :%8d :%8d : \"%s\"\n",
428 keep[0].num_wds, keep[1].num_wds,
429 mem, word->documents(), word->word, word2str (word->word));
430#endif
431 mem += sizeof (u_char *) + word->word[0];
432 heap_deletehead (wd, sizeof (*wd), &num, hc);
433 idx = KIND (word);
434 keep[idx].wd[keep[idx].num_wds++] = word;
435 }
436
437 for (i = 0; i < num; i++)
438 {
439 DictWordData *word = wd[i];
440 int idx = KIND (word);
441 discard[idx].wd[discard[idx].num_wds++] = word;
442 }
443 SortAndCount_DictData (&keep[0]);
444 SortAndCount_DictData (&keep[1]);
445 SortAndCount_DictData (&discard[0]);
446 SortAndCount_DictData (&discard[1]);
447
448 assert (keep[0].num_wds + discard[0].num_wds == Num[0]);
449 assert (keep[1].num_wds + discard[1].num_wds == Num[1]);
450
451}
452
453
454
455static int
456BigSaving (void *a, void *b)
457{
458 DictWordData *aa = *(DictWordData **) a;
459 DictWordData *bb = *(DictWordData **) b;
460 return (aa->saving > bb->saving) ? -1 : (aa->saving < bb->saving);
461}
462
463static int
464SmallSaving (void *a, void *b)
465{
466 DictWordData *aa = *(DictWordData **) a;
467 DictWordData *bb = *(DictWordData **) b;
468 return (aa->saving < bb->saving) ? -1 : (aa->saving > bb->saving);
469}
470
471
472static void
473CalcCharCounts (DictWordData ** wd, int num,
474 char *char_lens[2], char *len_lens[2],
475 u_long escape[2])
476{
477 long char_freqs[2][256];
478 long len_freqs[2][16];
479 huff_data hd;
480 int i;
481 escape[0] = 0;
482 escape[1] = 0;
483 bzero ((char *) char_freqs, sizeof (char_freqs));
484 bzero ((char *) len_freqs, sizeof (len_freqs));
485 for (i = 0; i < num; i++, wd++)
486 {
487 u_long freq = (*wd)->documents();
488 u_char *buf = (*wd)->word;
489 int len = *buf++;
490 int idx = KIND (*wd);
491 len_freqs[idx][len] += freq;
492 escape[idx] += freq;
493 for (; len; len--, buf++)
494 char_freqs[idx][(u_long) (*buf)] += freq;
495 }
496 Generate_Huffman_Data (256, char_freqs[0], &hd, NULL);
497 char_lens[0] = hd.clens;
498 Generate_Huffman_Data (256, char_freqs[1], &hd, NULL);
499 char_lens[1] = hd.clens;
500
501 Generate_Huffman_Data (16, len_freqs[0], &hd, NULL);
502 len_lens[0] = hd.clens;
503 Generate_Huffman_Data (16, len_freqs[1], &hd, NULL);
504 len_lens[1] = hd.clens;
505}
506
507
508
509
510
511inline void CalcBitCostForWordData(DictWordData **word, int num,
512 double freqs_trans[2], u_long escape[2],
513 char * char_lens[2], char *len_lens[2],
514 double esc[2], int num_trans)
515{
516 int j;
517
518 for (j = 0; j < num; j++, word++)
519 {
520 float cbc, wbc;
521 u_char *buf = (*word)->word;
522 int len = *buf++;
523 u_long freq = (*word)->documents();
524 int idx = KIND (*word);
525
526 cbc = len_lens[idx][len];
527 for (; len; len--, buf++)
528 cbc += char_lens[idx][(u_long) (*buf)];
529
530 (*word)->char_bit_cost = (cbc + esc[idx]) * freq;
531
532 wbc = -log2 (freq / (freqs_trans[idx] + escape[idx])) * freq;
533
534 (*word)->saving = ((*word)->char_bit_cost - wbc) /
535 (sizeof (char *) + (*word)->word[0]);
536
537 (*word)->num_trans = num_trans;
538 }
539}
540
541
542
543void
544CalcBitCost (DictWordData ** discard_word, int discard_num,
545 DictWordData ** keep_word, int keep_num, double freqs_trans[2],
546 u_long escape[2], int num_trans)
547{
548 char *char_lens[2];
549 char *len_lens[2];
550 double esc[2];
551 CalcCharCounts (discard_word, discard_num, char_lens, len_lens, escape);
552 esc[0] = -log2 (escape[0] / (freqs_trans[0] + escape[0]));
553 esc[1] = -log2 (escape[1] / (freqs_trans[1] + escape[1]));
554 CalcBitCostForWordData(discard_word, discard_num, freqs_trans, escape,
555 char_lens, len_lens, esc, num_trans);
556 CalcBitCostForWordData(keep_word, keep_num, freqs_trans, escape,
557 char_lens, len_lens, esc, num_trans);
558 Xfree (char_lens[0]);
559 Xfree (char_lens[1]);
560 Xfree (len_lens[0]);
561 Xfree (len_lens[1]);
562}
563
564inline void m3_transferWord(DictWordData **toHeap, int *toNum, heap_comp toSaving,
565 DictWordData **fromHeap, int *fromNum, heap_comp fromSaving,
566 int *num_trans, double freqs_trans[], int *mem, double *total,
567 int *count)
568{
569 /* Transfer the word at the top of the keep heap to the
570 discard heap. */
571 DictWordData *word = fromHeap[0];
572 int idx = KIND (word);
573 heap_deletehead (fromHeap, sizeof (word), fromNum, fromSaving);
574 toHeap[*toNum] = word;
575 heap_additem (toHeap, sizeof (word), toNum, toSaving);
576 freqs_trans[idx] -= word->documents();
577 *mem = (*mem) - sizeof (u_char *) + word->word[0];
578 *num_trans += 1;
579 *total += word->saving;
580 *count += 1;
581#ifdef DEBUG
582 fprintf (stderr,
583 "KEEP -> DISCARD %8d :%8d :%8.0f :%8.0f : \"%s\"\n",
584 *mem, word->documents(), word->char_bit_cost,
585 word->saving, word2str (word->word));
586#endif
587}
588
589inline void m3_storeWord(DictWordData **wordHeap, int wordNum, int num_trans,
590 double freqs_trans[], u_long escape[], heap_comp hc)
591{
592 DictWordData *word = wordHeap[0];
593 int idx = KIND (word);
594 float wbc;
595#ifdef DEBUG1
596 fprintf (stderr, "KEEP \"%s\" %.2f ->", word2str (word->word),
597 word->saving);
598#endif
599 wbc = -log2 (word->documents() / (freqs_trans[idx] + escape[idx])) *
600 word->documents();
601 word->saving = (word->char_bit_cost - wbc) /
602 (sizeof (char *) + word->word[0]);
603#ifdef DEBUG1
604 fprintf (stderr, " %.2f\n", word->saving);
605#endif
606 word->num_trans = num_trans;
607 heap_changedhead (wordHeap, sizeof (word), wordNum, hc);
608}
609
610
611static void
612Method3 (int k)
613{
614 int i, keep_num, discard_num, mem, num_trans, recalc_reqd;
615 int keep_to_discard = 0;
616 int discard_to_keep = 0;
617 int recalcs = 0;
618 double freqs_trans[2], total;
619 u_long escape[2];
620 DictWordData **keep_heap, **discard_heap;
621
622 freqs_trans[0] = freqs_trans[1] = 0;
623 num_trans = 0;
624
625 discard_num = Num[0] + Num[1];
626 discard_heap = (DictWordData **) Xmalloc (discard_num * sizeof (DictWordData *));
627
628 keep_num = 0;
629 keep_heap = (DictWordData **) Xmalloc (discard_num * sizeof (DictWordData *));
630
631
632 for (i = 0; (unsigned int)i < Num[0]; i++)
633 discard_heap[i] = Words[0] + i;
634 for (i = 0; (unsigned int)i < Num[1]; i++)
635 discard_heap[i + Num[0]] = Words[1] + i;
636
637 heap_build (discard_heap, sizeof (*discard_heap), discard_num, DecFreqIncWL);
638
639 mem = 0;
640 while (k > mem && discard_num)
641 {
642 DictWordData *word = discard_heap[0];
643 mem += sizeof (u_char *) + word->word[0];
644 heap_deletehead (discard_heap, sizeof (word), &discard_num, DecFreqIncWL);
645 keep_heap[keep_num++] = word;
646 freqs_trans[KIND (word)] += word->documents();
647 num_trans++;
648 }
649
650 CalcBitCost (discard_heap, discard_num, keep_heap, keep_num,
651 freqs_trans, escape, num_trans);
652 heap_build (discard_heap, sizeof (*discard_heap), discard_num, BigSaving);
653 heap_build (keep_heap, sizeof (*keep_heap), keep_num, SmallSaving);
654
655 total = 0;
656 recalc_reqd = 0;
657 while (keep_num && discard_num)
658 {
659 if ((keep_num && keep_heap[0]->num_trans < (unsigned)num_trans) ||
660 (discard_num && discard_heap[0]->num_trans < (unsigned)num_trans))
661 {
662 if (keep_num && keep_heap[0]->num_trans < (unsigned)num_trans)
663 {
664 m3_storeWord(keep_heap, keep_num, num_trans, freqs_trans, escape, SmallSaving);
665 }
666
667 if (discard_num && discard_heap[0]->num_trans < (unsigned)num_trans)
668 {
669 m3_storeWord(discard_heap, discard_num, num_trans, freqs_trans, escape, BigSaving);
670 }
671 }
672 else if (keep_heap[0]->saving < discard_heap[0]->saving)
673 {
674 assert (keep_num && discard_num);
675 if (keep_num && mem + sizeof (char *) + discard_heap[0]->word[0] > (unsigned)k)
676 {
677 /* Transfer the word at the top of the keep heap to the
678 discard heap. */
679 m3_transferWord(discard_heap, &discard_num, BigSaving,
680 keep_heap, &keep_num, SmallSaving,
681 &num_trans, freqs_trans, &mem, &total, &keep_to_discard);
682 }
683 else
684 {
685 /* Transfer the word at the top of the discard heap to the
686 keep heap. */
687 m3_transferWord(keep_heap, &keep_num, SmallSaving,
688 discard_heap, &discard_num, BigSaving,
689 &num_trans, freqs_trans, &mem, &total, &discard_to_keep);
690 }
691
692 recalc_reqd = 1;
693
694 }
695 else
696 {
697 if (recalc_reqd == 0)
698 break;
699#ifdef DEBUG1
700 fprintf (stderr, "--------------\n");
701#endif
702 if (recalcs == MAX_RECALCULATIONS)
703 break;
704 CalcBitCost (discard_heap, discard_num, keep_heap, keep_num,
705 freqs_trans, escape, num_trans);
706 heap_build (discard_heap, sizeof (*discard_heap),
707 discard_num, BigSaving);
708 heap_build (keep_heap, sizeof (keep_heap), keep_num, SmallSaving);
709 recalc_reqd = 0;
710 recalcs++;
711 }
712 }
713
714 Alloc_keep_discard ();
715
716 for (i = 0; i < discard_num; i++)
717 {
718 DictWordData *word = discard_heap[i];
719 int idx = KIND (word);
720 assert (IsWord (word) || IsNonWord (word));
721 discard[idx].wd[discard[idx].num_wds++] = word;
722 }
723 for (i = 0; i < keep_num; i++)
724 {
725 DictWordData *word = keep_heap[i];
726 int idx = KIND (word);
727 assert (IsWord (word) || IsNonWord (word));
728 keep[idx].wd[keep[idx].num_wds++] = word;
729 }
730 SortAndCount_DictData (&keep[0]);
731 SortAndCount_DictData (&keep[1]);
732 SortAndCount_DictData (&discard[0]);
733 SortAndCount_DictData (&discard[1]);
734
735 assert (keep[0].num_wds + discard[0].num_wds == Num[0]);
736 assert (keep[1].num_wds + discard[1].num_wds == Num[1]);
737#ifndef SILENT
738 Message ("Keep -> Discard : %8d", keep_to_discard);
739 Message ("Discard -> Keep : %8d", discard_to_keep);
740 Message ("Huffman Recalculations : %8d", recalcs);
741 if (recalcs == MAX_RECALCULATIONS)
742 Message ("WARNING: The number of recalculations == %d", MAX_RECALCULATIONS);
743#endif
744}
745
746
747
748
749
750
751
752
753
754
755/****************************************************************************
756 ***** *****
757 ***** Dictionary saving code *****
758 ***** *****
759 ****************************************************************************/
760
761
762
763static void
764Write_cdh (FILE * f, compression_dict_header * cdh)
765{
766 /* [RPAP - Jan 97: Endian Ordering] */
767 int i;
768 compression_dict_header tmp = *cdh;
769 HTONUL(tmp.dict_type);
770 HTONUL(tmp.novel_method);
771 for (i = 0; i < TEXT_PARAMS; i++)
772 HTONUL(tmp.params[i]);
773 HTONUL(tmp.num_words[0]);
774 HTONUL(tmp.num_words[1]);
775 HTONUL(tmp.num_word_chars[0]);
776 HTONUL(tmp.num_word_chars[1]);
777 HTONUL(tmp.lookback);
778
779 fwrite (&tmp, sizeof (tmp), 1, f);
780}
781
782
783static void
784Write_words (FILE * f, DictData * dd)
785{
786 unsigned int i;
787 u_char *curr, *prev = NULL;
788 for (i = 0; i < dd->num_wds; i++)
789 {
790 int len;
791 curr = dd->wd[i]->word;
792 if (prev)
793 /* look for prefix match with prev string */
794 len = prefixlen (prev, curr);
795 else
796 len = 0;
797 fputc ((len << 4) + (curr[0] - len), f);
798 fwrite (curr + len + 1, sizeof (u_char), curr[0] - len, f);
799 prev = curr;
800 }
801
802}
803
804
805static int
806Uncompressed_size (DictData * dd)
807{
808 unsigned int i, us;
809 for (us = i = 0; i < dd->num_wds; i++)
810 us += dd->wd[i]->word[0];
811 return us;
812}
813
814
815static u_long
816Write_data (FILE * f, DictData * dd, int lookback)
817{
818 u_long mem_reqd;
819 huff_data *hd;
820 int i;
821 u_long us = dd->num_wds;
822 long *freqs;
823 u_long huff_words_size[MAX_HUFFCODE_LEN + 1];
824 u_long lencounts[MAX_HUFFCODE_LEN + 1];
825 u_char *lastword[MAX_HUFFCODE_LEN + 1];
826
827 if (!(freqs = new long [dd->num_wds]))
828 FatalError (1, "Unable to allocate memory for freqs");
829
830 for (i = 0; (unsigned)i < dd->num_wds; i++)
831 {
832 freqs[i] = dd->wd[i]->documents();
833 us += dd->wd[i]->word[0];
834 }
835
836 if (!(hd = Generate_Huffman_Data (dd->num_wds, freqs, NULL, NULL)))
837 FatalError (1, "Unable to allocate memory for huffman data");
838
839 delete (freqs);
840 freqs = NULL;
841
842 if (Write_Huffman_Data (f, hd) == -1)
843 FatalError (1, "Unable to write huffman data");
844
845 HTONUL(us); /* [RPAP - Jan 97: Endian Ordering] */
846 fwrite (&us, sizeof (us), 1, f);
847 NTOHUL(us); /* [RPAP - Jan 97: Endian Ordering] */
848
849
850/* Calculate the amount of memory that will be required to store the text for
851 each different huffman code len. Every 1<<lookback words for each different
852 codelen length will not be prefixed by previous strings. */
853
854
855 bzero ((char *) &huff_words_size, sizeof (huff_words_size));
856 bzero ((char *) &lencounts, sizeof (lencounts));
857
858 mem_reqd = 0;
859
860 for (i = 0; (unsigned)i < dd->num_wds; i++)
861 {
862 int codelen = hd->clens[i];
863 u_char *word = dd->wd[i]->word;
864
865 if (!codelen)
866 FatalError (1, "The length of a code for a word was zero");
867
868 huff_words_size[codelen] += word[0] + 1;
869 mem_reqd += word[0] + (lookback != 0);
870#if 0
871 if ((lencounts[codelen] & ((1 << lookback) - 1)) == 0)
872 lastword[codelen] = word;
873 else
874 huff_words_size[codelen] -= prefixlen (lastword[codelen], word);
875#else
876 if ((lencounts[codelen] & ((1 << lookback) - 1)) != 0)
877 {
878 int save = prefixlen (lastword[codelen], word);
879 mem_reqd -= save;
880 huff_words_size[codelen] -= save;
881 }
882 else
883 {
884 mem_reqd += sizeof (u_char *);
885 }
886 lastword[codelen] = word;
887#endif
888 lencounts[codelen]++;
889 }
890
891 /* [RPAP - Jan 97: Endian Ordering] */
892 for (i = hd->mincodelen; i < hd->maxcodelen + 1; i++)
893 HTONUL(huff_words_size[i]);
894
895 fwrite (huff_words_size + hd->mincodelen, sizeof (*huff_words_size),
896 hd->maxcodelen - hd->mincodelen + 1, f);
897
898 /* [RPAP - Jan 97: Endian Ordering] */
899 for (i = hd->mincodelen; i < hd->maxcodelen + 1; i++)
900 NTOHUL(huff_words_size[i]);
901
902 Write_words (f, dd);
903
904 delete hd->clens;
905 delete hd;
906
907 return mem_reqd;
908}
909
910
911
912static void
913Write_charfreqs (FILE * f, DictData * dd, int words,
914 int zero_freq_permitted)
915{
916 unsigned int j;
917 long freqs[256];
918 DictWordData **wd = dd->wd;
919 huff_data *hd;
920
921 bzero ((char *) freqs, sizeof (freqs));
922
923 for (j = 0; j < dd->num_wds; j++, wd++)
924 {
925 u_char *buf = (*wd)->word;
926 int len = *buf++;
927 for (; len; len--, buf++)
928 freqs[(u_long) (*buf)] += (*wd)->documents();
929 }
930
931 if (!zero_freq_permitted)
932 for (j = 0; j < 256; j++)
933 if (!freqs[j] && PESINAWORD (j) == words)
934 freqs[j] = 1;
935
936 if (!(hd = Generate_Huffman_Data (256, freqs, NULL, NULL)))
937 FatalError (1, "Unable to allocate memory for huffman data");
938
939 if (Write_Huffman_Data (f, hd) == -1)
940 FatalError (1, "Unable to write huffman data");
941
942 delete hd->clens;
943 delete hd;
944}
945
946
947
948
949static void
950Write_wordlens (FILE * f, DictData * dd, int zero_freq_permitted)
951{
952 unsigned int j;
953 long freqs[16];
954 DictWordData **wd = dd->wd;
955 huff_data *hd;
956
957 bzero ((char *) freqs, sizeof (freqs));
958
959 for (j = 0; j < dd->num_wds; j++, wd++)
960 freqs[(*wd)->word[0]] += (*wd)->documents();
961
962 if (!zero_freq_permitted)
963 for (j = 0; j < 16; j++)
964 if (!freqs[j])
965 freqs[j] = 1;
966
967 if (!(hd = Generate_Huffman_Data (16, freqs, NULL, NULL)))
968 FatalError (1, "Unable to allocate memory for huffman data");
969
970 if (Write_Huffman_Data (f, hd) == -1)
971 FatalError (1, "Unable to write huffman data");
972
973
974 delete hd->clens;
975 delete hd;
976}
977
978
979
980static u_long
981WriteOutWords (char *file_name, u_long type, int lookback)
982{
983 FILE *f;
984 int i;
985 u_long mem_reqd = 0;
986
987 compression_dict_header cdh;
988 f = create_file (file_name, TEXT_DICT_SUFFIX, "w+b",
989 MAGIC_DICT, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
990
991 bzero((char *) &cdh, sizeof(cdh));
992
993 cdh.dict_type = type;
994 cdh.novel_method = (type == MG_SEED_DICTIONARY) ? novel_method :
995 MG_NOVEL_HUFFMAN_CHARS;
996
997 cdh.num_words[1] = keep[1].num_wds;
998 cdh.num_words[0] = keep[0].num_wds;
999 cdh.num_word_chars[1] = Uncompressed_size (&keep[1]);
1000 cdh.num_word_chars[0] = Uncompressed_size (&keep[0]);
1001 cdh.lookback = lookback;
1002
1003 Write_cdh (f, &cdh);
1004
1005 for (i = 0; i < 2; i++)
1006 switch (type)
1007 {
1008 case MG_COMPLETE_DICTIONARY:
1009 {
1010 mem_reqd += Write_data (f, &keep[i], lookback);
1011 }
1012 break;
1013 case MG_PARTIAL_DICTIONARY:
1014 {
1015 if (keep[i].num_wds)
1016 {
1017 int j;
1018 DictWordData esc;
1019 esc.docCount = 0;
1020 esc.word = (u_char *) "";
1021 keep[i].wd[keep[i].num_wds++] = &esc;
1022 for (j = 0; (unsigned)j < discard[i].num_wds; j++)
1023 esc.docCount += discard[i].wd[j]->documents();
1024 if (!esc.docCount)
1025 esc.docCount++;
1026 mem_reqd += Write_data (f, &keep[i], lookback);
1027 }
1028 Write_charfreqs (f, &discard[i], i, 1);
1029 Write_wordlens (f, &discard[i], 1);
1030 }
1031 break;
1032 case MG_SEED_DICTIONARY:
1033 {
1034 if (keep[i].num_wds)
1035 {
1036 int j;
1037 DictWordData esc;
1038 esc.docCount = 0;
1039 esc.word = (u_char *) "";
1040 keep[i].wd[keep[i].num_wds++] = &esc;
1041 for (j = 0; (unsigned)j < all[i].num_wds; j++)
1042 if (all[i].wd[j]->documents() == 1)
1043 esc.docCount++;
1044 if (!esc.docCount)
1045 esc.docCount++;
1046 mem_reqd += Write_data (f, &keep[i], lookback);
1047 }
1048 switch (novel_method)
1049 {
1050 case MG_NOVEL_HUFFMAN_CHARS:
1051 Write_charfreqs (f, &all[i], i, 0);
1052 Write_wordlens (f, &all[i], 0);
1053 break;
1054 case MG_NOVEL_DELTA:
1055 break;
1056 case MG_NOVEL_HYBRID:
1057 break;
1058 default:
1059 FatalError (1, "Bad novel method");
1060 }
1061 }
1062 break;
1063 }
1064 fclose (f);
1065 return mem_reqd;
1066}
Note: See TracBrowser for help on using the repository browser.