source: trunk/gsdl/src/mgpp/text/mg_compression_dict.cpp@ 856

Last change on this file since 856 was 856, checked in by sjboddie, 24 years ago

Rodgers new C++ mg

  • Property svn:keywords set to Author Date Id Revision
File size: 26.5 KB
Line 
1/**************************************************************************
2 *
3 * mg_compression_dict.cpp -- Routines for creating compression dictionary
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: mg_compression_dict.cpp 856 2000-01-14 02:26:25Z sjboddie $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25
26#include "memlib.h"
27#include "messages.h"
28#include "local_strings.h"
29#include "bitio_gen.h"
30#include "bitio_m_stdio.h"
31#include "mgheap.h"
32#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
33
34#include "mg_files.h"
35#include "locallib.h"
36#include "invf.h"
37#include "text.h"
38#include "words.h"
39#include "mg.h"
40#include "WordData.h"
41
42#define MAXBITS (sizeof(unsigned long) * 8)
43
44/*
45 $Log$
46 Revision 1.1 2000/01/14 02:26:11 sjboddie
47 Rodgers new C++ mg
48
49 Revision 1.3 1999/10/17 23:43:25 cs025
50 Changes to eradicate Xmalloc
51
52 Revision 1.2 1999/10/11 22:03:03 cs025
53 Changed mistaken use of NTOHUL on a double to NTOHD as per report
54 from Craig Neville-Manning of problems installing mg/gsdl.
55
56 Revision 1.1 1999/10/11 02:57:46 cs025
57 Base install of MG-PP
58
59 Revision 1.1 1999/08/10 21:18:04 sjboddie
60 renamed mg-1.3d directory mg
61
62 Revision 1.4 1999/01/15 03:05:51 rjmcnab
63
64 Renamed lib/heap to be lib/mgheap (it was causing some problems with
65 some versions of the STL libraries which contained a heap.h)
66
67 Revision 1.3 1998/12/17 09:12:52 rjmcnab
68
69 Altered mg to process utf-8 encoded Unicode. The main changes
70 are in the parsing of the input, the casefolding, and the stemming.
71
72 Revision 1.2 1998/11/25 07:55:44 rjmcnab
73
74 Modified mg to that you can specify the stemmer you want
75 to use via a command line option. You specify it to
76 mg_passes during the build process. The number of the
77 stemmer that you used is stored within the inverted
78 dictionary header and the stemmed dictionary header so
79 the correct stemmer is used in later stages of building
80 and querying.
81
82 Revision 1.1 1998/11/17 09:34:52 rjmcnab
83 *** empty log message ***
84
85 * Revision 1.3 1994/10/20 03:56:54 tes
86 * I have rewritten the boolean query optimiser and abstracted out the
87 * components of the boolean query.
88 *
89 * Revision 1.2 1994/09/20 04:41:45 tes
90 * For version 1.1
91 *
92 */
93
94/* #define DEBUG1 */
95
96/* #define DEBUG */
97
98#define is_power_of_two(a) ((a) != 0 && (((a) & ((a)-1)) == 0))
99
100#define MAX_RECALCULATIONS 100
101
102typedef struct DictWordData : public WordData
103 {
104 float saving;
105 float char_bit_cost;
106 u_long num_trans;
107 u_char *word;
108 }
109DictWordData;
110
111static DictWordData *Words[2];
112static u_long Num[2];
113static u_long chars[2];
114
115#define KIND(p) (((p) >= Words[0] && (p) < Words[0] + Num[0]) ? 0 : 1)
116#define IsWord(p) (((p) >= Words[1] && (p) < Words[1] + Num[1]) ? 1 : 0)
117#define IsNonWord(p) (((p) >= Words[0] && (p) < Words[0] + Num[0]) ? 1 : 0)
118
119
120typedef struct DictData
121 {
122 DictWordData **wd;
123 u_long num_wds;
124 u_long chars;
125 }
126DictData;
127
128typedef DictData DictInfo[2];
129
130static DictInfo keep, discard, all;
131static compression_stats_header csh;
132
133static char *file_name = "";
134static u_long novel_method = MG_NOVEL_HUFFMAN_CHARS;
135
136
137static void ReadInWords (char *);
138static void Select_on (int k, heap_comp hc);
139static void Method3 (int k);
140static void Select_all (void);
141static u_long WriteOutWords (char *, u_long, int);
142static int DecFreqIncWL (void *a, void *b);
143static int OccuranceOrder (void *a, void *b);
144
145
146
147int main (int argc, char **argv)
148{
149 int ch;
150 char type = 'C';
151 char mode = '0';
152 int lookback = 2;
153 double k = 0;
154 u_long mem_reqd;
155 opterr = 0;
156 msg_prefix = argv[0];
157 while ((ch = getopt (argc, argv, "0123CPSf:d:l:hk:HDY")) != -1)
158 switch (ch)
159 {
160 case 'H':
161 novel_method = MG_NOVEL_HUFFMAN_CHARS;
162 break;
163 case 'D':
164 novel_method = MG_NOVEL_DELTA;
165 break;
166 case 'Y':
167 novel_method = MG_NOVEL_HYBRID;
168 break;
169 case 'f': /* input file */
170 file_name = optarg;
171 break;
172 case 'd':
173 set_basepath (optarg);
174 break;
175 case 'C':
176 case 'P':
177 case 'S':
178 type = ch;
179 break;
180 case '0':
181 case '1':
182 case '2':
183 case '3':
184 mode = ch;
185 break;
186 case 'l':
187 lookback = atoi (optarg);
188 if (!is_power_of_two (lookback))
189 FatalError (1, "The lookback value must be a power of 2");
190 lookback = floorlog_2 (lookback);
191 break;
192 case 'k':
193 k = atof (optarg) * 1024;
194 break;
195 case 'h':
196 case '?':
197 fprintf (stderr, "usage: %s [-l lookback] [-f input_file]"
198 "[-d data directory] [-h] [-0|-1|-2|-3] [-C|-P|-S] [-k mem (Kb)]\n",
199 argv[0]);
200 exit (1);
201 }
202
203 ReadInWords (file_name);
204
205 if (type == 'C')
206 {
207 Select_all ();
208 mem_reqd = WriteOutWords (file_name, MG_COMPLETE_DICTIONARY, lookback);
209 }
210 else
211 {
212 switch (mode)
213 {
214 case '0':
215 Select_all ();
216 break;
217 case '1':
218 Message ("Dictionary limit of %.2f Kb", k / 1024);
219 Select_on ((int) k, OccuranceOrder);
220 break;
221 case '2':
222 Message ("Dictionary limit of %.2f Kb", k / 1024);
223 Select_on ((int) k, DecFreqIncWL);
224 break;
225 case '3':
226 Message ("Dictionary limit of %.2f Kb", k / 1024);
227 Method3 ((int) k);
228 break;
229 }
230 if (type == 'P')
231 {
232 mem_reqd = WriteOutWords (file_name, MG_PARTIAL_DICTIONARY, lookback);
233 }
234 else
235 {
236 mem_reqd = WriteOutWords (file_name, MG_SEED_DICTIONARY, lookback);
237 }
238 }
239
240 Message ("Num words : %8u -> %8u\n", Num[1], keep[1].num_wds);
241 Message ("Num non-words : %8u -> %8u\n", Num[0], keep[0].num_wds);
242 Message ("Chars of words : %8u -> %8u\n", chars[1], keep[1].chars);
243 Message ("Chars of non-words : %8u -> %8u\n", chars[0], keep[0].chars);
244 Message ("Mem usage : %8u -> %8u\n",
245 (Num[0] + Num[1]) * sizeof (char *) + chars[0] + chars[1],
246 (keep[0].num_wds + keep[1].num_wds) * sizeof (char *) +
247 keep[0].chars + keep[1].chars);
248 Message ("Actual mem required : %8u\n", mem_reqd);
249
250 exit (0);
251}
252
253
254
255
256static void
257ReadInWords (char *filename)
258{
259 FILE *f;
260 unsigned long i;
261 f = open_file (filename, TEXT_STATS_DICT_SUFFIX, "rb",
262 MAGIC_STATS_DICT, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
263
264 fread (&csh, sizeof (csh), 1, f);
265
266 /* [RPAP - Jan 97: Endian Ordering] */
267 NTOHUL(csh.num_docs);
268 NTOHD(csh.num_bytes);
269
270 for (i = 0; i < 2; i++)
271 {
272 frags_stats_header fsh;
273 char *buf;
274 DictWordData *wd;
275 chars[i] = 0;
276
277 fread (&fsh, sizeof (fsh), 1, f);
278 /* [RPAP - Jan 97: Endian Ordering] */
279 NTOHUL(fsh.num_frags);
280 NTOHUL(fsh.mem_for_frags);
281
282 Num[i] = all[i].num_wds = fsh.num_frags;
283
284 /* The +1 on the following line is to leave room for the esc code. */
285 all[i].wd = (DictWordData **) Xmalloc (sizeof (DictWordData *) * Num[i] + 1);
286
287 buf = (char *) Xmalloc (fsh.mem_for_frags);
288 wd = Words[i] = (DictWordData *) Xmalloc (sizeof (DictWordData) * Num[i]);
289 unsigned int j;
290 for (j = 0; j < Num[i]; j++, wd++)
291 {
292 int len;
293
294 // read docCount and wordCount
295 wd->read(f);
296
297 // read an mgString
298 len = fgetc (f);
299 wd->word = (u_char *) buf;
300 *buf++ = len;
301 fread (buf, len, 1, f);
302 buf += len;
303 all[i].wd[j] = wd;
304 }
305 chars[i] = fsh.mem_for_frags - fsh.num_frags;
306 }
307 fclose (f);
308}
309
310
311static void
312Alloc_keep_discard (void)
313{
314 keep[0].num_wds = 0;
315 keep[0].wd = (DictWordData **) Xmalloc ((Num[0] + 1) * sizeof (DictWordData *));
316 keep[1].num_wds = 0;
317 keep[1].wd = (DictWordData **) Xmalloc ((Num[1] + 1) * sizeof (DictWordData *));
318 discard[0].num_wds = 0;
319 discard[0].wd = (DictWordData **) Xmalloc ((Num[0] + 1) * sizeof (DictWordData *));
320 discard[1].num_wds = 0;
321 discard[1].wd = (DictWordData **) Xmalloc ((Num[1] + 1) * sizeof (DictWordData *));
322}
323
324
325static int
326sort_comp (const void *a, const void *b)
327{
328 DictWordData *aa = *(DictWordData **) a;
329 DictWordData *bb = *(DictWordData **) b;
330 return casecompare (aa->word, bb->word); /* [RPAP - Jan 97: Stem Index Change] */
331}
332
333
334
335static void
336SortAndCount_DictData (DictData * dd)
337{
338 unsigned int i;
339 DictWordData **wd;
340 qsort (dd->wd, dd->num_wds, sizeof (DictWordData *), sort_comp);
341 dd->chars = 0;
342 wd = dd->wd;
343 for (i = 0; i < dd->num_wds; i++, wd++)
344 dd->chars += (*wd)->word[0];
345}
346
347
348static void
349Select_all (void)
350{
351 unsigned int i;
352 Alloc_keep_discard ();
353 keep[0].num_wds = Num[0];
354 for (i = 0; i < Num[0]; i++)
355 keep[0].wd[i] = Words[0] + i;
356 keep[1].num_wds = Num[1];
357 for (i = 0; i < Num[1]; i++)
358 keep[1].wd[i] = Words[1] + i;
359 SortAndCount_DictData (&keep[0]);
360 SortAndCount_DictData (&keep[1]);
361}
362
363
364
365
366static int
367DecFreqIncWL (void *a, void *b)
368{
369 DictWordData *aa = *(DictWordData **) a;
370 DictWordData *bb = *(DictWordData **) b;
371 if (aa->docCount > bb->docCount)
372 return -1;
373 else if (aa->docCount < bb->docCount)
374 return 1;
375 else
376 return bb->word[0] - aa->word[0];
377}
378
379
380static int
381OccuranceOrder (void *a, void *b)
382{
383 DictWordData *aa = *(DictWordData **) a;
384 DictWordData *bb = *(DictWordData **) b;
385 if (aa->words() > bb->words())
386 return 1;
387 else if (aa->words() < bb->words())
388 return -1;
389 else
390 return 0;
391}
392
393
394static void
395Select_on (int k, heap_comp hc)
396{
397 int i, num, mem;
398 DictWordData **wd;
399
400 Alloc_keep_discard ();
401
402 num = Num[0] + Num[1];
403 wd = (DictWordData **) Xmalloc (num * sizeof (DictWordData *));
404 for (i = 0; (unsigned int)i < Num[0]; i++)
405 wd[i] = Words[0] + i;
406 for (i = 0; (unsigned int)i < Num[1]; i++)
407 wd[i + Num[0]] = Words[1] + i;
408
409 heap_build (wd, sizeof (*wd), num, hc);
410
411 mem = 0;
412 while (k > mem && num)
413 {
414 int idx;
415 DictWordData *word = wd[0];
416#ifdef DEBUG
417 fprintf (stderr, "%4d:%4d:%8d :%8d :%8d : \"%s\"\n",
418 keep[0].num_wds, keep[1].num_wds,
419 mem, word->documents(), word->word, word2str (word->word));
420#endif
421 mem += sizeof (u_char *) + word->word[0];
422 heap_deletehead (wd, sizeof (*wd), &num, hc);
423 idx = KIND (word);
424 keep[idx].wd[keep[idx].num_wds++] = word;
425 }
426
427 for (i = 0; i < num; i++)
428 {
429 DictWordData *word = wd[i];
430 int idx = KIND (word);
431 discard[idx].wd[discard[idx].num_wds++] = word;
432 }
433 SortAndCount_DictData (&keep[0]);
434 SortAndCount_DictData (&keep[1]);
435 SortAndCount_DictData (&discard[0]);
436 SortAndCount_DictData (&discard[1]);
437
438 assert (keep[0].num_wds + discard[0].num_wds == Num[0]);
439 assert (keep[1].num_wds + discard[1].num_wds == Num[1]);
440
441}
442
443
444
445static int
446BigSaving (void *a, void *b)
447{
448 DictWordData *aa = *(DictWordData **) a;
449 DictWordData *bb = *(DictWordData **) b;
450 return (aa->saving > bb->saving) ? -1 : (aa->saving < bb->saving);
451}
452
453static int
454SmallSaving (void *a, void *b)
455{
456 DictWordData *aa = *(DictWordData **) a;
457 DictWordData *bb = *(DictWordData **) b;
458 return (aa->saving < bb->saving) ? -1 : (aa->saving > bb->saving);
459}
460
461
462static void
463CalcCharCounts (DictWordData ** wd, int num,
464 char *char_lens[2], char *len_lens[2],
465 u_long escape[2])
466{
467 long char_freqs[2][256];
468 long len_freqs[2][16];
469 huff_data hd;
470 int i;
471 escape[0] = 0;
472 escape[1] = 0;
473 bzero ((char *) char_freqs, sizeof (char_freqs));
474 bzero ((char *) len_freqs, sizeof (len_freqs));
475 for (i = 0; i < num; i++, wd++)
476 {
477 u_long freq = (*wd)->documents();
478 u_char *buf = (*wd)->word;
479 int len = *buf++;
480 int idx = KIND (*wd);
481 len_freqs[idx][len] += freq;
482 escape[idx] += freq;
483 for (; len; len--, buf++)
484 char_freqs[idx][(u_long) (*buf)] += freq;
485 }
486 Generate_Huffman_Data (256, char_freqs[0], &hd, NULL);
487 char_lens[0] = hd.clens;
488 Generate_Huffman_Data (256, char_freqs[1], &hd, NULL);
489 char_lens[1] = hd.clens;
490
491 Generate_Huffman_Data (16, len_freqs[0], &hd, NULL);
492 len_lens[0] = hd.clens;
493 Generate_Huffman_Data (16, len_freqs[1], &hd, NULL);
494 len_lens[1] = hd.clens;
495}
496
497
498
499
500
501inline void CalcBitCostForWordData(DictWordData **word, int num,
502 double freqs_trans[2], u_long escape[2],
503 char * char_lens[2], char *len_lens[2],
504 double esc[2], int num_trans)
505{
506 int j;
507
508 for (j = 0; j < num; j++, word++)
509 {
510 float cbc, wbc;
511 u_char *buf = (*word)->word;
512 int len = *buf++;
513 u_long freq = (*word)->documents();
514 int idx = KIND (*word);
515
516 cbc = len_lens[idx][len];
517 for (; len; len--, buf++)
518 cbc += char_lens[idx][(u_long) (*buf)];
519
520 (*word)->char_bit_cost = (cbc + esc[idx]) * freq;
521
522 wbc = -log2 (freq / (freqs_trans[idx] + escape[idx])) * freq;
523
524 (*word)->saving = ((*word)->char_bit_cost - wbc) /
525 (sizeof (char *) + (*word)->word[0]);
526
527 (*word)->num_trans = num_trans;
528 }
529}
530
531
532
533void
534CalcBitCost (DictWordData ** discard_word, int discard_num,
535 DictWordData ** keep_word, int keep_num, double freqs_trans[2],
536 u_long escape[2], int num_trans)
537{
538 char *char_lens[2];
539 char *len_lens[2];
540 double esc[2];
541 CalcCharCounts (discard_word, discard_num, char_lens, len_lens, escape);
542 esc[0] = -log2 (escape[0] / (freqs_trans[0] + escape[0]));
543 esc[1] = -log2 (escape[1] / (freqs_trans[1] + escape[1]));
544 CalcBitCostForWordData(discard_word, discard_num, freqs_trans, escape,
545 char_lens, len_lens, esc, num_trans);
546 CalcBitCostForWordData(keep_word, keep_num, freqs_trans, escape,
547 char_lens, len_lens, esc, num_trans);
548 Xfree (char_lens[0]);
549 Xfree (char_lens[1]);
550 Xfree (len_lens[0]);
551 Xfree (len_lens[1]);
552}
553
554inline void m3_transferWord(DictWordData **toHeap, int *toNum, heap_comp toSaving,
555 DictWordData **fromHeap, int *fromNum, heap_comp fromSaving,
556 int *num_trans, double freqs_trans[], int *mem, double *total,
557 int *count)
558{
559 /* Transfer the word at the top of the keep heap to the
560 discard heap. */
561 DictWordData *word = fromHeap[0];
562 int idx = KIND (word);
563 heap_deletehead (fromHeap, sizeof (word), fromNum, fromSaving);
564 toHeap[*toNum] = word;
565 heap_additem (toHeap, sizeof (word), toNum, toSaving);
566 freqs_trans[idx] -= word->documents();
567 *mem = (*mem) - sizeof (u_char *) + word->word[0];
568 *num_trans += 1;
569 *total += word->saving;
570 *count += 1;
571#ifdef DEBUG
572 fprintf (stderr,
573 "KEEP -> DISCARD %8d :%8d :%8.0f :%8.0f : \"%s\"\n",
574 *mem, word->documents(), word->char_bit_cost,
575 word->saving, word2str (word->word));
576#endif
577}
578
579inline void m3_storeWord(DictWordData **wordHeap, int wordNum, int num_trans,
580 double freqs_trans[], u_long escape[], heap_comp hc)
581{
582 DictWordData *word = wordHeap[0];
583 int idx = KIND (word);
584 float wbc;
585#ifdef DEBUG1
586 fprintf (stderr, "KEEP \"%s\" %.2f ->", word2str (word->word),
587 word->saving);
588#endif
589 wbc = -log2 (word->documents() / (freqs_trans[idx] + escape[idx])) *
590 word->documents();
591 word->saving = (word->char_bit_cost - wbc) /
592 (sizeof (char *) + word->word[0]);
593#ifdef DEBUG1
594 fprintf (stderr, " %.2f\n", word->saving);
595#endif
596 word->num_trans = num_trans;
597 heap_changedhead (wordHeap, sizeof (word), wordNum, hc);
598}
599
600
601static void
602Method3 (int k)
603{
604 int i, keep_num, discard_num, mem, num_trans, recalc_reqd;
605 int keep_to_discard = 0;
606 int discard_to_keep = 0;
607 int recalcs = 0;
608 double freqs_trans[2], total;
609 u_long escape[2];
610 DictWordData **keep_heap, **discard_heap;
611
612 freqs_trans[0] = freqs_trans[1] = 0;
613 num_trans = 0;
614
615 discard_num = Num[0] + Num[1];
616 discard_heap = (DictWordData **) Xmalloc (discard_num * sizeof (DictWordData *));
617
618 keep_num = 0;
619 keep_heap = (DictWordData **) Xmalloc (discard_num * sizeof (DictWordData *));
620
621
622 for (i = 0; (unsigned int)i < Num[0]; i++)
623 discard_heap[i] = Words[0] + i;
624 for (i = 0; (unsigned int)i < Num[1]; i++)
625 discard_heap[i + Num[0]] = Words[1] + i;
626
627 heap_build (discard_heap, sizeof (*discard_heap), discard_num, DecFreqIncWL);
628
629 mem = 0;
630 while (k > mem && discard_num)
631 {
632 DictWordData *word = discard_heap[0];
633 mem += sizeof (u_char *) + word->word[0];
634 heap_deletehead (discard_heap, sizeof (word), &discard_num, DecFreqIncWL);
635 keep_heap[keep_num++] = word;
636 freqs_trans[KIND (word)] += word->documents();
637 num_trans++;
638 }
639
640 CalcBitCost (discard_heap, discard_num, keep_heap, keep_num,
641 freqs_trans, escape, num_trans);
642 heap_build (discard_heap, sizeof (*discard_heap), discard_num, BigSaving);
643 heap_build (keep_heap, sizeof (*keep_heap), keep_num, SmallSaving);
644
645 total = 0;
646 recalc_reqd = 0;
647 while (keep_num && discard_num)
648 {
649 if ((keep_num && keep_heap[0]->num_trans < (unsigned)num_trans) ||
650 (discard_num && discard_heap[0]->num_trans < (unsigned)num_trans))
651 {
652 if (keep_num && keep_heap[0]->num_trans < (unsigned)num_trans)
653 {
654 m3_storeWord(keep_heap, keep_num, num_trans, freqs_trans, escape, SmallSaving);
655 }
656
657 if (discard_num && discard_heap[0]->num_trans < (unsigned)num_trans)
658 {
659 m3_storeWord(discard_heap, discard_num, num_trans, freqs_trans, escape, BigSaving);
660 }
661 }
662 else if (keep_heap[0]->saving < discard_heap[0]->saving)
663 {
664 assert (keep_num && discard_num);
665 if (keep_num && mem + sizeof (char *) + discard_heap[0]->word[0] > (unsigned)k)
666 {
667 /* Transfer the word at the top of the keep heap to the
668 discard heap. */
669 m3_transferWord(discard_heap, &discard_num, BigSaving,
670 keep_heap, &keep_num, SmallSaving,
671 &num_trans, freqs_trans, &mem, &total, &keep_to_discard);
672 }
673 else
674 {
675 /* Transfer the word at the top of the discard heap to the
676 keep heap. */
677 m3_transferWord(keep_heap, &keep_num, SmallSaving,
678 discard_heap, &discard_num, BigSaving,
679 &num_trans, freqs_trans, &mem, &total, &discard_to_keep);
680 }
681
682 recalc_reqd = 1;
683
684 }
685 else
686 {
687 if (recalc_reqd == 0)
688 break;
689#ifdef DEBUG1
690 fprintf (stderr, "--------------\n");
691#endif
692 if (recalcs == MAX_RECALCULATIONS)
693 break;
694 CalcBitCost (discard_heap, discard_num, keep_heap, keep_num,
695 freqs_trans, escape, num_trans);
696 heap_build (discard_heap, sizeof (*discard_heap),
697 discard_num, BigSaving);
698 heap_build (keep_heap, sizeof (keep_heap), keep_num, SmallSaving);
699 recalc_reqd = 0;
700 recalcs++;
701 }
702 }
703
704 Alloc_keep_discard ();
705
706 for (i = 0; i < discard_num; i++)
707 {
708 DictWordData *word = discard_heap[i];
709 int idx = KIND (word);
710 assert (IsWord (word) || IsNonWord (word));
711 discard[idx].wd[discard[idx].num_wds++] = word;
712 }
713 for (i = 0; i < keep_num; i++)
714 {
715 DictWordData *word = keep_heap[i];
716 int idx = KIND (word);
717 assert (IsWord (word) || IsNonWord (word));
718 keep[idx].wd[keep[idx].num_wds++] = word;
719 }
720 SortAndCount_DictData (&keep[0]);
721 SortAndCount_DictData (&keep[1]);
722 SortAndCount_DictData (&discard[0]);
723 SortAndCount_DictData (&discard[1]);
724
725 assert (keep[0].num_wds + discard[0].num_wds == Num[0]);
726 assert (keep[1].num_wds + discard[1].num_wds == Num[1]);
727 Message ("Keep -> Discard : %8d", keep_to_discard);
728 Message ("Discard -> Keep : %8d", discard_to_keep);
729 Message ("Huffman Recalculations : %8d", recalcs);
730 if (recalcs == MAX_RECALCULATIONS)
731 Message ("WARNING: The number of recalculations == %d", MAX_RECALCULATIONS);
732}
733
734
735
736
737
738
739
740
741
742
743/****************************************************************************
744 ***** *****
745 ***** Dictionary saving code *****
746 ***** *****
747 ****************************************************************************/
748
749
750
751static void
752Write_cdh (FILE * f, compression_dict_header * cdh)
753{
754 /* [RPAP - Jan 97: Endian Ordering] */
755 int i;
756 compression_dict_header tmp = *cdh;
757 HTONUL(tmp.dict_type);
758 HTONUL(tmp.novel_method);
759 for (i = 0; i < TEXT_PARAMS; i++)
760 HTONUL(tmp.params[i]);
761 HTONUL(tmp.num_words[0]);
762 HTONUL(tmp.num_words[1]);
763 HTONUL(tmp.num_word_chars[0]);
764 HTONUL(tmp.num_word_chars[1]);
765 HTONUL(tmp.lookback);
766
767 fwrite (&tmp, sizeof (tmp), 1, f);
768}
769
770
771static void
772Write_words (FILE * f, DictData * dd)
773{
774 unsigned int i;
775 u_char *curr, *prev = NULL;
776 for (i = 0; i < dd->num_wds; i++)
777 {
778 int len;
779 curr = dd->wd[i]->word;
780 if (prev)
781 /* look for prefix match with prev string */
782 len = prefixlen (prev, curr);
783 else
784 len = 0;
785 fputc ((len << 4) + (curr[0] - len), f);
786 fwrite (curr + len + 1, sizeof (u_char), curr[0] - len, f);
787 prev = curr;
788 }
789
790}
791
792
793static int
794Uncompressed_size (DictData * dd)
795{
796 unsigned int i, us;
797 for (us = i = 0; i < dd->num_wds; i++)
798 us += dd->wd[i]->word[0];
799 return us;
800}
801
802
803static u_long
804Write_data (FILE * f, DictData * dd, int lookback)
805{
806 u_long mem_reqd;
807 huff_data *hd;
808 int i;
809 u_long us = dd->num_wds;
810 long *freqs;
811 u_long huff_words_size[MAX_HUFFCODE_LEN + 1];
812 u_long lencounts[MAX_HUFFCODE_LEN + 1];
813 u_char *lastword[MAX_HUFFCODE_LEN + 1];
814
815 if (!(freqs = new long [dd->num_wds]))
816 FatalError (1, "Unable to allocate memory for freqs");
817
818 for (i = 0; (unsigned)i < dd->num_wds; i++)
819 {
820 freqs[i] = dd->wd[i]->documents();
821 us += dd->wd[i]->word[0];
822 }
823
824 if (!(hd = Generate_Huffman_Data (dd->num_wds, freqs, NULL, NULL)))
825 FatalError (1, "Unable to allocate memory for huffman data");
826
827 delete (freqs);
828 freqs = NULL;
829
830 if (Write_Huffman_Data (f, hd) == -1)
831 FatalError (1, "Unable to write huffman data");
832
833 HTONUL(us); /* [RPAP - Jan 97: Endian Ordering] */
834 fwrite (&us, sizeof (us), 1, f);
835 NTOHUL(us); /* [RPAP - Jan 97: Endian Ordering] */
836
837
838/* Calculate the amount of memory that will be required to store the text for
839 each different huffman code len. Every 1<<lookback words for each different
840 codelen length will not be prefixed by previous strings. */
841
842
843 bzero ((char *) &huff_words_size, sizeof (huff_words_size));
844 bzero ((char *) &lencounts, sizeof (lencounts));
845
846 mem_reqd = 0;
847
848 for (i = 0; (unsigned)i < dd->num_wds; i++)
849 {
850 int codelen = hd->clens[i];
851 u_char *word = dd->wd[i]->word;
852
853 if (!codelen)
854 FatalError (1, "The length of a code for a word was zero");
855
856 huff_words_size[codelen] += word[0] + 1;
857 mem_reqd += word[0] + (lookback != 0);
858#if 0
859 if ((lencounts[codelen] & ((1 << lookback) - 1)) == 0)
860 lastword[codelen] = word;
861 else
862 huff_words_size[codelen] -= prefixlen (lastword[codelen], word);
863#else
864 if ((lencounts[codelen] & ((1 << lookback) - 1)) != 0)
865 {
866 int save = prefixlen (lastword[codelen], word);
867 mem_reqd -= save;
868 huff_words_size[codelen] -= save;
869 }
870 else
871 {
872 mem_reqd += sizeof (u_char *);
873 }
874 lastword[codelen] = word;
875#endif
876 lencounts[codelen]++;
877 }
878
879 /* [RPAP - Jan 97: Endian Ordering] */
880 for (i = hd->mincodelen; i < hd->maxcodelen + 1; i++)
881 HTONUL(huff_words_size[i]);
882
883 fwrite (huff_words_size + hd->mincodelen, sizeof (*huff_words_size),
884 hd->maxcodelen - hd->mincodelen + 1, f);
885
886 /* [RPAP - Jan 97: Endian Ordering] */
887 for (i = hd->mincodelen; i < hd->maxcodelen + 1; i++)
888 NTOHUL(huff_words_size[i]);
889
890 Write_words (f, dd);
891
892 delete hd->clens;
893 delete hd;
894
895 return mem_reqd;
896}
897
898
899
900static void
901Write_charfreqs (FILE * f, DictData * dd, int words,
902 int zero_freq_permitted)
903{
904 unsigned int j;
905 long freqs[256];
906 DictWordData **wd = dd->wd;
907 huff_data *hd;
908
909 bzero ((char *) freqs, sizeof (freqs));
910
911 for (j = 0; j < dd->num_wds; j++, wd++)
912 {
913 u_char *buf = (*wd)->word;
914 int len = *buf++;
915 for (; len; len--, buf++)
916 freqs[(u_long) (*buf)] += (*wd)->documents();
917 }
918
919 if (!zero_freq_permitted)
920 for (j = 0; j < 256; j++)
921 if (!freqs[j] && PESINAWORD (j) == words)
922 freqs[j] = 1;
923
924 if (!(hd = Generate_Huffman_Data (256, freqs, NULL, NULL)))
925 FatalError (1, "Unable to allocate memory for huffman data");
926
927 if (Write_Huffman_Data (f, hd) == -1)
928 FatalError (1, "Unable to write huffman data");
929
930 delete hd->clens;
931 delete hd;
932}
933
934
935
936
937static void
938Write_wordlens (FILE * f, DictData * dd, int zero_freq_permitted)
939{
940 unsigned int j;
941 long freqs[16];
942 DictWordData **wd = dd->wd;
943 huff_data *hd;
944
945 bzero ((char *) freqs, sizeof (freqs));
946
947 for (j = 0; j < dd->num_wds; j++, wd++)
948 freqs[(*wd)->word[0]] += (*wd)->documents();
949
950 if (!zero_freq_permitted)
951 for (j = 0; j < 16; j++)
952 if (!freqs[j])
953 freqs[j] = 1;
954
955 if (!(hd = Generate_Huffman_Data (16, freqs, NULL, NULL)))
956 FatalError (1, "Unable to allocate memory for huffman data");
957
958 if (Write_Huffman_Data (f, hd) == -1)
959 FatalError (1, "Unable to write huffman data");
960
961
962 delete hd->clens;
963 delete hd;
964}
965
966
967
968static u_long
969WriteOutWords (char *file_name, u_long type, int lookback)
970{
971 FILE *f;
972 int i;
973 u_long mem_reqd = 0;
974
975 compression_dict_header cdh;
976 f = create_file (file_name, TEXT_DICT_SUFFIX, "w+b",
977 MAGIC_DICT, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
978
979 bzero((char *) &cdh, sizeof(cdh));
980
981 cdh.dict_type = type;
982 cdh.novel_method = (type == MG_SEED_DICTIONARY) ? novel_method :
983 MG_NOVEL_HUFFMAN_CHARS;
984
985 cdh.num_words[1] = keep[1].num_wds;
986 cdh.num_words[0] = keep[0].num_wds;
987 cdh.num_word_chars[1] = Uncompressed_size (&keep[1]);
988 cdh.num_word_chars[0] = Uncompressed_size (&keep[0]);
989 cdh.lookback = lookback;
990
991 Write_cdh (f, &cdh);
992
993 for (i = 0; i < 2; i++)
994 switch (type)
995 {
996 case MG_COMPLETE_DICTIONARY:
997 {
998 mem_reqd += Write_data (f, &keep[i], lookback);
999 }
1000 break;
1001 case MG_PARTIAL_DICTIONARY:
1002 {
1003 if (keep[i].num_wds)
1004 {
1005 int j;
1006 DictWordData esc;
1007 esc.docCount = 0;
1008 esc.word = (u_char *) "";
1009 keep[i].wd[keep[i].num_wds++] = &esc;
1010 for (j = 0; (unsigned)j < discard[i].num_wds; j++)
1011 esc.docCount += discard[i].wd[j]->documents();
1012 if (!esc.docCount)
1013 esc.docCount++;
1014 mem_reqd += Write_data (f, &keep[i], lookback);
1015 }
1016 Write_charfreqs (f, &discard[i], i, 1);
1017 Write_wordlens (f, &discard[i], 1);
1018 }
1019 break;
1020 case MG_SEED_DICTIONARY:
1021 {
1022 if (keep[i].num_wds)
1023 {
1024 int j;
1025 DictWordData esc;
1026 esc.docCount = 0;
1027 esc.word = (u_char *) "";
1028 keep[i].wd[keep[i].num_wds++] = &esc;
1029 for (j = 0; (unsigned)j < all[i].num_wds; j++)
1030 if (all[i].wd[j]->documents() == 1)
1031 esc.docCount++;
1032 if (!esc.docCount)
1033 esc.docCount++;
1034 mem_reqd += Write_data (f, &keep[i], lookback);
1035 }
1036 switch (novel_method)
1037 {
1038 case MG_NOVEL_HUFFMAN_CHARS:
1039 Write_charfreqs (f, &all[i], i, 0);
1040 Write_wordlens (f, &all[i], 0);
1041 break;
1042 case MG_NOVEL_DELTA:
1043 break;
1044 case MG_NOVEL_HYBRID:
1045 break;
1046 default:
1047 FatalError (1, "Bad novel method");
1048 }
1049 }
1050 break;
1051 }
1052 fclose (f);
1053 return mem_reqd;
1054}
Note: See TracBrowser for help on using the repository browser.