source: trunk/gsdl/packages/mg/src/text/mg_compression_dict.c@ 455

Last change on this file since 455 was 439, checked in by sjboddie, 25 years ago

renamed mg-1.3d directory mg

  • Property svn:keywords set to Author Date Id Revision
File size: 26.6 KB
Line 
1/**************************************************************************
2 *
3 * mg_compression_dict.c -- Routines for reading the compression dictionary
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: mg_compression_dict.c 439 1999-08-10 21:23:37Z sjboddie $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25
26#include "memlib.h"
27#include "messages.h"
28#include "local_strings.h"
29#include "bitio_gen.h"
30#include "bitio_m.h"
31#include "bitio_m_stdio.h"
32#include "timing.h"
33#include "mgheap.h"
34#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
35
36#include "mg_files.h"
37#include "locallib.h"
38#include "invf.h"
39#include "text.h"
40#include "words.h"
41#include "mg.h"
42
43#define MAXBITS (sizeof(unsigned long) * 8)
44
45/*
46 $Log$
47 Revision 1.1 1999/08/10 21:18:04 sjboddie
48 renamed mg-1.3d directory mg
49
50 Revision 1.4 1999/01/15 03:05:51 rjmcnab
51
52 Renamed lib/heap to be lib/mgheap (it was causing some problems with
53 some versions of the STL libraries which contained a heap.h)
54
55 Revision 1.3 1998/12/17 09:12:52 rjmcnab
56
57 Altered mg to process utf-8 encoded Unicode. The main changes
58 are in the parsing of the input, the casefolding, and the stemming.
59
60 Revision 1.2 1998/11/25 07:55:44 rjmcnab
61
62 Modified mg to that you can specify the stemmer you want
63 to use via a command line option. You specify it to
64 mg_passes during the build process. The number of the
65 stemmer that you used is stored within the inverted
66 dictionary header and the stemmed dictionary header so
67 the correct stemmer is used in later stages of building
68 and querying.
69
70 Revision 1.1 1998/11/17 09:34:52 rjmcnab
71 *** empty log message ***
72
73 * Revision 1.3 1994/10/20 03:56:54 tes
74 * I have rewritten the boolean query optimiser and abstracted out the
75 * components of the boolean query.
76 *
77 * Revision 1.2 1994/09/20 04:41:45 tes
78 * For version 1.1
79 *
80 */
81
82static char *RCSID = "$Id: mg_compression_dict.c 439 1999-08-10 21:23:37Z sjboddie $";
83
84/* #define DEBUG1 */
85
86/* #define DEBUG */
87
88#define is_power_of_two(a) ((a) != 0 && (((a) & ((a)-1)) == 0))
89
90#define MAX_RECALCULATIONS 100
91
92typedef struct WordData
93 {
94 u_long freq, occur_num;
95 float saving;
96 float char_bit_cost;
97 u_long num_trans;
98 u_char *word;
99 }
100WordData;
101
102static WordData *Words[2];
103static u_long Num[2];
104static u_long chars[2];
105
106#define KIND(p) (((p) >= Words[0] && (p) < Words[0] + Num[0]) ? 0 : 1)
107#define IsWord(p) (((p) >= Words[1] && (p) < Words[1] + Num[1]) ? 1 : 0)
108#define IsNonWord(p) (((p) >= Words[0] && (p) < Words[0] + Num[0]) ? 1 : 0)
109
110
111typedef struct DictData
112 {
113 WordData **wd;
114 u_long num_wds;
115 u_long chars;
116 }
117DictData;
118
119typedef DictData DictInfo[2];
120
121static DictInfo keep, discard, all;
122static compression_stats_header csh;
123
124static char *file_name = "";
125static u_long novel_method = MG_NOVEL_HUFFMAN_CHARS;
126
127
128static void ReadInWords (char *);
129static void Select_on (int k, heap_comp hc);
130static void Method3 (int k);
131static void Select_all (void);
132static u_long WriteOutWords (char *, u_long, int);
133static int DecFreqIncWL (void *a, void *b);
134static int OccuranceOrder (void *a, void *b);
135
136
137
138int main (int argc, char **argv)
139{
140 ProgTime StartTime;
141 int ch;
142 char type = 'C';
143 char mode = '0';
144 int lookback = 2;
145 double k = 0;
146 u_long mem_reqd;
147 opterr = 0;
148 msg_prefix = argv[0];
149 while ((ch = getopt (argc, argv, "0123CPSf:d:l:hk:HBDYM")) != -1)
150 switch (ch)
151 {
152 case 'H':
153 novel_method = MG_NOVEL_HUFFMAN_CHARS;
154 break;
155 case 'B':
156 novel_method = MG_NOVEL_BINARY;
157 break;
158 case 'D':
159 novel_method = MG_NOVEL_DELTA;
160 break;
161 case 'Y':
162 novel_method = MG_NOVEL_HYBRID;
163 break;
164 case 'M':
165 novel_method = MG_NOVEL_HYBRID_MTF;
166 break;
167 case 'f': /* input file */
168 file_name = optarg;
169 break;
170 case 'd':
171 set_basepath (optarg);
172 break;
173 case 'C':
174 case 'P':
175 case 'S':
176 type = ch;
177 break;
178 case '0':
179 case '1':
180 case '2':
181 case '3':
182 mode = ch;
183 break;
184 case 'l':
185 lookback = atoi (optarg);
186 if (!is_power_of_two (lookback))
187 FatalError (1, "The lookback value must be a power of 2");
188 lookback = floorlog_2 (lookback);
189 break;
190 case 'k':
191 k = atof (optarg) * 1024;
192 break;
193 case 'h':
194 case '?':
195 fprintf (stderr, "usage: %s [-l lookback] [-f input_file]"
196 "[-d data directory] [-h] [-0|-1|-2|-3] [-C|-P|-S] [-k mem (Kb)]\n",
197 argv[0]);
198 exit (1);
199 }
200 GetTime (&StartTime);
201
202 ReadInWords (file_name);
203
204 if (type == 'C')
205 {
206 Select_all ();
207 mem_reqd = WriteOutWords (file_name, MG_COMPLETE_DICTIONARY, lookback);
208 }
209 else
210 {
211 switch (mode)
212 {
213 case '0':
214 Select_all ();
215 break;
216 case '1':
217 Message ("Dictionary limit of %.2f Kb", k / 1024);
218 Select_on ((int) k, OccuranceOrder);
219 break;
220 case '2':
221 Message ("Dictionary limit of %.2f Kb", k / 1024);
222 Select_on ((int) k, DecFreqIncWL);
223 break;
224 case '3':
225 Message ("Dictionary limit of %.2f Kb", k / 1024);
226 Method3 ((int) k);
227 break;
228 }
229 if (type == 'P')
230 {
231 mem_reqd = WriteOutWords (file_name, MG_PARTIAL_DICTIONARY, lookback);
232 }
233 else
234 {
235 mem_reqd = WriteOutWords (file_name, MG_SEED_DICTIONARY, lookback);
236 }
237 }
238
239 Message ("Num words : %8u -> %8u\n", Num[1], keep[1].num_wds);
240 Message ("Num non-words : %8u -> %8u\n", Num[0], keep[0].num_wds);
241 Message ("Chars of words : %8u -> %8u\n", chars[1], keep[1].chars);
242 Message ("Chars of non-words : %8u -> %8u\n", chars[0], keep[0].chars);
243 Message ("Mem usage : %8u -> %8u\n",
244 (Num[0] + Num[1]) * sizeof (char *) + chars[0] + chars[1],
245 (keep[0].num_wds + keep[1].num_wds) * sizeof (char *) +
246 keep[0].chars + keep[1].chars);
247 Message ("Actual mem required : %8u\n", mem_reqd);
248
249 Message ("%s", ElapsedTime (&StartTime, NULL));
250 return 0;
251}
252
253
254
255
256static void
257ReadInWords (char *filename)
258{
259 FILE *f;
260 unsigned long i;
261 f = open_file (filename, TEXT_STATS_DICT_SUFFIX, "rb",
262 MAGIC_STATS_DICT, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
263
264 fread (&csh, sizeof (csh), 1, f);
265
266 /* [RPAP - Jan 97: Endian Ordering] */
267 NTOHUL(csh.num_docs);
268 NTOHUL(csh.num_bytes);
269
270 for (i = 0; i < 2; i++)
271 {
272 frags_stats_header fsh;
273 char *buf;
274 WordData *wd;
275 int j;
276 chars[i] = 0;
277
278 fread (&fsh, sizeof (fsh), 1, f);
279 /* [RPAP - Jan 97: Endian Ordering] */
280 NTOHUL(fsh.num_frags);
281 NTOHUL(fsh.mem_for_frags);
282
283 Num[i] = all[i].num_wds = fsh.num_frags;
284
285 /* The +1 on the following line is to leave room for the esc code. */
286 all[i].wd = Xmalloc (sizeof (WordData *) * Num[i] + 1);
287
288 buf = Xmalloc (fsh.mem_for_frags);
289 wd = Words[i] = Xmalloc (sizeof (WordData) * Num[i]);
290 for (j = 0; j < Num[i]; j++, wd++)
291 {
292 int len;
293 fread (&wd->freq, sizeof (wd->freq), 1, f);
294 NTOHUL(wd->freq); /* [RPAP - Jan 97: Endian Ordering] */
295 fread (&wd->occur_num, sizeof (wd->occur_num), 1, f);
296 NTOHUL(wd->occur_num); /* [RPAP - Jan 97: Endian Ordering] */
297 len = fgetc (f);
298 wd->word = (u_char *) buf;
299 *buf++ = len;
300 fread (buf, len, 1, f);
301 buf += len;
302 all[i].wd[j] = wd;
303 }
304 chars[i] = fsh.mem_for_frags - fsh.num_frags;
305 }
306 fclose (f);
307}
308
309
310static void
311Alloc_keep_discard (void)
312{
313 keep[0].num_wds = 0;
314 keep[0].wd = Xmalloc ((Num[0] + 1) * sizeof (WordData *));
315 keep[1].num_wds = 0;
316 keep[1].wd = Xmalloc ((Num[1] + 1) * sizeof (WordData *));
317 discard[0].num_wds = 0;
318 discard[0].wd = Xmalloc ((Num[0] + 1) * sizeof (WordData *));
319 discard[1].num_wds = 0;
320 discard[1].wd = Xmalloc ((Num[1] + 1) * sizeof (WordData *));
321}
322
323
324static int
325sort_comp (const void *a, const void *b)
326{
327 WordData *aa = *(WordData **) a;
328 WordData *bb = *(WordData **) b;
329 return casecompare (aa->word, bb->word); /* [RPAP - Jan 97: Stem Index Change] */
330}
331
332
333
334static void
335SortAndCount_DictData (DictData * dd)
336{
337 int i;
338 WordData **wd;
339 qsort (dd->wd, dd->num_wds, sizeof (WordData *), sort_comp);
340 dd->chars = 0;
341 wd = dd->wd;
342 for (i = 0; i < dd->num_wds; i++, wd++)
343 dd->chars += (*wd)->word[0];
344}
345
346
347static void
348Select_all (void)
349{
350 int i;
351 Alloc_keep_discard ();
352 keep[0].num_wds = Num[0];
353 for (i = 0; i < Num[0]; i++)
354 keep[0].wd[i] = Words[0] + i;
355 keep[1].num_wds = Num[1];
356 for (i = 0; i < Num[1]; i++)
357 keep[1].wd[i] = Words[1] + i;
358 SortAndCount_DictData (&keep[0]);
359 SortAndCount_DictData (&keep[1]);
360}
361
362
363
364
365static int
366DecFreqIncWL (void *a, void *b)
367{
368 WordData *aa = *(WordData **) a;
369 WordData *bb = *(WordData **) b;
370 if (aa->freq > bb->freq)
371 return -1;
372 else if (aa->freq < bb->freq)
373 return 1;
374 else
375 return bb->word[0] - aa->word[0];
376}
377
378
379static int
380OccuranceOrder (void *a, void *b)
381{
382 WordData *aa = *(WordData **) a;
383 WordData *bb = *(WordData **) b;
384 if (aa->occur_num > bb->occur_num)
385 return 1;
386 else if (aa->occur_num < bb->occur_num)
387 return -1;
388 else
389 return 0;
390}
391
392
393static void
394Select_on (int k, heap_comp hc)
395{
396 int i, num, mem;
397 WordData **wd;
398
399 Alloc_keep_discard ();
400
401 num = Num[0] + Num[1];
402 wd = Xmalloc (num * sizeof (WordData *));
403 for (i = 0; i < Num[0]; i++)
404 wd[i] = Words[0] + i;
405 for (i = 0; i < Num[1]; i++)
406 wd[i + Num[0]] = Words[1] + i;
407
408 heap_build (wd, sizeof (*wd), num, hc);
409
410 mem = 0;
411 while (k > mem && num)
412 {
413 int idx;
414 WordData *word = wd[0];
415#ifdef DEBUG
416 fprintf (stderr, "%4d:%4d:%8d :%8d :%8d : \"%s\"\n",
417 keep[0].num_wds, keep[1].num_wds,
418 mem, word->freq, word->occur_num, word2str (word->word));
419#endif
420 mem += sizeof (u_char *) + word->word[0];
421 heap_deletehead (wd, sizeof (*wd), &num, hc);
422 idx = KIND (word);
423 keep[idx].wd[keep[idx].num_wds++] = word;
424 }
425
426 for (i = 0; i < num; i++)
427 {
428 WordData *word = wd[i];
429 int idx = KIND (word);
430 discard[idx].wd[discard[idx].num_wds++] = word;
431 }
432 SortAndCount_DictData (&keep[0]);
433 SortAndCount_DictData (&keep[1]);
434 SortAndCount_DictData (&discard[0]);
435 SortAndCount_DictData (&discard[1]);
436
437 assert (keep[0].num_wds + discard[0].num_wds == Num[0]);
438 assert (keep[1].num_wds + discard[1].num_wds == Num[1]);
439
440}
441
442
443
444static int
445BigSaving (void *a, void *b)
446{
447 WordData *aa = *(WordData **) a;
448 WordData *bb = *(WordData **) b;
449 return (aa->saving > bb->saving) ? -1 : (aa->saving < bb->saving);
450}
451
452static int
453SmallSaving (void *a, void *b)
454{
455 WordData *aa = *(WordData **) a;
456 WordData *bb = *(WordData **) b;
457 return (aa->saving < bb->saving) ? -1 : (aa->saving > bb->saving);
458}
459
460
461static void
462CalcCharCounts (WordData ** wd, int num,
463 char *char_lens[2], char *len_lens[2],
464 u_long escape[2])
465{
466 long char_freqs[2][256];
467 long len_freqs[2][16];
468 huff_data hd;
469 int i;
470 escape[0] = 0;
471 escape[1] = 0;
472 bzero ((char *) char_freqs, sizeof (char_freqs));
473 bzero ((char *) len_freqs, sizeof (len_freqs));
474 for (i = 0; i < num; i++, wd++)
475 {
476 u_long freq = (*wd)->freq;
477 u_char *buf = (*wd)->word;
478 int len = *buf++;
479 int idx = KIND (*wd);
480 len_freqs[idx][len] += freq;
481 escape[idx] += freq;
482 for (; len; len--, buf++)
483 char_freqs[idx][(u_long) (*buf)] += freq;
484 }
485 Generate_Huffman_Data (256, char_freqs[0], &hd, NULL);
486 char_lens[0] = hd.clens;
487 Generate_Huffman_Data (256, char_freqs[1], &hd, NULL);
488 char_lens[1] = hd.clens;
489
490 Generate_Huffman_Data (16, len_freqs[0], &hd, NULL);
491 len_lens[0] = hd.clens;
492 Generate_Huffman_Data (16, len_freqs[1], &hd, NULL);
493 len_lens[1] = hd.clens;
494}
495
496
497
498
499
500
501
502
503
504void
505CalcBitCost (WordData ** discard_word, int discard_num,
506 WordData ** keep_word, int keep_num, double freqs_trans[2],
507 u_long escape[2], int num_trans)
508{
509 char *char_lens[2];
510 char *len_lens[2];
511 double esc[2];
512 int j;
513 CalcCharCounts (discard_word, discard_num, char_lens, len_lens, escape);
514 esc[0] = -log2 (escape[0] / (freqs_trans[0] + escape[0]));
515 esc[1] = -log2 (escape[1] / (freqs_trans[1] + escape[1]));
516 for (j = 0; j < discard_num; j++, discard_word++)
517 {
518 float cbc, wbc;
519 u_char *buf = (*discard_word)->word;
520 int len = *buf++;
521 u_long freq = (*discard_word)->freq;
522 int idx = KIND (*discard_word);
523
524 cbc = len_lens[idx][len];
525 for (; len; len--, buf++)
526 cbc += char_lens[idx][(u_long) (*buf)];
527
528 (*discard_word)->char_bit_cost = (cbc + esc[idx]) * freq;
529
530 wbc = -log2 (freq / (freqs_trans[idx] + escape[idx])) * freq;
531
532 (*discard_word)->saving = ((*discard_word)->char_bit_cost - wbc) /
533 (sizeof (char *) + (*discard_word)->word[0]);
534
535 (*discard_word)->num_trans = num_trans;
536 }
537 for (j = 0; j < keep_num; j++, keep_word++)
538 {
539 float cbc, wbc;
540 u_char *buf = (*keep_word)->word;
541 int len = *buf++;
542 u_long freq = (*keep_word)->freq;
543 int idx = KIND (*keep_word);
544
545 cbc = len_lens[idx][len];
546 for (; len; len--, buf++)
547 cbc += char_lens[idx][(u_long) (*buf)];
548
549 (*keep_word)->char_bit_cost = (cbc + esc[idx]) * freq;
550
551 wbc = -log2 (freq / (freqs_trans[idx] + escape[idx])) * freq;
552
553 (*keep_word)->saving = ((*keep_word)->char_bit_cost - wbc) /
554 (sizeof (char *) + (*keep_word)->word[0]);
555
556 (*keep_word)->num_trans = num_trans;
557 }
558 Xfree (char_lens[0]);
559 Xfree (char_lens[1]);
560 Xfree (len_lens[0]);
561 Xfree (len_lens[1]);
562}
563
564
565
566static void
567Method3 (int k)
568{
569 int i, keep_num, discard_num, mem, num_trans, recalc_reqd;
570 int keep_to_discard = 0;
571 int discard_to_keep = 0;
572 int recalcs = 0;
573 double freqs_trans[2], total;
574 u_long escape[2];
575 WordData **keep_heap, **discard_heap;
576
577 freqs_trans[0] = freqs_trans[1] = 0;
578 num_trans = 0;
579
580 discard_num = Num[0] + Num[1];
581 discard_heap = Xmalloc (discard_num * sizeof (WordData *));
582
583 keep_num = 0;
584 keep_heap = Xmalloc (discard_num * sizeof (WordData *));
585
586
587 for (i = 0; i < Num[0]; i++)
588 discard_heap[i] = Words[0] + i;
589 for (i = 0; i < Num[1]; i++)
590 discard_heap[i + Num[0]] = Words[1] + i;
591
592 heap_build (discard_heap, sizeof (*discard_heap), discard_num, DecFreqIncWL);
593
594 mem = 0;
595 while (k > mem && discard_num)
596 {
597 WordData *word = discard_heap[0];
598 mem += sizeof (u_char *) + word->word[0];
599 heap_deletehead (discard_heap, sizeof (word), &discard_num, DecFreqIncWL);
600 keep_heap[keep_num++] = word;
601 freqs_trans[KIND (word)] += word->freq;
602 num_trans++;
603 }
604
605 CalcBitCost (discard_heap, discard_num, keep_heap, keep_num,
606 freqs_trans, escape, num_trans);
607 heap_build (discard_heap, sizeof (*discard_heap), discard_num, BigSaving);
608 heap_build (keep_heap, sizeof (*keep_heap), keep_num, SmallSaving);
609
610 total = 0;
611 recalc_reqd = 0;
612 while (keep_num && discard_num)
613 {
614 if ((keep_num && keep_heap[0]->num_trans < num_trans) ||
615 (discard_num && discard_heap[0]->num_trans < num_trans))
616 {
617 if (keep_num && keep_heap[0]->num_trans < num_trans)
618 {
619 WordData *word = keep_heap[0];
620 int idx = KIND (word);
621 float wbc;
622#ifdef DEBUG1
623 fprintf (stderr, "KEEP \"%s\" %.2f ->", word2str (word->word),
624 word->saving);
625#endif
626 wbc = -log2 (word->freq / (freqs_trans[idx] + escape[idx])) *
627 word->freq;
628 word->saving = (word->char_bit_cost - wbc) /
629 (sizeof (char *) + word->word[0]);
630#ifdef DEBUG1
631 fprintf (stderr, " %.2f\n", word->saving);
632#endif
633 word->num_trans = num_trans;
634 heap_changedhead (keep_heap, sizeof (word), keep_num, SmallSaving);
635 }
636
637 if (discard_num && discard_heap[0]->num_trans < num_trans)
638 {
639 WordData *word = discard_heap[0];
640 int idx = KIND (word);
641 float wbc;
642#ifdef DEBUG1
643 fprintf (stderr, "DISCARD \"%s\" %.2f ->", word2str (word->word),
644 word->saving);
645#endif
646 wbc = -log2 (word->freq / (freqs_trans[idx] + escape[idx])) *
647 word->freq;
648 word->saving = (word->char_bit_cost - wbc) /
649 (sizeof (char *) + word->word[0]);
650#ifdef DEBUG1
651 fprintf (stderr, " %.2f\n", word->saving);
652#endif
653 word->num_trans = num_trans;
654 heap_changedhead (discard_heap, sizeof (word),
655 discard_num, BigSaving);
656 }
657 }
658 else if (keep_heap[0]->saving < discard_heap[0]->saving)
659 {
660 assert (keep_num && discard_num);
661 if (keep_num && mem + sizeof (char *) + discard_heap[0]->word[0] > k)
662 {
663 /* Transfer the word at the top of the keep heap to the
664 discard heap. */
665 WordData *word = keep_heap[0];
666 int idx = KIND (word);
667 heap_deletehead (keep_heap, sizeof (word), &keep_num, SmallSaving);
668 discard_heap[discard_num] = word;
669 heap_additem (discard_heap, sizeof (word), &discard_num,
670 BigSaving);
671 freqs_trans[idx] -= word->freq;
672 mem -= sizeof (u_char *) + word->word[0];
673 num_trans++;
674 total += word->saving;
675 keep_to_discard++;
676#ifdef DEBUG
677 fprintf (stderr,
678 "KEEP -> DISCARD %8d :%8d :%8.0f :%8.0f : \"%s\"\n",
679 mem, word->freq, word->char_bit_cost,
680 word->saving, word2str (word->word));
681#endif
682 }
683 else
684 {
685 /* Transfer the word at the top of the discard heap to the
686 keep heap. */
687 WordData *word = discard_heap[0];
688 int idx = KIND (word);
689 heap_deletehead (discard_heap, sizeof (word),
690 &discard_num, BigSaving);
691 keep_heap[keep_num] = word;
692 heap_additem (keep_heap, sizeof (word), &keep_num, SmallSaving);
693 freqs_trans[idx] += word->freq;
694 mem += sizeof (u_char *) + word->word[0];
695 num_trans++;
696 total += word->saving;
697 discard_to_keep++;
698#ifdef DEBUG
699 fprintf (stderr,
700 "DISCARD -> KEEP %8d :%8d :%8.0f :%8.0f : \"%s\"\n",
701 mem, word->freq, word->char_bit_cost,
702 word->saving, word2str (word->word));
703#endif
704 }
705
706 recalc_reqd = 1;
707
708 }
709 else
710 {
711 if (recalc_reqd == 0)
712 break;
713#ifdef DEBUG1
714 fprintf (stderr, "--------------\n");
715#endif
716 if (recalcs == MAX_RECALCULATIONS)
717 break;
718 CalcBitCost (discard_heap, discard_num, keep_heap, keep_num,
719 freqs_trans, escape, num_trans);
720 heap_build (discard_heap, sizeof (*discard_heap),
721 discard_num, BigSaving);
722 heap_build (keep_heap, sizeof (keep_heap), keep_num, SmallSaving);
723 recalc_reqd = 0;
724 recalcs++;
725 }
726 }
727
728 Alloc_keep_discard ();
729
730 for (i = 0; i < discard_num; i++)
731 {
732 WordData *word = discard_heap[i];
733 int idx = KIND (word);
734 assert (IsWord (word) || IsNonWord (word));
735 discard[idx].wd[discard[idx].num_wds++] = word;
736 }
737 for (i = 0; i < keep_num; i++)
738 {
739 WordData *word = keep_heap[i];
740 int idx = KIND (word);
741 assert (IsWord (word) || IsNonWord (word));
742 keep[idx].wd[keep[idx].num_wds++] = word;
743 }
744 SortAndCount_DictData (&keep[0]);
745 SortAndCount_DictData (&keep[1]);
746 SortAndCount_DictData (&discard[0]);
747 SortAndCount_DictData (&discard[1]);
748
749 assert (keep[0].num_wds + discard[0].num_wds == Num[0]);
750 assert (keep[1].num_wds + discard[1].num_wds == Num[1]);
751 Message ("Keep -> Discard : %8d", keep_to_discard);
752 Message ("Discard -> Keep : %8d", discard_to_keep);
753 Message ("Huffman Recalculations : %8d", recalcs);
754 if (recalcs == MAX_RECALCULATIONS)
755 Message ("WARNING: The number of recalculations == %d", MAX_RECALCULATIONS);
756}
757
758
759
760
761
762
763
764
765
766
767/****************************************************************************
768 ***** *****
769 ***** Dictionary saving code *****
770 ***** *****
771 ****************************************************************************/
772
773
774
775static void
776Write_cdh (FILE * f, compression_dict_header * cdh)
777{
778 /* [RPAP - Jan 97: Endian Ordering] */
779 int i;
780 compression_dict_header tmp = *cdh;
781 HTONUL(tmp.dict_type);
782 HTONUL(tmp.novel_method);
783 for (i = 0; i < TEXT_PARAMS; i++)
784 HTONUL(tmp.params[i]);
785 HTONUL(tmp.num_words[0]);
786 HTONUL(tmp.num_words[1]);
787 HTONUL(tmp.num_word_chars[0]);
788 HTONUL(tmp.num_word_chars[1]);
789 HTONUL(tmp.lookback);
790
791 fwrite (&tmp, sizeof (tmp), 1, f);
792}
793
794
795static void
796Write_words (FILE * f, DictData * dd)
797{
798 int i;
799 u_char *curr, *prev = NULL;
800 for (i = 0; i < dd->num_wds; i++)
801 {
802 int len;
803 curr = dd->wd[i]->word;
804 if (prev)
805 /* look for prefix match with prev string */
806 len = prefixlen (prev, curr);
807 else
808 len = 0;
809 fputc ((len << 4) + (curr[0] - len), f);
810 fwrite (curr + len + 1, sizeof (u_char), curr[0] - len, f);
811 prev = curr;
812 }
813
814}
815
816
817static int
818Uncompressed_size (DictData * dd)
819{
820 int i, us;
821 for (us = i = 0; i < dd->num_wds; i++)
822 us += dd->wd[i]->word[0];
823 return us;
824}
825
826
827static u_long
828Write_data (FILE * f, DictData * dd, int lookback)
829{
830 u_long mem_reqd;
831 huff_data *hd;
832 int i;
833 u_long us = dd->num_wds;
834 long *freqs;
835 u_long huff_words_size[MAX_HUFFCODE_LEN + 1];
836 u_long lencounts[MAX_HUFFCODE_LEN + 1];
837 u_char *lastword[MAX_HUFFCODE_LEN + 1];
838
839 if (!(freqs = Xmalloc ((dd->num_wds) * sizeof (*freqs))))
840 FatalError (1, "Unable to allocate memory for freqs");
841
842 for (i = 0; i < dd->num_wds; i++)
843 {
844 freqs[i] = dd->wd[i]->freq;
845 us += dd->wd[i]->word[0];
846 }
847
848 if (!(hd = Generate_Huffman_Data (dd->num_wds, freqs, NULL, NULL)))
849 FatalError (1, "Unable to allocate memory for huffman data");
850
851 Xfree (freqs), freqs = NULL;
852
853 if (Write_Huffman_Data (f, hd) == -1)
854 FatalError (1, "Unable to write huffman data");
855
856 HTONUL(us); /* [RPAP - Jan 97: Endian Ordering] */
857 fwrite (&us, sizeof (us), 1, f);
858 NTOHUL(us); /* [RPAP - Jan 97: Endian Ordering] */
859
860
861/* Calculate the amount of memory that will be required to store the text for
862 each different huffman code len. Every 1<<lookback words for each different
863 codelen length will not be prefixed by previous strings. */
864
865
866 bzero ((char *) &huff_words_size, sizeof (huff_words_size));
867 bzero ((char *) &lencounts, sizeof (lencounts));
868
869 mem_reqd = 0;
870
871 for (i = 0; i < dd->num_wds; i++)
872 {
873 int codelen = hd->clens[i];
874 u_char *word = dd->wd[i]->word;
875
876 if (!codelen)
877 FatalError (1, "The length of a code for a word was zero");
878
879 huff_words_size[codelen] += word[0] + 1;
880 mem_reqd += word[0] + (lookback != 0);
881#if 0
882 if ((lencounts[codelen] & ((1 << lookback) - 1)) == 0)
883 lastword[codelen] = word;
884 else
885 huff_words_size[codelen] -= prefixlen (lastword[codelen], word);
886#else
887 if ((lencounts[codelen] & ((1 << lookback) - 1)) != 0)
888 {
889 int save = prefixlen (lastword[codelen], word);
890 mem_reqd -= save;
891 huff_words_size[codelen] -= save;
892 }
893 else
894 {
895 mem_reqd += sizeof (u_char *);
896 }
897 lastword[codelen] = word;
898#endif
899 lencounts[codelen]++;
900 }
901
902 /* [RPAP - Jan 97: Endian Ordering] */
903 for (i = hd->mincodelen; i < hd->maxcodelen + 1; i++)
904 HTONUL(huff_words_size[i]);
905
906 fwrite (huff_words_size + hd->mincodelen, sizeof (*huff_words_size),
907 hd->maxcodelen - hd->mincodelen + 1, f);
908
909 /* [RPAP - Jan 97: Endian Ordering] */
910 for (i = hd->mincodelen; i < hd->maxcodelen + 1; i++)
911 NTOHUL(huff_words_size[i]);
912
913 Write_words (f, dd);
914
915 Xfree (hd->clens);
916 Xfree (hd);
917
918 return mem_reqd;
919}
920
921
922
923static void
924Write_charfreqs (FILE * f, DictData * dd, int words,
925 int zero_freq_permitted)
926{
927 int j;
928 long freqs[256];
929 WordData **wd = dd->wd;
930 huff_data *hd;
931
932 bzero ((char *) freqs, sizeof (freqs));
933
934 for (j = 0; j < dd->num_wds; j++, wd++)
935 {
936 u_char *buf = (*wd)->word;
937 int len = *buf++;
938 for (; len; len--, buf++)
939 freqs[(u_long) (*buf)] += (*wd)->freq;
940 }
941
942 if (!zero_freq_permitted)
943 for (j = 0; j < 256; j++)
944 if (!freqs[j] && PESINAWORD (j) == words)
945 freqs[j] = 1;
946
947 if (!(hd = Generate_Huffman_Data (256, freqs, NULL, NULL)))
948 FatalError (1, "Unable to allocate memory for huffman data");
949
950 if (Write_Huffman_Data (f, hd) == -1)
951 FatalError (1, "Unable to write huffman data");
952
953 Xfree (hd->clens);
954 Xfree (hd);
955}
956
957
958
959
960static void
961Write_wordlens (FILE * f, DictData * dd, int zero_freq_permitted)
962{
963 int j;
964 long freqs[16];
965 WordData **wd = dd->wd;
966 huff_data *hd;
967
968 bzero ((char *) freqs, sizeof (freqs));
969
970 for (j = 0; j < dd->num_wds; j++, wd++)
971 freqs[(*wd)->word[0]] += (*wd)->freq;
972
973 if (!zero_freq_permitted)
974 for (j = 0; j < 16; j++)
975 if (!freqs[j])
976 freqs[j] = 1;
977
978 if (!(hd = Generate_Huffman_Data (16, freqs, NULL, NULL)))
979 FatalError (1, "Unable to allocate memory for huffman data");
980
981 if (Write_Huffman_Data (f, hd) == -1)
982 FatalError (1, "Unable to write huffman data");
983
984
985 Xfree (hd->clens);
986 Xfree (hd);
987}
988
989
990
991static u_long
992WriteOutWords (char *file_name, u_long type, int lookback)
993{
994 FILE *f;
995 int i;
996 u_long mem_reqd = 0;
997
998 compression_dict_header cdh;
999 f = create_file (file_name, TEXT_DICT_SUFFIX, "w+b",
1000 MAGIC_DICT, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
1001
1002 bzero((char *) &cdh, sizeof(cdh));
1003
1004 cdh.dict_type = type;
1005 cdh.novel_method = (type == MG_SEED_DICTIONARY) ? novel_method :
1006 MG_NOVEL_HUFFMAN_CHARS;
1007
1008 cdh.num_words[1] = keep[1].num_wds;
1009 cdh.num_words[0] = keep[0].num_wds;
1010 cdh.num_word_chars[1] = Uncompressed_size (&keep[1]);
1011 cdh.num_word_chars[0] = Uncompressed_size (&keep[0]);
1012 cdh.lookback = lookback;
1013
1014 Write_cdh (f, &cdh);
1015
1016 for (i = 0; i < 2; i++)
1017 switch (type)
1018 {
1019 case MG_COMPLETE_DICTIONARY:
1020 {
1021 mem_reqd += Write_data (f, &keep[i], lookback);
1022 }
1023 break;
1024 case MG_PARTIAL_DICTIONARY:
1025 {
1026 if (keep[i].num_wds)
1027 {
1028 int j;
1029 WordData esc;
1030 esc.freq = 0;
1031 esc.word = (u_char *) "";
1032 keep[i].wd[keep[i].num_wds++] = &esc;
1033 for (j = 0; j < discard[i].num_wds; j++)
1034 esc.freq += discard[i].wd[j]->freq;
1035 if (!esc.freq)
1036 esc.freq++;
1037 mem_reqd += Write_data (f, &keep[i], lookback);
1038 }
1039 Write_charfreqs (f, &discard[i], i, 1);
1040 Write_wordlens (f, &discard[i], 1);
1041 }
1042 break;
1043 case MG_SEED_DICTIONARY:
1044 {
1045 if (keep[i].num_wds)
1046 {
1047 int j;
1048 WordData esc;
1049 esc.freq = 0;
1050 esc.word = (u_char *) "";
1051 keep[i].wd[keep[i].num_wds++] = &esc;
1052 for (j = 0; j < all[i].num_wds; j++)
1053 if (all[i].wd[j]->freq == 1)
1054 esc.freq++;
1055 if (!esc.freq)
1056 esc.freq++;
1057 mem_reqd += Write_data (f, &keep[i], lookback);
1058 }
1059 switch (novel_method)
1060 {
1061 case MG_NOVEL_HUFFMAN_CHARS:
1062 Write_charfreqs (f, &all[i], i, 0);
1063 Write_wordlens (f, &all[i], 0);
1064 break;
1065 case MG_NOVEL_BINARY:
1066 break;
1067 case MG_NOVEL_DELTA:
1068 break;
1069 case MG_NOVEL_HYBRID:
1070 break;
1071 case MG_NOVEL_HYBRID_MTF:
1072 break;
1073 default:
1074 FatalError (1, "Bad novel method");
1075 }
1076 }
1077 break;
1078 }
1079 fclose (f);
1080 return mem_reqd;
1081}
Note: See TracBrowser for help on using the repository browser.