source: trunk/gsdl/packages/mg/src/text/mg_compression_dict.c@ 1014

Last change on this file since 1014 was 820, checked in by cs025, 25 years ago

Incorrect use of NTOHUL on a double value (num_bytes).

  • Property svn:keywords set to Author Date Id Revision
File size: 26.7 KB
Line 
1/**************************************************************************
2 *
3 * mg_compression_dict.c -- Routines for reading the compression dictionary
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: mg_compression_dict.c 820 1999-12-10 15:04:25Z cs025 $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25
26#include "memlib.h"
27#include "messages.h"
28#include "local_strings.h"
29#include "bitio_gen.h"
30#include "bitio_m.h"
31#include "bitio_m_stdio.h"
32#include "timing.h"
33#include "mgheap.h"
34#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
35
36#include "mg_files.h"
37#include "locallib.h"
38#include "invf.h"
39#include "text.h"
40#include "words.h"
41#include "mg.h"
42
43#define MAXBITS (sizeof(unsigned long) * 8)
44
45/*
46 $Log$
47 Revision 1.2 1999/12/10 15:04:25 cs025
48 Incorrect use of NTOHUL on a double value (num_bytes).
49
50 Revision 1.1 1999/08/10 21:18:04 sjboddie
51 renamed mg-1.3d directory mg
52
53 Revision 1.4 1999/01/15 03:05:51 rjmcnab
54
55 Renamed lib/heap to be lib/mgheap (it was causing some problems with
56 some versions of the STL libraries which contained a heap.h)
57
58 Revision 1.3 1998/12/17 09:12:52 rjmcnab
59
60 Altered mg to process utf-8 encoded Unicode. The main changes
61 are in the parsing of the input, the casefolding, and the stemming.
62
63 Revision 1.2 1998/11/25 07:55:44 rjmcnab
64
65 Modified mg to that you can specify the stemmer you want
66 to use via a command line option. You specify it to
67 mg_passes during the build process. The number of the
68 stemmer that you used is stored within the inverted
69 dictionary header and the stemmed dictionary header so
70 the correct stemmer is used in later stages of building
71 and querying.
72
73 Revision 1.1 1998/11/17 09:34:52 rjmcnab
74 *** empty log message ***
75
76 * Revision 1.3 1994/10/20 03:56:54 tes
77 * I have rewritten the boolean query optimiser and abstracted out the
78 * components of the boolean query.
79 *
80 * Revision 1.2 1994/09/20 04:41:45 tes
81 * For version 1.1
82 *
83 */
84
85static char *RCSID = "$Id: mg_compression_dict.c 820 1999-12-10 15:04:25Z cs025 $";
86
87/* #define DEBUG1 */
88
89/* #define DEBUG */
90
91#define is_power_of_two(a) ((a) != 0 && (((a) & ((a)-1)) == 0))
92
93#define MAX_RECALCULATIONS 100
94
95typedef struct WordData
96 {
97 u_long freq, occur_num;
98 float saving;
99 float char_bit_cost;
100 u_long num_trans;
101 u_char *word;
102 }
103WordData;
104
105static WordData *Words[2];
106static u_long Num[2];
107static u_long chars[2];
108
109#define KIND(p) (((p) >= Words[0] && (p) < Words[0] + Num[0]) ? 0 : 1)
110#define IsWord(p) (((p) >= Words[1] && (p) < Words[1] + Num[1]) ? 1 : 0)
111#define IsNonWord(p) (((p) >= Words[0] && (p) < Words[0] + Num[0]) ? 1 : 0)
112
113
114typedef struct DictData
115 {
116 WordData **wd;
117 u_long num_wds;
118 u_long chars;
119 }
120DictData;
121
122typedef DictData DictInfo[2];
123
124static DictInfo keep, discard, all;
125static compression_stats_header csh;
126
127static char *file_name = "";
128static u_long novel_method = MG_NOVEL_HUFFMAN_CHARS;
129
130
131static void ReadInWords (char *);
132static void Select_on (int k, heap_comp hc);
133static void Method3 (int k);
134static void Select_all (void);
135static u_long WriteOutWords (char *, u_long, int);
136static int DecFreqIncWL (void *a, void *b);
137static int OccuranceOrder (void *a, void *b);
138
139
140
141int main (int argc, char **argv)
142{
143 ProgTime StartTime;
144 int ch;
145 char type = 'C';
146 char mode = '0';
147 int lookback = 2;
148 double k = 0;
149 u_long mem_reqd;
150 opterr = 0;
151 msg_prefix = argv[0];
152 while ((ch = getopt (argc, argv, "0123CPSf:d:l:hk:HBDYM")) != -1)
153 switch (ch)
154 {
155 case 'H':
156 novel_method = MG_NOVEL_HUFFMAN_CHARS;
157 break;
158 case 'B':
159 novel_method = MG_NOVEL_BINARY;
160 break;
161 case 'D':
162 novel_method = MG_NOVEL_DELTA;
163 break;
164 case 'Y':
165 novel_method = MG_NOVEL_HYBRID;
166 break;
167 case 'M':
168 novel_method = MG_NOVEL_HYBRID_MTF;
169 break;
170 case 'f': /* input file */
171 file_name = optarg;
172 break;
173 case 'd':
174 set_basepath (optarg);
175 break;
176 case 'C':
177 case 'P':
178 case 'S':
179 type = ch;
180 break;
181 case '0':
182 case '1':
183 case '2':
184 case '3':
185 mode = ch;
186 break;
187 case 'l':
188 lookback = atoi (optarg);
189 if (!is_power_of_two (lookback))
190 FatalError (1, "The lookback value must be a power of 2");
191 lookback = floorlog_2 (lookback);
192 break;
193 case 'k':
194 k = atof (optarg) * 1024;
195 break;
196 case 'h':
197 case '?':
198 fprintf (stderr, "usage: %s [-l lookback] [-f input_file]"
199 "[-d data directory] [-h] [-0|-1|-2|-3] [-C|-P|-S] [-k mem (Kb)]\n",
200 argv[0]);
201 exit (1);
202 }
203 GetTime (&StartTime);
204
205 ReadInWords (file_name);
206
207 if (type == 'C')
208 {
209 Select_all ();
210 mem_reqd = WriteOutWords (file_name, MG_COMPLETE_DICTIONARY, lookback);
211 }
212 else
213 {
214 switch (mode)
215 {
216 case '0':
217 Select_all ();
218 break;
219 case '1':
220 Message ("Dictionary limit of %.2f Kb", k / 1024);
221 Select_on ((int) k, OccuranceOrder);
222 break;
223 case '2':
224 Message ("Dictionary limit of %.2f Kb", k / 1024);
225 Select_on ((int) k, DecFreqIncWL);
226 break;
227 case '3':
228 Message ("Dictionary limit of %.2f Kb", k / 1024);
229 Method3 ((int) k);
230 break;
231 }
232 if (type == 'P')
233 {
234 mem_reqd = WriteOutWords (file_name, MG_PARTIAL_DICTIONARY, lookback);
235 }
236 else
237 {
238 mem_reqd = WriteOutWords (file_name, MG_SEED_DICTIONARY, lookback);
239 }
240 }
241
242 Message ("Num words : %8u -> %8u\n", Num[1], keep[1].num_wds);
243 Message ("Num non-words : %8u -> %8u\n", Num[0], keep[0].num_wds);
244 Message ("Chars of words : %8u -> %8u\n", chars[1], keep[1].chars);
245 Message ("Chars of non-words : %8u -> %8u\n", chars[0], keep[0].chars);
246 Message ("Mem usage : %8u -> %8u\n",
247 (Num[0] + Num[1]) * sizeof (char *) + chars[0] + chars[1],
248 (keep[0].num_wds + keep[1].num_wds) * sizeof (char *) +
249 keep[0].chars + keep[1].chars);
250 Message ("Actual mem required : %8u\n", mem_reqd);
251
252 Message ("%s", ElapsedTime (&StartTime, NULL));
253 return 0;
254}
255
256
257
258
259static void
260ReadInWords (char *filename)
261{
262 FILE *f;
263 unsigned long i;
264 f = open_file (filename, TEXT_STATS_DICT_SUFFIX, "rb",
265 MAGIC_STATS_DICT, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
266
267 fread (&csh, sizeof (csh), 1, f);
268
269 /* [RPAP - Jan 97: Endian Ordering] */
270 NTOHUL(csh.num_docs);
271 NTOHD(csh.num_bytes);
272
273 for (i = 0; i < 2; i++)
274 {
275 frags_stats_header fsh;
276 char *buf;
277 WordData *wd;
278 int j;
279 chars[i] = 0;
280
281 fread (&fsh, sizeof (fsh), 1, f);
282 /* [RPAP - Jan 97: Endian Ordering] */
283 NTOHUL(fsh.num_frags);
284 NTOHUL(fsh.mem_for_frags);
285
286 Num[i] = all[i].num_wds = fsh.num_frags;
287
288 /* The +1 on the following line is to leave room for the esc code. */
289 all[i].wd = Xmalloc (sizeof (WordData *) * Num[i] + 1);
290
291 buf = Xmalloc (fsh.mem_for_frags);
292 wd = Words[i] = Xmalloc (sizeof (WordData) * Num[i]);
293 for (j = 0; j < Num[i]; j++, wd++)
294 {
295 int len;
296 fread (&wd->freq, sizeof (wd->freq), 1, f);
297 NTOHUL(wd->freq); /* [RPAP - Jan 97: Endian Ordering] */
298 fread (&wd->occur_num, sizeof (wd->occur_num), 1, f);
299 NTOHUL(wd->occur_num); /* [RPAP - Jan 97: Endian Ordering] */
300 len = fgetc (f);
301 wd->word = (u_char *) buf;
302 *buf++ = len;
303 fread (buf, len, 1, f);
304 buf += len;
305 all[i].wd[j] = wd;
306 }
307 chars[i] = fsh.mem_for_frags - fsh.num_frags;
308 }
309 fclose (f);
310}
311
312
313static void
314Alloc_keep_discard (void)
315{
316 keep[0].num_wds = 0;
317 keep[0].wd = Xmalloc ((Num[0] + 1) * sizeof (WordData *));
318 keep[1].num_wds = 0;
319 keep[1].wd = Xmalloc ((Num[1] + 1) * sizeof (WordData *));
320 discard[0].num_wds = 0;
321 discard[0].wd = Xmalloc ((Num[0] + 1) * sizeof (WordData *));
322 discard[1].num_wds = 0;
323 discard[1].wd = Xmalloc ((Num[1] + 1) * sizeof (WordData *));
324}
325
326
327static int
328sort_comp (const void *a, const void *b)
329{
330 WordData *aa = *(WordData **) a;
331 WordData *bb = *(WordData **) b;
332 return casecompare (aa->word, bb->word); /* [RPAP - Jan 97: Stem Index Change] */
333}
334
335
336
337static void
338SortAndCount_DictData (DictData * dd)
339{
340 int i;
341 WordData **wd;
342 qsort (dd->wd, dd->num_wds, sizeof (WordData *), sort_comp);
343 dd->chars = 0;
344 wd = dd->wd;
345 for (i = 0; i < dd->num_wds; i++, wd++)
346 dd->chars += (*wd)->word[0];
347}
348
349
350static void
351Select_all (void)
352{
353 int i;
354 Alloc_keep_discard ();
355 keep[0].num_wds = Num[0];
356 for (i = 0; i < Num[0]; i++)
357 keep[0].wd[i] = Words[0] + i;
358 keep[1].num_wds = Num[1];
359 for (i = 0; i < Num[1]; i++)
360 keep[1].wd[i] = Words[1] + i;
361 SortAndCount_DictData (&keep[0]);
362 SortAndCount_DictData (&keep[1]);
363}
364
365
366
367
368static int
369DecFreqIncWL (void *a, void *b)
370{
371 WordData *aa = *(WordData **) a;
372 WordData *bb = *(WordData **) b;
373 if (aa->freq > bb->freq)
374 return -1;
375 else if (aa->freq < bb->freq)
376 return 1;
377 else
378 return bb->word[0] - aa->word[0];
379}
380
381
382static int
383OccuranceOrder (void *a, void *b)
384{
385 WordData *aa = *(WordData **) a;
386 WordData *bb = *(WordData **) b;
387 if (aa->occur_num > bb->occur_num)
388 return 1;
389 else if (aa->occur_num < bb->occur_num)
390 return -1;
391 else
392 return 0;
393}
394
395
396static void
397Select_on (int k, heap_comp hc)
398{
399 int i, num, mem;
400 WordData **wd;
401
402 Alloc_keep_discard ();
403
404 num = Num[0] + Num[1];
405 wd = Xmalloc (num * sizeof (WordData *));
406 for (i = 0; i < Num[0]; i++)
407 wd[i] = Words[0] + i;
408 for (i = 0; i < Num[1]; i++)
409 wd[i + Num[0]] = Words[1] + i;
410
411 heap_build (wd, sizeof (*wd), num, hc);
412
413 mem = 0;
414 while (k > mem && num)
415 {
416 int idx;
417 WordData *word = wd[0];
418#ifdef DEBUG
419 fprintf (stderr, "%4d:%4d:%8d :%8d :%8d : \"%s\"\n",
420 keep[0].num_wds, keep[1].num_wds,
421 mem, word->freq, word->occur_num, word2str (word->word));
422#endif
423 mem += sizeof (u_char *) + word->word[0];
424 heap_deletehead (wd, sizeof (*wd), &num, hc);
425 idx = KIND (word);
426 keep[idx].wd[keep[idx].num_wds++] = word;
427 }
428
429 for (i = 0; i < num; i++)
430 {
431 WordData *word = wd[i];
432 int idx = KIND (word);
433 discard[idx].wd[discard[idx].num_wds++] = word;
434 }
435 SortAndCount_DictData (&keep[0]);
436 SortAndCount_DictData (&keep[1]);
437 SortAndCount_DictData (&discard[0]);
438 SortAndCount_DictData (&discard[1]);
439
440 assert (keep[0].num_wds + discard[0].num_wds == Num[0]);
441 assert (keep[1].num_wds + discard[1].num_wds == Num[1]);
442
443}
444
445
446
447static int
448BigSaving (void *a, void *b)
449{
450 WordData *aa = *(WordData **) a;
451 WordData *bb = *(WordData **) b;
452 return (aa->saving > bb->saving) ? -1 : (aa->saving < bb->saving);
453}
454
455static int
456SmallSaving (void *a, void *b)
457{
458 WordData *aa = *(WordData **) a;
459 WordData *bb = *(WordData **) b;
460 return (aa->saving < bb->saving) ? -1 : (aa->saving > bb->saving);
461}
462
463
464static void
465CalcCharCounts (WordData ** wd, int num,
466 char *char_lens[2], char *len_lens[2],
467 u_long escape[2])
468{
469 long char_freqs[2][256];
470 long len_freqs[2][16];
471 huff_data hd;
472 int i;
473 escape[0] = 0;
474 escape[1] = 0;
475 bzero ((char *) char_freqs, sizeof (char_freqs));
476 bzero ((char *) len_freqs, sizeof (len_freqs));
477 for (i = 0; i < num; i++, wd++)
478 {
479 u_long freq = (*wd)->freq;
480 u_char *buf = (*wd)->word;
481 int len = *buf++;
482 int idx = KIND (*wd);
483 len_freqs[idx][len] += freq;
484 escape[idx] += freq;
485 for (; len; len--, buf++)
486 char_freqs[idx][(u_long) (*buf)] += freq;
487 }
488 Generate_Huffman_Data (256, char_freqs[0], &hd, NULL);
489 char_lens[0] = hd.clens;
490 Generate_Huffman_Data (256, char_freqs[1], &hd, NULL);
491 char_lens[1] = hd.clens;
492
493 Generate_Huffman_Data (16, len_freqs[0], &hd, NULL);
494 len_lens[0] = hd.clens;
495 Generate_Huffman_Data (16, len_freqs[1], &hd, NULL);
496 len_lens[1] = hd.clens;
497}
498
499
500
501
502
503
504
505
506
507void
508CalcBitCost (WordData ** discard_word, int discard_num,
509 WordData ** keep_word, int keep_num, double freqs_trans[2],
510 u_long escape[2], int num_trans)
511{
512 char *char_lens[2];
513 char *len_lens[2];
514 double esc[2];
515 int j;
516 CalcCharCounts (discard_word, discard_num, char_lens, len_lens, escape);
517 esc[0] = -log2 (escape[0] / (freqs_trans[0] + escape[0]));
518 esc[1] = -log2 (escape[1] / (freqs_trans[1] + escape[1]));
519 for (j = 0; j < discard_num; j++, discard_word++)
520 {
521 float cbc, wbc;
522 u_char *buf = (*discard_word)->word;
523 int len = *buf++;
524 u_long freq = (*discard_word)->freq;
525 int idx = KIND (*discard_word);
526
527 cbc = len_lens[idx][len];
528 for (; len; len--, buf++)
529 cbc += char_lens[idx][(u_long) (*buf)];
530
531 (*discard_word)->char_bit_cost = (cbc + esc[idx]) * freq;
532
533 wbc = -log2 (freq / (freqs_trans[idx] + escape[idx])) * freq;
534
535 (*discard_word)->saving = ((*discard_word)->char_bit_cost - wbc) /
536 (sizeof (char *) + (*discard_word)->word[0]);
537
538 (*discard_word)->num_trans = num_trans;
539 }
540 for (j = 0; j < keep_num; j++, keep_word++)
541 {
542 float cbc, wbc;
543 u_char *buf = (*keep_word)->word;
544 int len = *buf++;
545 u_long freq = (*keep_word)->freq;
546 int idx = KIND (*keep_word);
547
548 cbc = len_lens[idx][len];
549 for (; len; len--, buf++)
550 cbc += char_lens[idx][(u_long) (*buf)];
551
552 (*keep_word)->char_bit_cost = (cbc + esc[idx]) * freq;
553
554 wbc = -log2 (freq / (freqs_trans[idx] + escape[idx])) * freq;
555
556 (*keep_word)->saving = ((*keep_word)->char_bit_cost - wbc) /
557 (sizeof (char *) + (*keep_word)->word[0]);
558
559 (*keep_word)->num_trans = num_trans;
560 }
561 Xfree (char_lens[0]);
562 Xfree (char_lens[1]);
563 Xfree (len_lens[0]);
564 Xfree (len_lens[1]);
565}
566
567
568
569static void
570Method3 (int k)
571{
572 int i, keep_num, discard_num, mem, num_trans, recalc_reqd;
573 int keep_to_discard = 0;
574 int discard_to_keep = 0;
575 int recalcs = 0;
576 double freqs_trans[2], total;
577 u_long escape[2];
578 WordData **keep_heap, **discard_heap;
579
580 freqs_trans[0] = freqs_trans[1] = 0;
581 num_trans = 0;
582
583 discard_num = Num[0] + Num[1];
584 discard_heap = Xmalloc (discard_num * sizeof (WordData *));
585
586 keep_num = 0;
587 keep_heap = Xmalloc (discard_num * sizeof (WordData *));
588
589
590 for (i = 0; i < Num[0]; i++)
591 discard_heap[i] = Words[0] + i;
592 for (i = 0; i < Num[1]; i++)
593 discard_heap[i + Num[0]] = Words[1] + i;
594
595 heap_build (discard_heap, sizeof (*discard_heap), discard_num, DecFreqIncWL);
596
597 mem = 0;
598 while (k > mem && discard_num)
599 {
600 WordData *word = discard_heap[0];
601 mem += sizeof (u_char *) + word->word[0];
602 heap_deletehead (discard_heap, sizeof (word), &discard_num, DecFreqIncWL);
603 keep_heap[keep_num++] = word;
604 freqs_trans[KIND (word)] += word->freq;
605 num_trans++;
606 }
607
608 CalcBitCost (discard_heap, discard_num, keep_heap, keep_num,
609 freqs_trans, escape, num_trans);
610 heap_build (discard_heap, sizeof (*discard_heap), discard_num, BigSaving);
611 heap_build (keep_heap, sizeof (*keep_heap), keep_num, SmallSaving);
612
613 total = 0;
614 recalc_reqd = 0;
615 while (keep_num && discard_num)
616 {
617 if ((keep_num && keep_heap[0]->num_trans < num_trans) ||
618 (discard_num && discard_heap[0]->num_trans < num_trans))
619 {
620 if (keep_num && keep_heap[0]->num_trans < num_trans)
621 {
622 WordData *word = keep_heap[0];
623 int idx = KIND (word);
624 float wbc;
625#ifdef DEBUG1
626 fprintf (stderr, "KEEP \"%s\" %.2f ->", word2str (word->word),
627 word->saving);
628#endif
629 wbc = -log2 (word->freq / (freqs_trans[idx] + escape[idx])) *
630 word->freq;
631 word->saving = (word->char_bit_cost - wbc) /
632 (sizeof (char *) + word->word[0]);
633#ifdef DEBUG1
634 fprintf (stderr, " %.2f\n", word->saving);
635#endif
636 word->num_trans = num_trans;
637 heap_changedhead (keep_heap, sizeof (word), keep_num, SmallSaving);
638 }
639
640 if (discard_num && discard_heap[0]->num_trans < num_trans)
641 {
642 WordData *word = discard_heap[0];
643 int idx = KIND (word);
644 float wbc;
645#ifdef DEBUG1
646 fprintf (stderr, "DISCARD \"%s\" %.2f ->", word2str (word->word),
647 word->saving);
648#endif
649 wbc = -log2 (word->freq / (freqs_trans[idx] + escape[idx])) *
650 word->freq;
651 word->saving = (word->char_bit_cost - wbc) /
652 (sizeof (char *) + word->word[0]);
653#ifdef DEBUG1
654 fprintf (stderr, " %.2f\n", word->saving);
655#endif
656 word->num_trans = num_trans;
657 heap_changedhead (discard_heap, sizeof (word),
658 discard_num, BigSaving);
659 }
660 }
661 else if (keep_heap[0]->saving < discard_heap[0]->saving)
662 {
663 assert (keep_num && discard_num);
664 if (keep_num && mem + sizeof (char *) + discard_heap[0]->word[0] > k)
665 {
666 /* Transfer the word at the top of the keep heap to the
667 discard heap. */
668 WordData *word = keep_heap[0];
669 int idx = KIND (word);
670 heap_deletehead (keep_heap, sizeof (word), &keep_num, SmallSaving);
671 discard_heap[discard_num] = word;
672 heap_additem (discard_heap, sizeof (word), &discard_num,
673 BigSaving);
674 freqs_trans[idx] -= word->freq;
675 mem -= sizeof (u_char *) + word->word[0];
676 num_trans++;
677 total += word->saving;
678 keep_to_discard++;
679#ifdef DEBUG
680 fprintf (stderr,
681 "KEEP -> DISCARD %8d :%8d :%8.0f :%8.0f : \"%s\"\n",
682 mem, word->freq, word->char_bit_cost,
683 word->saving, word2str (word->word));
684#endif
685 }
686 else
687 {
688 /* Transfer the word at the top of the discard heap to the
689 keep heap. */
690 WordData *word = discard_heap[0];
691 int idx = KIND (word);
692 heap_deletehead (discard_heap, sizeof (word),
693 &discard_num, BigSaving);
694 keep_heap[keep_num] = word;
695 heap_additem (keep_heap, sizeof (word), &keep_num, SmallSaving);
696 freqs_trans[idx] += word->freq;
697 mem += sizeof (u_char *) + word->word[0];
698 num_trans++;
699 total += word->saving;
700 discard_to_keep++;
701#ifdef DEBUG
702 fprintf (stderr,
703 "DISCARD -> KEEP %8d :%8d :%8.0f :%8.0f : \"%s\"\n",
704 mem, word->freq, word->char_bit_cost,
705 word->saving, word2str (word->word));
706#endif
707 }
708
709 recalc_reqd = 1;
710
711 }
712 else
713 {
714 if (recalc_reqd == 0)
715 break;
716#ifdef DEBUG1
717 fprintf (stderr, "--------------\n");
718#endif
719 if (recalcs == MAX_RECALCULATIONS)
720 break;
721 CalcBitCost (discard_heap, discard_num, keep_heap, keep_num,
722 freqs_trans, escape, num_trans);
723 heap_build (discard_heap, sizeof (*discard_heap),
724 discard_num, BigSaving);
725 heap_build (keep_heap, sizeof (keep_heap), keep_num, SmallSaving);
726 recalc_reqd = 0;
727 recalcs++;
728 }
729 }
730
731 Alloc_keep_discard ();
732
733 for (i = 0; i < discard_num; i++)
734 {
735 WordData *word = discard_heap[i];
736 int idx = KIND (word);
737 assert (IsWord (word) || IsNonWord (word));
738 discard[idx].wd[discard[idx].num_wds++] = word;
739 }
740 for (i = 0; i < keep_num; i++)
741 {
742 WordData *word = keep_heap[i];
743 int idx = KIND (word);
744 assert (IsWord (word) || IsNonWord (word));
745 keep[idx].wd[keep[idx].num_wds++] = word;
746 }
747 SortAndCount_DictData (&keep[0]);
748 SortAndCount_DictData (&keep[1]);
749 SortAndCount_DictData (&discard[0]);
750 SortAndCount_DictData (&discard[1]);
751
752 assert (keep[0].num_wds + discard[0].num_wds == Num[0]);
753 assert (keep[1].num_wds + discard[1].num_wds == Num[1]);
754 Message ("Keep -> Discard : %8d", keep_to_discard);
755 Message ("Discard -> Keep : %8d", discard_to_keep);
756 Message ("Huffman Recalculations : %8d", recalcs);
757 if (recalcs == MAX_RECALCULATIONS)
758 Message ("WARNING: The number of recalculations == %d", MAX_RECALCULATIONS);
759}
760
761
762
763
764
765
766
767
768
769
770/****************************************************************************
771 ***** *****
772 ***** Dictionary saving code *****
773 ***** *****
774 ****************************************************************************/
775
776
777
778static void
779Write_cdh (FILE * f, compression_dict_header * cdh)
780{
781 /* [RPAP - Jan 97: Endian Ordering] */
782 int i;
783 compression_dict_header tmp = *cdh;
784 HTONUL(tmp.dict_type);
785 HTONUL(tmp.novel_method);
786 for (i = 0; i < TEXT_PARAMS; i++)
787 HTONUL(tmp.params[i]);
788 HTONUL(tmp.num_words[0]);
789 HTONUL(tmp.num_words[1]);
790 HTONUL(tmp.num_word_chars[0]);
791 HTONUL(tmp.num_word_chars[1]);
792 HTONUL(tmp.lookback);
793
794 fwrite (&tmp, sizeof (tmp), 1, f);
795}
796
797
798static void
799Write_words (FILE * f, DictData * dd)
800{
801 int i;
802 u_char *curr, *prev = NULL;
803 for (i = 0; i < dd->num_wds; i++)
804 {
805 int len;
806 curr = dd->wd[i]->word;
807 if (prev)
808 /* look for prefix match with prev string */
809 len = prefixlen (prev, curr);
810 else
811 len = 0;
812 fputc ((len << 4) + (curr[0] - len), f);
813 fwrite (curr + len + 1, sizeof (u_char), curr[0] - len, f);
814 prev = curr;
815 }
816
817}
818
819
820static int
821Uncompressed_size (DictData * dd)
822{
823 int i, us;
824 for (us = i = 0; i < dd->num_wds; i++)
825 us += dd->wd[i]->word[0];
826 return us;
827}
828
829
830static u_long
831Write_data (FILE * f, DictData * dd, int lookback)
832{
833 u_long mem_reqd;
834 huff_data *hd;
835 int i;
836 u_long us = dd->num_wds;
837 long *freqs;
838 u_long huff_words_size[MAX_HUFFCODE_LEN + 1];
839 u_long lencounts[MAX_HUFFCODE_LEN + 1];
840 u_char *lastword[MAX_HUFFCODE_LEN + 1];
841
842 if (!(freqs = Xmalloc ((dd->num_wds) * sizeof (*freqs))))
843 FatalError (1, "Unable to allocate memory for freqs");
844
845 for (i = 0; i < dd->num_wds; i++)
846 {
847 freqs[i] = dd->wd[i]->freq;
848 us += dd->wd[i]->word[0];
849 }
850
851 if (!(hd = Generate_Huffman_Data (dd->num_wds, freqs, NULL, NULL)))
852 FatalError (1, "Unable to allocate memory for huffman data");
853
854 Xfree (freqs), freqs = NULL;
855
856 if (Write_Huffman_Data (f, hd) == -1)
857 FatalError (1, "Unable to write huffman data");
858
859 HTONUL(us); /* [RPAP - Jan 97: Endian Ordering] */
860 fwrite (&us, sizeof (us), 1, f);
861 NTOHUL(us); /* [RPAP - Jan 97: Endian Ordering] */
862
863
864/* Calculate the amount of memory that will be required to store the text for
865 each different huffman code len. Every 1<<lookback words for each different
866 codelen length will not be prefixed by previous strings. */
867
868
869 bzero ((char *) &huff_words_size, sizeof (huff_words_size));
870 bzero ((char *) &lencounts, sizeof (lencounts));
871
872 mem_reqd = 0;
873
874 for (i = 0; i < dd->num_wds; i++)
875 {
876 int codelen = hd->clens[i];
877 u_char *word = dd->wd[i]->word;
878
879 if (!codelen)
880 FatalError (1, "The length of a code for a word was zero");
881
882 huff_words_size[codelen] += word[0] + 1;
883 mem_reqd += word[0] + (lookback != 0);
884#if 0
885 if ((lencounts[codelen] & ((1 << lookback) - 1)) == 0)
886 lastword[codelen] = word;
887 else
888 huff_words_size[codelen] -= prefixlen (lastword[codelen], word);
889#else
890 if ((lencounts[codelen] & ((1 << lookback) - 1)) != 0)
891 {
892 int save = prefixlen (lastword[codelen], word);
893 mem_reqd -= save;
894 huff_words_size[codelen] -= save;
895 }
896 else
897 {
898 mem_reqd += sizeof (u_char *);
899 }
900 lastword[codelen] = word;
901#endif
902 lencounts[codelen]++;
903 }
904
905 /* [RPAP - Jan 97: Endian Ordering] */
906 for (i = hd->mincodelen; i < hd->maxcodelen + 1; i++)
907 HTONUL(huff_words_size[i]);
908
909 fwrite (huff_words_size + hd->mincodelen, sizeof (*huff_words_size),
910 hd->maxcodelen - hd->mincodelen + 1, f);
911
912 /* [RPAP - Jan 97: Endian Ordering] */
913 for (i = hd->mincodelen; i < hd->maxcodelen + 1; i++)
914 NTOHUL(huff_words_size[i]);
915
916 Write_words (f, dd);
917
918 Xfree (hd->clens);
919 Xfree (hd);
920
921 return mem_reqd;
922}
923
924
925
926static void
927Write_charfreqs (FILE * f, DictData * dd, int words,
928 int zero_freq_permitted)
929{
930 int j;
931 long freqs[256];
932 WordData **wd = dd->wd;
933 huff_data *hd;
934
935 bzero ((char *) freqs, sizeof (freqs));
936
937 for (j = 0; j < dd->num_wds; j++, wd++)
938 {
939 u_char *buf = (*wd)->word;
940 int len = *buf++;
941 for (; len; len--, buf++)
942 freqs[(u_long) (*buf)] += (*wd)->freq;
943 }
944
945 if (!zero_freq_permitted)
946 for (j = 0; j < 256; j++)
947 if (!freqs[j] && PESINAWORD (j) == words)
948 freqs[j] = 1;
949
950 if (!(hd = Generate_Huffman_Data (256, freqs, NULL, NULL)))
951 FatalError (1, "Unable to allocate memory for huffman data");
952
953 if (Write_Huffman_Data (f, hd) == -1)
954 FatalError (1, "Unable to write huffman data");
955
956 Xfree (hd->clens);
957 Xfree (hd);
958}
959
960
961
962
963static void
964Write_wordlens (FILE * f, DictData * dd, int zero_freq_permitted)
965{
966 int j;
967 long freqs[16];
968 WordData **wd = dd->wd;
969 huff_data *hd;
970
971 bzero ((char *) freqs, sizeof (freqs));
972
973 for (j = 0; j < dd->num_wds; j++, wd++)
974 freqs[(*wd)->word[0]] += (*wd)->freq;
975
976 if (!zero_freq_permitted)
977 for (j = 0; j < 16; j++)
978 if (!freqs[j])
979 freqs[j] = 1;
980
981 if (!(hd = Generate_Huffman_Data (16, freqs, NULL, NULL)))
982 FatalError (1, "Unable to allocate memory for huffman data");
983
984 if (Write_Huffman_Data (f, hd) == -1)
985 FatalError (1, "Unable to write huffman data");
986
987
988 Xfree (hd->clens);
989 Xfree (hd);
990}
991
992
993
994static u_long
995WriteOutWords (char *file_name, u_long type, int lookback)
996{
997 FILE *f;
998 int i;
999 u_long mem_reqd = 0;
1000
1001 compression_dict_header cdh;
1002 f = create_file (file_name, TEXT_DICT_SUFFIX, "w+b",
1003 MAGIC_DICT, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
1004
1005 bzero((char *) &cdh, sizeof(cdh));
1006
1007 cdh.dict_type = type;
1008 cdh.novel_method = (type == MG_SEED_DICTIONARY) ? novel_method :
1009 MG_NOVEL_HUFFMAN_CHARS;
1010
1011 cdh.num_words[1] = keep[1].num_wds;
1012 cdh.num_words[0] = keep[0].num_wds;
1013 cdh.num_word_chars[1] = Uncompressed_size (&keep[1]);
1014 cdh.num_word_chars[0] = Uncompressed_size (&keep[0]);
1015 cdh.lookback = lookback;
1016
1017 Write_cdh (f, &cdh);
1018
1019 for (i = 0; i < 2; i++)
1020 switch (type)
1021 {
1022 case MG_COMPLETE_DICTIONARY:
1023 {
1024 mem_reqd += Write_data (f, &keep[i], lookback);
1025 }
1026 break;
1027 case MG_PARTIAL_DICTIONARY:
1028 {
1029 if (keep[i].num_wds)
1030 {
1031 int j;
1032 WordData esc;
1033 esc.freq = 0;
1034 esc.word = (u_char *) "";
1035 keep[i].wd[keep[i].num_wds++] = &esc;
1036 for (j = 0; j < discard[i].num_wds; j++)
1037 esc.freq += discard[i].wd[j]->freq;
1038 if (!esc.freq)
1039 esc.freq++;
1040 mem_reqd += Write_data (f, &keep[i], lookback);
1041 }
1042 Write_charfreqs (f, &discard[i], i, 1);
1043 Write_wordlens (f, &discard[i], 1);
1044 }
1045 break;
1046 case MG_SEED_DICTIONARY:
1047 {
1048 if (keep[i].num_wds)
1049 {
1050 int j;
1051 WordData esc;
1052 esc.freq = 0;
1053 esc.word = (u_char *) "";
1054 keep[i].wd[keep[i].num_wds++] = &esc;
1055 for (j = 0; j < all[i].num_wds; j++)
1056 if (all[i].wd[j]->freq == 1)
1057 esc.freq++;
1058 if (!esc.freq)
1059 esc.freq++;
1060 mem_reqd += Write_data (f, &keep[i], lookback);
1061 }
1062 switch (novel_method)
1063 {
1064 case MG_NOVEL_HUFFMAN_CHARS:
1065 Write_charfreqs (f, &all[i], i, 0);
1066 Write_wordlens (f, &all[i], 0);
1067 break;
1068 case MG_NOVEL_BINARY:
1069 break;
1070 case MG_NOVEL_DELTA:
1071 break;
1072 case MG_NOVEL_HYBRID:
1073 break;
1074 case MG_NOVEL_HYBRID_MTF:
1075 break;
1076 default:
1077 FatalError (1, "Bad novel method");
1078 }
1079 }
1080 break;
1081 }
1082 fclose (f);
1083 return mem_reqd;
1084}
Note: See TracBrowser for help on using the repository browser.