source: gsdl/trunk/trunk/mg/src/text/mg_compression_dict.c@ 16583

Last change on this file since 16583 was 16583, checked in by davidb, 16 years ago

Undoing change commited in r16582

  • Property svn:keywords set to Author Date Id Revision
File size: 26.8 KB
Line 
1/**************************************************************************
2 *
3 * mg_compression_dict.c -- Routines for reading the compression dictionary
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: mg_compression_dict.c 16583 2008-07-29 10:20:36Z davidb $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25
26#include "memlib.h"
27#include "messages.h"
28#include "local_strings.h"
29#include "bitio_gen.h"
30#include "bitio_m.h"
31#include "bitio_m_stdio.h"
32#include "timing.h"
33#include "mgheap.h"
34#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
35
36#include "mg_files.h"
37#include "locallib.h"
38#include "invf.h"
39#include "text.h"
40#include "words.h"
41#include "mg.h"
42
43#define MAXBITS (sizeof(unsigned long) * 8)
44
45/*
46 $Log$
47 Revision 1.1 2003/02/20 21:18:24 mdewsnip
48 Addition of MG package for search and retrieval
49
50 Revision 1.2 1999/12/10 15:04:25 cs025
51 Incorrect use of NTOHUL on a double value (num_bytes).
52
53 Revision 1.1 1999/08/10 21:18:04 sjboddie
54 renamed mg-1.3d directory mg
55
56 Revision 1.4 1999/01/15 03:05:51 rjmcnab
57
58 Renamed lib/heap to be lib/mgheap (it was causing some problems with
59 some versions of the STL libraries which contained a heap.h)
60
61 Revision 1.3 1998/12/17 09:12:52 rjmcnab
62
63 Altered mg to process utf-8 encoded Unicode. The main changes
64 are in the parsing of the input, the casefolding, and the stemming.
65
66 Revision 1.2 1998/11/25 07:55:44 rjmcnab
67
68 Modified mg to that you can specify the stemmer you want
69 to use via a command line option. You specify it to
70 mg_passes during the build process. The number of the
71 stemmer that you used is stored within the inverted
72 dictionary header and the stemmed dictionary header so
73 the correct stemmer is used in later stages of building
74 and querying.
75
76 Revision 1.1 1998/11/17 09:34:52 rjmcnab
77 *** empty log message ***
78
79 * Revision 1.3 1994/10/20 03:56:54 tes
80 * I have rewritten the boolean query optimiser and abstracted out the
81 * components of the boolean query.
82 *
83 * Revision 1.2 1994/09/20 04:41:45 tes
84 * For version 1.1
85 *
86 */
87
88static char *RCSID = "$Id: mg_compression_dict.c 16583 2008-07-29 10:20:36Z davidb $";
89
90/* #define DEBUG1 */
91
92/* #define DEBUG */
93
94#define is_power_of_two(a) ((a) != 0 && (((a) & ((a)-1)) == 0))
95
96#define MAX_RECALCULATIONS 100
97
98typedef struct WordData
99 {
100 u_long freq, occur_num;
101 float saving;
102 float char_bit_cost;
103 u_long num_trans;
104 u_char *word;
105 }
106WordData;
107
108static WordData *Words[2];
109static u_long Num[2];
110static u_long chars[2];
111
112#define KIND(p) (((p) >= Words[0] && (p) < Words[0] + Num[0]) ? 0 : 1)
113#define IsWord(p) (((p) >= Words[1] && (p) < Words[1] + Num[1]) ? 1 : 0)
114#define IsNonWord(p) (((p) >= Words[0] && (p) < Words[0] + Num[0]) ? 1 : 0)
115
116
117typedef struct DictData
118 {
119 WordData **wd;
120 u_long num_wds;
121 u_long chars;
122 }
123DictData;
124
125typedef DictData DictInfo[2];
126
127static DictInfo keep, discard, all;
128static compression_stats_header csh;
129
130static char *file_name = "";
131static u_long novel_method = MG_NOVEL_HUFFMAN_CHARS;
132
133
134static void ReadInWords (char *);
135static void Select_on (int k, heap_comp hc);
136static void Method3 (int k);
137static void Select_all (void);
138static u_long WriteOutWords (char *, u_long, int);
139static int DecFreqIncWL (void *a, void *b);
140static int OccuranceOrder (void *a, void *b);
141
142
143
144int main (int argc, char **argv)
145{
146 ProgTime StartTime;
147 int ch;
148 char type = 'C';
149 char mode = '0';
150 int lookback = 2;
151 double k = 0;
152 u_long mem_reqd;
153 opterr = 0;
154 msg_prefix = argv[0];
155 while ((ch = getopt (argc, argv, "0123CPSf:d:l:hk:HBDYM")) != -1)
156 switch (ch)
157 {
158 case 'H':
159 novel_method = MG_NOVEL_HUFFMAN_CHARS;
160 break;
161 case 'B':
162 novel_method = MG_NOVEL_BINARY;
163 break;
164 case 'D':
165 novel_method = MG_NOVEL_DELTA;
166 break;
167 case 'Y':
168 novel_method = MG_NOVEL_HYBRID;
169 break;
170 case 'M':
171 novel_method = MG_NOVEL_HYBRID_MTF;
172 break;
173 case 'f': /* input file */
174 file_name = optarg;
175 break;
176 case 'd':
177 set_basepath (optarg);
178 break;
179 case 'C':
180 case 'P':
181 case 'S':
182 type = ch;
183 break;
184 case '0':
185 case '1':
186 case '2':
187 case '3':
188 mode = ch;
189 break;
190 case 'l':
191 lookback = atoi (optarg);
192 if (!is_power_of_two (lookback))
193 FatalError (1, "The lookback value must be a power of 2");
194 lookback = floorlog_2 (lookback);
195 break;
196 case 'k':
197 k = atof (optarg) * 1024;
198 break;
199 case 'h':
200 case '?':
201 fprintf (stderr, "usage: %s [-l lookback] [-f input_file]"
202 "[-d data directory] [-h] [-0|-1|-2|-3] [-C|-P|-S] [-k mem (Kb)]\n",
203 argv[0]);
204 exit (1);
205 }
206 GetTime (&StartTime);
207
208 ReadInWords (file_name);
209
210 if (type == 'C')
211 {
212 Select_all ();
213 mem_reqd = WriteOutWords (file_name, MG_COMPLETE_DICTIONARY, lookback);
214 }
215 else
216 {
217 switch (mode)
218 {
219 case '0':
220 Select_all ();
221 break;
222 case '1':
223 Message ("Dictionary limit of %.2f Kb", k / 1024);
224 Select_on ((int) k, OccuranceOrder);
225 break;
226 case '2':
227 Message ("Dictionary limit of %.2f Kb", k / 1024);
228 Select_on ((int) k, DecFreqIncWL);
229 break;
230 case '3':
231 Message ("Dictionary limit of %.2f Kb", k / 1024);
232 Method3 ((int) k);
233 break;
234 }
235 if (type == 'P')
236 {
237 mem_reqd = WriteOutWords (file_name, MG_PARTIAL_DICTIONARY, lookback);
238 }
239 else
240 {
241 mem_reqd = WriteOutWords (file_name, MG_SEED_DICTIONARY, lookback);
242 }
243 }
244
245 Message ("Num words : %8u -> %8u\n", Num[1], keep[1].num_wds);
246 Message ("Num non-words : %8u -> %8u\n", Num[0], keep[0].num_wds);
247 Message ("Chars of words : %8u -> %8u\n", chars[1], keep[1].chars);
248 Message ("Chars of non-words : %8u -> %8u\n", chars[0], keep[0].chars);
249 Message ("Mem usage : %8u -> %8u\n",
250 (Num[0] + Num[1]) * sizeof (char *) + chars[0] + chars[1],
251 (keep[0].num_wds + keep[1].num_wds) * sizeof (char *) +
252 keep[0].chars + keep[1].chars);
253 Message ("Actual mem required : %8u\n", mem_reqd);
254
255 Message ("%s", ElapsedTime (&StartTime, NULL));
256 return 0;
257}
258
259
260
261
262static void
263ReadInWords (char *filename)
264{
265 FILE *f;
266 unsigned long i;
267 f = open_file (filename, TEXT_STATS_DICT_SUFFIX, "rb",
268 MAGIC_STATS_DICT, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
269
270 fread (&csh, sizeof (csh), 1, f);
271
272 /* [RPAP - Jan 97: Endian Ordering] */
273 NTOHUL(csh.num_docs);
274 NTOHD(csh.num_bytes);
275
276 for (i = 0; i < 2; i++)
277 {
278 frags_stats_header fsh;
279 char *buf;
280 WordData *wd;
281 int j;
282 chars[i] = 0;
283
284 fread (&fsh, sizeof (fsh), 1, f);
285 /* [RPAP - Jan 97: Endian Ordering] */
286 NTOHUL(fsh.num_frags);
287 NTOHUL(fsh.mem_for_frags);
288
289 Num[i] = all[i].num_wds = fsh.num_frags;
290
291 /* The +1 on the following line is to leave room for the esc code. */
292 all[i].wd = Xmalloc (sizeof (WordData *) * Num[i] + 1);
293
294 buf = Xmalloc (fsh.mem_for_frags);
295 wd = Words[i] = Xmalloc (sizeof (WordData) * Num[i]);
296 for (j = 0; j < Num[i]; j++, wd++)
297 {
298 int len;
299 fread (&wd->freq, sizeof (wd->freq), 1, f);
300 NTOHUL(wd->freq); /* [RPAP - Jan 97: Endian Ordering] */
301 fread (&wd->occur_num, sizeof (wd->occur_num), 1, f);
302 NTOHUL(wd->occur_num); /* [RPAP - Jan 97: Endian Ordering] */
303 len = fgetc (f);
304 wd->word = (u_char *) buf;
305 *buf++ = len;
306 fread (buf, len, 1, f);
307 buf += len;
308 all[i].wd[j] = wd;
309 }
310 chars[i] = fsh.mem_for_frags - fsh.num_frags;
311 }
312 fclose (f);
313}
314
315
316static void
317Alloc_keep_discard (void)
318{
319 keep[0].num_wds = 0;
320 keep[0].wd = Xmalloc ((Num[0] + 1) * sizeof (WordData *));
321 keep[1].num_wds = 0;
322 keep[1].wd = Xmalloc ((Num[1] + 1) * sizeof (WordData *));
323 discard[0].num_wds = 0;
324 discard[0].wd = Xmalloc ((Num[0] + 1) * sizeof (WordData *));
325 discard[1].num_wds = 0;
326 discard[1].wd = Xmalloc ((Num[1] + 1) * sizeof (WordData *));
327}
328
329
330static int
331sort_comp (const void *a, const void *b)
332{
333 WordData *aa = *(WordData **) a;
334 WordData *bb = *(WordData **) b;
335 return casecompare (aa->word, bb->word); /* [RPAP - Jan 97: Stem Index Change] */
336}
337
338
339
340static void
341SortAndCount_DictData (DictData * dd)
342{
343 int i;
344 WordData **wd;
345 qsort (dd->wd, dd->num_wds, sizeof (WordData *), sort_comp);
346 dd->chars = 0;
347 wd = dd->wd;
348 for (i = 0; i < dd->num_wds; i++, wd++)
349 dd->chars += (*wd)->word[0];
350}
351
352
353static void
354Select_all (void)
355{
356 int i;
357 Alloc_keep_discard ();
358 keep[0].num_wds = Num[0];
359 for (i = 0; i < Num[0]; i++)
360 keep[0].wd[i] = Words[0] + i;
361 keep[1].num_wds = Num[1];
362 for (i = 0; i < Num[1]; i++)
363 keep[1].wd[i] = Words[1] + i;
364 SortAndCount_DictData (&keep[0]);
365 SortAndCount_DictData (&keep[1]);
366}
367
368
369
370
371static int
372DecFreqIncWL (void *a, void *b)
373{
374 WordData *aa = *(WordData **) a;
375 WordData *bb = *(WordData **) b;
376 if (aa->freq > bb->freq)
377 return -1;
378 else if (aa->freq < bb->freq)
379 return 1;
380 else
381 return bb->word[0] - aa->word[0];
382}
383
384
385static int
386OccuranceOrder (void *a, void *b)
387{
388 WordData *aa = *(WordData **) a;
389 WordData *bb = *(WordData **) b;
390 if (aa->occur_num > bb->occur_num)
391 return 1;
392 else if (aa->occur_num < bb->occur_num)
393 return -1;
394 else
395 return 0;
396}
397
398
399static void
400Select_on (int k, heap_comp hc)
401{
402 int i, num, mem;
403 WordData **wd;
404
405 Alloc_keep_discard ();
406
407 num = Num[0] + Num[1];
408 wd = Xmalloc (num * sizeof (WordData *));
409 for (i = 0; i < Num[0]; i++)
410 wd[i] = Words[0] + i;
411 for (i = 0; i < Num[1]; i++)
412 wd[i + Num[0]] = Words[1] + i;
413
414 heap_build (wd, sizeof (*wd), num, hc);
415
416 mem = 0;
417 while (k > mem && num)
418 {
419 int idx;
420 WordData *word = wd[0];
421#ifdef DEBUG
422 fprintf (stderr, "%4d:%4d:%8d :%8d :%8d : \"%s\"\n",
423 keep[0].num_wds, keep[1].num_wds,
424 mem, word->freq, word->occur_num, word2str (word->word));
425#endif
426 mem += sizeof (u_char *) + word->word[0];
427 heap_deletehead (wd, sizeof (*wd), &num, hc);
428 idx = KIND (word);
429 keep[idx].wd[keep[idx].num_wds++] = word;
430 }
431
432 for (i = 0; i < num; i++)
433 {
434 WordData *word = wd[i];
435 int idx = KIND (word);
436 discard[idx].wd[discard[idx].num_wds++] = word;
437 }
438 SortAndCount_DictData (&keep[0]);
439 SortAndCount_DictData (&keep[1]);
440 SortAndCount_DictData (&discard[0]);
441 SortAndCount_DictData (&discard[1]);
442
443 assert (keep[0].num_wds + discard[0].num_wds == Num[0]);
444 assert (keep[1].num_wds + discard[1].num_wds == Num[1]);
445
446}
447
448
449
450static int
451BigSaving (void *a, void *b)
452{
453 WordData *aa = *(WordData **) a;
454 WordData *bb = *(WordData **) b;
455 return (aa->saving > bb->saving) ? -1 : (aa->saving < bb->saving);
456}
457
458static int
459SmallSaving (void *a, void *b)
460{
461 WordData *aa = *(WordData **) a;
462 WordData *bb = *(WordData **) b;
463 return (aa->saving < bb->saving) ? -1 : (aa->saving > bb->saving);
464}
465
466
467static void
468CalcCharCounts (WordData ** wd, int num,
469 char *char_lens[2], char *len_lens[2],
470 u_long escape[2])
471{
472 long char_freqs[2][256];
473 long len_freqs[2][16];
474 huff_data hd;
475 int i;
476 escape[0] = 0;
477 escape[1] = 0;
478 bzero ((char *) char_freqs, sizeof (char_freqs));
479 bzero ((char *) len_freqs, sizeof (len_freqs));
480 for (i = 0; i < num; i++, wd++)
481 {
482 u_long freq = (*wd)->freq;
483 u_char *buf = (*wd)->word;
484 int len = *buf++;
485 int idx = KIND (*wd);
486 len_freqs[idx][len] += freq;
487 escape[idx] += freq;
488 for (; len; len--, buf++)
489 char_freqs[idx][(u_long) (*buf)] += freq;
490 }
491 Generate_Huffman_Data (256, char_freqs[0], &hd, NULL);
492 char_lens[0] = hd.clens;
493 Generate_Huffman_Data (256, char_freqs[1], &hd, NULL);
494 char_lens[1] = hd.clens;
495
496 Generate_Huffman_Data (16, len_freqs[0], &hd, NULL);
497 len_lens[0] = hd.clens;
498 Generate_Huffman_Data (16, len_freqs[1], &hd, NULL);
499 len_lens[1] = hd.clens;
500}
501
502
503
504
505
506
507
508
509
510void
511CalcBitCost (WordData ** discard_word, int discard_num,
512 WordData ** keep_word, int keep_num, double freqs_trans[2],
513 u_long escape[2], int num_trans)
514{
515 char *char_lens[2];
516 char *len_lens[2];
517 double esc[2];
518 int j;
519 CalcCharCounts (discard_word, discard_num, char_lens, len_lens, escape);
520 esc[0] = -log2 (escape[0] / (freqs_trans[0] + escape[0]));
521 esc[1] = -log2 (escape[1] / (freqs_trans[1] + escape[1]));
522 for (j = 0; j < discard_num; j++, discard_word++)
523 {
524 float cbc, wbc;
525 u_char *buf = (*discard_word)->word;
526 int len = *buf++;
527 u_long freq = (*discard_word)->freq;
528 int idx = KIND (*discard_word);
529
530 cbc = len_lens[idx][len];
531 for (; len; len--, buf++)
532 cbc += char_lens[idx][(u_long) (*buf)];
533
534 (*discard_word)->char_bit_cost = (cbc + esc[idx]) * freq;
535
536 wbc = -log2 (freq / (freqs_trans[idx] + escape[idx])) * freq;
537
538 (*discard_word)->saving = ((*discard_word)->char_bit_cost - wbc) /
539 (sizeof (char *) + (*discard_word)->word[0]);
540
541 (*discard_word)->num_trans = num_trans;
542 }
543 for (j = 0; j < keep_num; j++, keep_word++)
544 {
545 float cbc, wbc;
546 u_char *buf = (*keep_word)->word;
547 int len = *buf++;
548 u_long freq = (*keep_word)->freq;
549 int idx = KIND (*keep_word);
550
551 cbc = len_lens[idx][len];
552 for (; len; len--, buf++)
553 cbc += char_lens[idx][(u_long) (*buf)];
554
555 (*keep_word)->char_bit_cost = (cbc + esc[idx]) * freq;
556
557 wbc = -log2 (freq / (freqs_trans[idx] + escape[idx])) * freq;
558
559 (*keep_word)->saving = ((*keep_word)->char_bit_cost - wbc) /
560 (sizeof (char *) + (*keep_word)->word[0]);
561
562 (*keep_word)->num_trans = num_trans;
563 }
564 Xfree (char_lens[0]);
565 Xfree (char_lens[1]);
566 Xfree (len_lens[0]);
567 Xfree (len_lens[1]);
568}
569
570
571
572static void
573Method3 (int k)
574{
575 int i, keep_num, discard_num, mem, num_trans, recalc_reqd;
576 int keep_to_discard = 0;
577 int discard_to_keep = 0;
578 int recalcs = 0;
579 double freqs_trans[2], total;
580 u_long escape[2];
581 WordData **keep_heap, **discard_heap;
582
583 freqs_trans[0] = freqs_trans[1] = 0;
584 num_trans = 0;
585
586 discard_num = Num[0] + Num[1];
587 discard_heap = Xmalloc (discard_num * sizeof (WordData *));
588
589 keep_num = 0;
590 keep_heap = Xmalloc (discard_num * sizeof (WordData *));
591
592
593 for (i = 0; i < Num[0]; i++)
594 discard_heap[i] = Words[0] + i;
595 for (i = 0; i < Num[1]; i++)
596 discard_heap[i + Num[0]] = Words[1] + i;
597
598 heap_build (discard_heap, sizeof (*discard_heap), discard_num, DecFreqIncWL);
599
600 mem = 0;
601 while (k > mem && discard_num)
602 {
603 WordData *word = discard_heap[0];
604 mem += sizeof (u_char *) + word->word[0];
605 heap_deletehead (discard_heap, sizeof (word), &discard_num, DecFreqIncWL);
606 keep_heap[keep_num++] = word;
607 freqs_trans[KIND (word)] += word->freq;
608 num_trans++;
609 }
610
611 CalcBitCost (discard_heap, discard_num, keep_heap, keep_num,
612 freqs_trans, escape, num_trans);
613 heap_build (discard_heap, sizeof (*discard_heap), discard_num, BigSaving);
614 heap_build (keep_heap, sizeof (*keep_heap), keep_num, SmallSaving);
615
616 total = 0;
617 recalc_reqd = 0;
618 while (keep_num && discard_num)
619 {
620 if ((keep_num && keep_heap[0]->num_trans < num_trans) ||
621 (discard_num && discard_heap[0]->num_trans < num_trans))
622 {
623 if (keep_num && keep_heap[0]->num_trans < num_trans)
624 {
625 WordData *word = keep_heap[0];
626 int idx = KIND (word);
627 float wbc;
628#ifdef DEBUG1
629 fprintf (stderr, "KEEP \"%s\" %.2f ->", word2str (word->word),
630 word->saving);
631#endif
632 wbc = -log2 (word->freq / (freqs_trans[idx] + escape[idx])) *
633 word->freq;
634 word->saving = (word->char_bit_cost - wbc) /
635 (sizeof (char *) + word->word[0]);
636#ifdef DEBUG1
637 fprintf (stderr, " %.2f\n", word->saving);
638#endif
639 word->num_trans = num_trans;
640 heap_changedhead (keep_heap, sizeof (word), keep_num, SmallSaving);
641 }
642
643 if (discard_num && discard_heap[0]->num_trans < num_trans)
644 {
645 WordData *word = discard_heap[0];
646 int idx = KIND (word);
647 float wbc;
648#ifdef DEBUG1
649 fprintf (stderr, "DISCARD \"%s\" %.2f ->", word2str (word->word),
650 word->saving);
651#endif
652 wbc = -log2 (word->freq / (freqs_trans[idx] + escape[idx])) *
653 word->freq;
654 word->saving = (word->char_bit_cost - wbc) /
655 (sizeof (char *) + word->word[0]);
656#ifdef DEBUG1
657 fprintf (stderr, " %.2f\n", word->saving);
658#endif
659 word->num_trans = num_trans;
660 heap_changedhead (discard_heap, sizeof (word),
661 discard_num, BigSaving);
662 }
663 }
664 else if (keep_heap[0]->saving < discard_heap[0]->saving)
665 {
666 assert (keep_num && discard_num);
667 if (keep_num && mem + sizeof (char *) + discard_heap[0]->word[0] > k)
668 {
669 /* Transfer the word at the top of the keep heap to the
670 discard heap. */
671 WordData *word = keep_heap[0];
672 int idx = KIND (word);
673 heap_deletehead (keep_heap, sizeof (word), &keep_num, SmallSaving);
674 discard_heap[discard_num] = word;
675 heap_additem (discard_heap, sizeof (word), &discard_num,
676 BigSaving);
677 freqs_trans[idx] -= word->freq;
678 mem -= sizeof (u_char *) + word->word[0];
679 num_trans++;
680 total += word->saving;
681 keep_to_discard++;
682#ifdef DEBUG
683 fprintf (stderr,
684 "KEEP -> DISCARD %8d :%8d :%8.0f :%8.0f : \"%s\"\n",
685 mem, word->freq, word->char_bit_cost,
686 word->saving, word2str (word->word));
687#endif
688 }
689 else
690 {
691 /* Transfer the word at the top of the discard heap to the
692 keep heap. */
693 WordData *word = discard_heap[0];
694 int idx = KIND (word);
695 heap_deletehead (discard_heap, sizeof (word),
696 &discard_num, BigSaving);
697 keep_heap[keep_num] = word;
698 heap_additem (keep_heap, sizeof (word), &keep_num, SmallSaving);
699 freqs_trans[idx] += word->freq;
700 mem += sizeof (u_char *) + word->word[0];
701 num_trans++;
702 total += word->saving;
703 discard_to_keep++;
704#ifdef DEBUG
705 fprintf (stderr,
706 "DISCARD -> KEEP %8d :%8d :%8.0f :%8.0f : \"%s\"\n",
707 mem, word->freq, word->char_bit_cost,
708 word->saving, word2str (word->word));
709#endif
710 }
711
712 recalc_reqd = 1;
713
714 }
715 else
716 {
717 if (recalc_reqd == 0)
718 break;
719#ifdef DEBUG1
720 fprintf (stderr, "--------------\n");
721#endif
722 if (recalcs == MAX_RECALCULATIONS)
723 break;
724 CalcBitCost (discard_heap, discard_num, keep_heap, keep_num,
725 freqs_trans, escape, num_trans);
726 heap_build (discard_heap, sizeof (*discard_heap),
727 discard_num, BigSaving);
728 heap_build (keep_heap, sizeof (keep_heap), keep_num, SmallSaving);
729 recalc_reqd = 0;
730 recalcs++;
731 }
732 }
733
734 Alloc_keep_discard ();
735
736 for (i = 0; i < discard_num; i++)
737 {
738 WordData *word = discard_heap[i];
739 int idx = KIND (word);
740 assert (IsWord (word) || IsNonWord (word));
741 discard[idx].wd[discard[idx].num_wds++] = word;
742 }
743 for (i = 0; i < keep_num; i++)
744 {
745 WordData *word = keep_heap[i];
746 int idx = KIND (word);
747 assert (IsWord (word) || IsNonWord (word));
748 keep[idx].wd[keep[idx].num_wds++] = word;
749 }
750 SortAndCount_DictData (&keep[0]);
751 SortAndCount_DictData (&keep[1]);
752 SortAndCount_DictData (&discard[0]);
753 SortAndCount_DictData (&discard[1]);
754
755 assert (keep[0].num_wds + discard[0].num_wds == Num[0]);
756 assert (keep[1].num_wds + discard[1].num_wds == Num[1]);
757 Message ("Keep -> Discard : %8d", keep_to_discard);
758 Message ("Discard -> Keep : %8d", discard_to_keep);
759 Message ("Huffman Recalculations : %8d", recalcs);
760 if (recalcs == MAX_RECALCULATIONS)
761 Message ("WARNING: The number of recalculations == %d", MAX_RECALCULATIONS);
762}
763
764
765
766
767
768
769
770
771
772
773/****************************************************************************
774 ***** *****
775 ***** Dictionary saving code *****
776 ***** *****
777 ****************************************************************************/
778
779
780
781static void
782Write_cdh (FILE * f, compression_dict_header * cdh)
783{
784 /* [RPAP - Jan 97: Endian Ordering] */
785 int i;
786 compression_dict_header tmp = *cdh;
787 HTONUL(tmp.dict_type);
788 HTONUL(tmp.novel_method);
789 for (i = 0; i < TEXT_PARAMS; i++)
790 HTONUL(tmp.params[i]);
791 HTONUL(tmp.num_words[0]);
792 HTONUL(tmp.num_words[1]);
793 HTONUL(tmp.num_word_chars[0]);
794 HTONUL(tmp.num_word_chars[1]);
795 HTONUL(tmp.lookback);
796
797 fwrite (&tmp, sizeof (tmp), 1, f);
798}
799
800
801static void
802Write_words (FILE * f, DictData * dd)
803{
804 int i;
805 u_char *curr, *prev = NULL;
806 for (i = 0; i < dd->num_wds; i++)
807 {
808 int len;
809 curr = dd->wd[i]->word;
810 if (prev)
811 /* look for prefix match with prev string */
812 len = prefixlen (prev, curr);
813 else
814 len = 0;
815 fputc ((len << 4) + (curr[0] - len), f);
816 fwrite (curr + len + 1, sizeof (u_char), curr[0] - len, f);
817 prev = curr;
818 }
819
820}
821
822
823static int
824Uncompressed_size (DictData * dd)
825{
826 int i, us;
827 for (us = i = 0; i < dd->num_wds; i++)
828 us += dd->wd[i]->word[0];
829 return us;
830}
831
832
833static u_long
834Write_data (FILE * f, DictData * dd, int lookback)
835{
836 u_long mem_reqd;
837 huff_data *hd;
838 int i;
839 u_long us = dd->num_wds;
840 long *freqs;
841 u_long huff_words_size[MAX_HUFFCODE_LEN + 1];
842 u_long lencounts[MAX_HUFFCODE_LEN + 1];
843 u_char *lastword[MAX_HUFFCODE_LEN + 1];
844
845 if (!(freqs = Xmalloc ((dd->num_wds) * sizeof (*freqs))))
846 FatalError (1, "Unable to allocate memory for freqs");
847
848 for (i = 0; i < dd->num_wds; i++)
849 {
850 freqs[i] = dd->wd[i]->freq;
851 us += dd->wd[i]->word[0];
852 }
853
854 if (!(hd = Generate_Huffman_Data (dd->num_wds, freqs, NULL, NULL)))
855 FatalError (1, "Unable to allocate memory for huffman data");
856
857 Xfree (freqs), freqs = NULL;
858
859 if (Write_Huffman_Data (f, hd) == -1)
860 FatalError (1, "Unable to write huffman data");
861
862 HTONUL(us); /* [RPAP - Jan 97: Endian Ordering] */
863 fwrite (&us, sizeof (us), 1, f);
864 NTOHUL(us); /* [RPAP - Jan 97: Endian Ordering] */
865
866
867/* Calculate the amount of memory that will be required to store the text for
868 each different huffman code len. Every 1<<lookback words for each different
869 codelen length will not be prefixed by previous strings. */
870
871
872 bzero ((char *) &huff_words_size, sizeof (huff_words_size));
873 bzero ((char *) &lencounts, sizeof (lencounts));
874
875 mem_reqd = 0;
876
877 for (i = 0; i < dd->num_wds; i++)
878 {
879 int codelen = hd->clens[i];
880 u_char *word = dd->wd[i]->word;
881
882 if (!codelen)
883 FatalError (1, "The length of a code for a word was zero");
884
885 huff_words_size[codelen] += word[0] + 1;
886 mem_reqd += word[0] + (lookback != 0);
887#if 0
888 if ((lencounts[codelen] & ((1 << lookback) - 1)) == 0)
889 lastword[codelen] = word;
890 else
891 huff_words_size[codelen] -= prefixlen (lastword[codelen], word);
892#else
893 if ((lencounts[codelen] & ((1 << lookback) - 1)) != 0)
894 {
895 int save = prefixlen (lastword[codelen], word);
896 mem_reqd -= save;
897 huff_words_size[codelen] -= save;
898 }
899 else
900 {
901 mem_reqd += sizeof (u_char *);
902 }
903 lastword[codelen] = word;
904#endif
905 lencounts[codelen]++;
906 }
907
908 /* [RPAP - Jan 97: Endian Ordering] */
909 for (i = hd->mincodelen; i < hd->maxcodelen + 1; i++)
910 HTONUL(huff_words_size[i]);
911
912 fwrite (huff_words_size + hd->mincodelen, sizeof (*huff_words_size),
913 hd->maxcodelen - hd->mincodelen + 1, f);
914
915 /* [RPAP - Jan 97: Endian Ordering] */
916 for (i = hd->mincodelen; i < hd->maxcodelen + 1; i++)
917 NTOHUL(huff_words_size[i]);
918
919 Write_words (f, dd);
920
921 Xfree (hd->clens);
922 Xfree (hd);
923
924 return mem_reqd;
925}
926
927
928
929static void
930Write_charfreqs (FILE * f, DictData * dd, int words,
931 int zero_freq_permitted)
932{
933 int j;
934 long freqs[256];
935 WordData **wd = dd->wd;
936 huff_data *hd;
937
938 bzero ((char *) freqs, sizeof (freqs));
939
940 for (j = 0; j < dd->num_wds; j++, wd++)
941 {
942 u_char *buf = (*wd)->word;
943 int len = *buf++;
944 for (; len; len--, buf++)
945 freqs[(u_long) (*buf)] += (*wd)->freq;
946 }
947
948 if (!zero_freq_permitted)
949 for (j = 0; j < 256; j++)
950 if (!freqs[j] && PESINAWORD (j) == words)
951 freqs[j] = 1;
952
953 if (!(hd = Generate_Huffman_Data (256, freqs, NULL, NULL)))
954 FatalError (1, "Unable to allocate memory for huffman data");
955
956 if (Write_Huffman_Data (f, hd) == -1)
957 FatalError (1, "Unable to write huffman data");
958
959 Xfree (hd->clens);
960 Xfree (hd);
961}
962
963
964
965
966static void
967Write_wordlens (FILE * f, DictData * dd, int zero_freq_permitted)
968{
969 int j;
970 long freqs[16];
971 WordData **wd = dd->wd;
972 huff_data *hd;
973
974 bzero ((char *) freqs, sizeof (freqs));
975
976 for (j = 0; j < dd->num_wds; j++, wd++)
977 freqs[(*wd)->word[0]] += (*wd)->freq;
978
979 if (!zero_freq_permitted)
980 for (j = 0; j < 16; j++)
981 if (!freqs[j])
982 freqs[j] = 1;
983
984 if (!(hd = Generate_Huffman_Data (16, freqs, NULL, NULL)))
985 FatalError (1, "Unable to allocate memory for huffman data");
986
987 if (Write_Huffman_Data (f, hd) == -1)
988 FatalError (1, "Unable to write huffman data");
989
990
991 Xfree (hd->clens);
992 Xfree (hd);
993}
994
995
996
997static u_long
998WriteOutWords (char *file_name, u_long type, int lookback)
999{
1000 FILE *f;
1001 int i;
1002 u_long mem_reqd = 0;
1003
1004 compression_dict_header cdh;
1005 f = create_file (file_name, TEXT_DICT_SUFFIX, "w+b",
1006 MAGIC_DICT, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
1007
1008 bzero((char *) &cdh, sizeof(cdh));
1009
1010 cdh.dict_type = type;
1011 cdh.novel_method = (type == MG_SEED_DICTIONARY) ? novel_method :
1012 MG_NOVEL_HUFFMAN_CHARS;
1013
1014 cdh.num_words[1] = keep[1].num_wds;
1015 cdh.num_words[0] = keep[0].num_wds;
1016 cdh.num_word_chars[1] = Uncompressed_size (&keep[1]);
1017 cdh.num_word_chars[0] = Uncompressed_size (&keep[0]);
1018 cdh.lookback = lookback;
1019
1020 Write_cdh (f, &cdh);
1021
1022 for (i = 0; i < 2; i++)
1023 switch (type)
1024 {
1025 case MG_COMPLETE_DICTIONARY:
1026 {
1027 mem_reqd += Write_data (f, &keep[i], lookback);
1028 }
1029 break;
1030 case MG_PARTIAL_DICTIONARY:
1031 {
1032 if (keep[i].num_wds)
1033 {
1034 int j;
1035 WordData esc;
1036 esc.freq = 0;
1037 esc.word = (u_char *) "";
1038 keep[i].wd[keep[i].num_wds++] = &esc;
1039 for (j = 0; j < discard[i].num_wds; j++)
1040 esc.freq += discard[i].wd[j]->freq;
1041 if (!esc.freq)
1042 esc.freq++;
1043 mem_reqd += Write_data (f, &keep[i], lookback);
1044 }
1045 Write_charfreqs (f, &discard[i], i, 1);
1046 Write_wordlens (f, &discard[i], 1);
1047 }
1048 break;
1049 case MG_SEED_DICTIONARY:
1050 {
1051 if (keep[i].num_wds)
1052 {
1053 int j;
1054 WordData esc;
1055 esc.freq = 0;
1056 esc.word = (u_char *) "";
1057 keep[i].wd[keep[i].num_wds++] = &esc;
1058 for (j = 0; j < all[i].num_wds; j++)
1059 if (all[i].wd[j]->freq == 1)
1060 esc.freq++;
1061 if (!esc.freq)
1062 esc.freq++;
1063 mem_reqd += Write_data (f, &keep[i], lookback);
1064 }
1065 switch (novel_method)
1066 {
1067 case MG_NOVEL_HUFFMAN_CHARS:
1068 Write_charfreqs (f, &all[i], i, 0);
1069 Write_wordlens (f, &all[i], 0);
1070 break;
1071 case MG_NOVEL_BINARY:
1072 break;
1073 case MG_NOVEL_DELTA:
1074 break;
1075 case MG_NOVEL_HYBRID:
1076 break;
1077 case MG_NOVEL_HYBRID_MTF:
1078 break;
1079 default:
1080 FatalError (1, "Bad novel method");
1081 }
1082 }
1083 break;
1084 }
1085 fclose (f);
1086 return mem_reqd;
1087}
Note: See TracBrowser for help on using the repository browser.