source: trunk/gsdl/src/mgpp/text/mg_compression_dict.cpp@ 2442

Last change on this file since 2442 was 2442, checked in by jrm21, 23 years ago

portability changes, use getopt from unistd.h (all POSIX systems)

  • Property svn:keywords set to Author Date Id Revision
File size: 26.9 KB
Line 
1/**************************************************************************
2 *
3 * mg_compression_dict.cpp -- Routines for creating compression dictionary
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: mg_compression_dict.cpp 2442 2001-05-17 04:38:16Z jrm21 $
21 *
22 **************************************************************************/
23#define _XOPEN_SOURCE 1
24#define _XOPEN_SOURCE_EXTENDED 1
25#include <unistd.h>
26
27#include "sysfuncs.h"
28
29#include "memlib.h"
30#include "messages.h"
31#include "local_strings.h"
32#include "bitio_gen.h"
33#include "bitio_m_stdio.h"
34#include "mgheap.h"
35#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
36
37#include "mg_files.h"
38#include "locallib.h"
39#include "invf.h"
40#include "text.h"
41#include "words.h"
42#include "mg.h"
43#include "WordData.h"
44
45#define MAXBITS (sizeof(unsigned long) * 8)
46
47/*
48 $Log$
49 Revision 1.3 2001/05/17 04:38:15 jrm21
50 portability changes, use getopt from unistd.h (all POSIX systems)
51
52 Revision 1.2 2000/01/18 03:53:22 rjmcnab
53 Fixed a couple of bugs and made building silent if needed.
54
55 Revision 1.1 2000/01/14 02:26:11 sjboddie
56 Rodgers new C++ mg
57
58 Revision 1.3 1999/10/17 23:43:25 cs025
59 Changes to eradicate Xmalloc
60
61 Revision 1.2 1999/10/11 22:03:03 cs025
62 Changed mistaken use of NTOHUL on a double to NTOHD as per report
63 from Craig Neville-Manning of problems installing mg/gsdl.
64
65 Revision 1.1 1999/10/11 02:57:46 cs025
66 Base install of MG-PP
67
68 Revision 1.1 1999/08/10 21:18:04 sjboddie
69 renamed mg-1.3d directory mg
70
71 Revision 1.4 1999/01/15 03:05:51 rjmcnab
72
73 Renamed lib/heap to be lib/mgheap (it was causing some problems with
74 some versions of the STL libraries which contained a heap.h)
75
76 Revision 1.3 1998/12/17 09:12:52 rjmcnab
77
78 Altered mg to process utf-8 encoded Unicode. The main changes
79 are in the parsing of the input, the casefolding, and the stemming.
80
81 Revision 1.2 1998/11/25 07:55:44 rjmcnab
82
83 Modified mg to that you can specify the stemmer you want
84 to use via a command line option. You specify it to
85 mg_passes during the build process. The number of the
86 stemmer that you used is stored within the inverted
87 dictionary header and the stemmed dictionary header so
88 the correct stemmer is used in later stages of building
89 and querying.
90
91 Revision 1.1 1998/11/17 09:34:52 rjmcnab
92 *** empty log message ***
93
94 * Revision 1.3 1994/10/20 03:56:54 tes
95 * I have rewritten the boolean query optimiser and abstracted out the
96 * components of the boolean query.
97 *
98 * Revision 1.2 1994/09/20 04:41:45 tes
99 * For version 1.1
100 *
101 */
102
103/* #define DEBUG1 */
104
105/* #define DEBUG */
106
107#define is_power_of_two(a) ((a) != 0 && (((a) & ((a)-1)) == 0))
108
109#define MAX_RECALCULATIONS 100
110
111typedef struct DictWordData : public WordData
112 {
113 float saving;
114 float char_bit_cost;
115 u_long num_trans;
116 u_char *word;
117 }
118DictWordData;
119
120static DictWordData *Words[2];
121static u_long Num[2];
122static u_long chars[2];
123
124#define KIND(p) (((p) >= Words[0] && (p) < Words[0] + Num[0]) ? 0 : 1)
125#define IsWord(p) (((p) >= Words[1] && (p) < Words[1] + Num[1]) ? 1 : 0)
126#define IsNonWord(p) (((p) >= Words[0] && (p) < Words[0] + Num[0]) ? 1 : 0)
127
128
129typedef struct DictData
130 {
131 DictWordData **wd;
132 u_long num_wds;
133 u_long chars;
134 }
135DictData;
136
137typedef DictData DictInfo[2];
138
139static DictInfo keep, discard, all;
140static compression_stats_header csh;
141
142static char *file_name = "";
143static u_long novel_method = MG_NOVEL_HUFFMAN_CHARS;
144
145
146static void ReadInWords (char *);
147static void Select_on (int k, heap_comp hc);
148static void Method3 (int k);
149static void Select_all (void);
150static u_long WriteOutWords (char *, u_long, int);
151static int DecFreqIncWL (void *a, void *b);
152static int OccuranceOrder (void *a, void *b);
153
154
155
156int main (int argc, char **argv)
157{
158 int ch;
159 char type = 'C';
160 char mode = '0';
161 int lookback = 2;
162 double k = 0;
163 u_long mem_reqd;
164 opterr = 0;
165 msg_prefix = argv[0];
166 while ((ch = getopt (argc, argv, "0123CPSf:d:l:hk:HDY")) != -1)
167 switch (ch)
168 {
169 case 'H':
170 novel_method = MG_NOVEL_HUFFMAN_CHARS;
171 break;
172 case 'D':
173 novel_method = MG_NOVEL_DELTA;
174 break;
175 case 'Y':
176 novel_method = MG_NOVEL_HYBRID;
177 break;
178 case 'f': /* input file */
179 file_name = optarg;
180 break;
181 case 'd':
182 set_basepath (optarg);
183 break;
184 case 'C':
185 case 'P':
186 case 'S':
187 type = ch;
188 break;
189 case '0':
190 case '1':
191 case '2':
192 case '3':
193 mode = ch;
194 break;
195 case 'l':
196 lookback = atoi (optarg);
197 if (!is_power_of_two (lookback))
198 FatalError (1, "The lookback value must be a power of 2");
199 lookback = floorlog_2 (lookback);
200 break;
201 case 'k':
202 k = atof (optarg) * 1024;
203 break;
204 case 'h':
205 case '?':
206 fprintf (stderr, "usage: %s [-l lookback] [-f input_file]"
207 "[-d data directory] [-h] [-0|-1|-2|-3] [-C|-P|-S] [-k mem (Kb)]\n",
208 argv[0]);
209 exit (1);
210 }
211
212 ReadInWords (file_name);
213
214 if (type == 'C')
215 {
216 Select_all ();
217 mem_reqd = WriteOutWords (file_name, MG_COMPLETE_DICTIONARY, lookback);
218 }
219 else
220 {
221 switch (mode)
222 {
223 case '0':
224 Select_all ();
225 break;
226 case '1':
227#ifndef SILENT
228 Message ("Dictionary limit of %.2f Kb", k / 1024);
229#endif
230 Select_on ((int) k, OccuranceOrder);
231 break;
232 case '2':
233#ifndef SILENT
234 Message ("Dictionary limit of %.2f Kb", k / 1024);
235#endif
236 Select_on ((int) k, DecFreqIncWL);
237 break;
238 case '3':
239#ifndef SILENT
240 Message ("Dictionary limit of %.2f Kb", k / 1024);
241#endif
242 Method3 ((int) k);
243 break;
244 }
245 if (type == 'P')
246 {
247 mem_reqd = WriteOutWords (file_name, MG_PARTIAL_DICTIONARY, lookback);
248 }
249 else
250 {
251 mem_reqd = WriteOutWords (file_name, MG_SEED_DICTIONARY, lookback);
252 }
253 }
254
255#ifndef SILENT
256 Message ("Num words : %8u -> %8u\n", Num[1], keep[1].num_wds);
257 Message ("Num non-words : %8u -> %8u\n", Num[0], keep[0].num_wds);
258 Message ("Chars of words : %8u -> %8u\n", chars[1], keep[1].chars);
259 Message ("Chars of non-words : %8u -> %8u\n", chars[0], keep[0].chars);
260 Message ("Mem usage : %8u -> %8u\n",
261 (Num[0] + Num[1]) * sizeof (char *) + chars[0] + chars[1],
262 (keep[0].num_wds + keep[1].num_wds) * sizeof (char *) +
263 keep[0].chars + keep[1].chars);
264 Message ("Actual mem required : %8u\n", mem_reqd);
265#endif
266 exit (0);
267}
268
269
270
271
272static void
273ReadInWords (char *filename)
274{
275 FILE *f;
276 unsigned long i;
277 f = open_file (filename, TEXT_STATS_DICT_SUFFIX, "rb",
278 MAGIC_STATS_DICT, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
279
280 fread (&csh, sizeof (csh), 1, f);
281
282 /* [RPAP - Jan 97: Endian Ordering] */
283 NTOHUL(csh.num_docs);
284 NTOHD(csh.num_bytes);
285
286 for (i = 0; i < 2; i++)
287 {
288 frags_stats_header fsh;
289 char *buf;
290 DictWordData *wd;
291 chars[i] = 0;
292
293 fread (&fsh, sizeof (fsh), 1, f);
294 /* [RPAP - Jan 97: Endian Ordering] */
295 NTOHUL(fsh.num_frags);
296 NTOHUL(fsh.mem_for_frags);
297
298 Num[i] = all[i].num_wds = fsh.num_frags;
299
300 /* The +1 on the following line is to leave room for the esc code. */
301 all[i].wd = (DictWordData **) Xmalloc (sizeof (DictWordData *) * Num[i] + 1);
302
303 buf = (char *) Xmalloc (fsh.mem_for_frags);
304 wd = Words[i] = (DictWordData *) Xmalloc (sizeof (DictWordData) * Num[i]);
305 unsigned int j;
306 for (j = 0; j < Num[i]; j++, wd++)
307 {
308 int len;
309
310 // read docCount and wordCount
311 wd->read(f);
312
313 // read an mgString
314 len = fgetc (f);
315 wd->word = (u_char *) buf;
316 *buf++ = len;
317 fread (buf, len, 1, f);
318 buf += len;
319 all[i].wd[j] = wd;
320 }
321 chars[i] = fsh.mem_for_frags - fsh.num_frags;
322 }
323 fclose (f);
324}
325
326
327static void
328Alloc_keep_discard (void)
329{
330 keep[0].num_wds = 0;
331 keep[0].wd = (DictWordData **) Xmalloc ((Num[0] + 1) * sizeof (DictWordData *));
332 keep[1].num_wds = 0;
333 keep[1].wd = (DictWordData **) Xmalloc ((Num[1] + 1) * sizeof (DictWordData *));
334 discard[0].num_wds = 0;
335 discard[0].wd = (DictWordData **) Xmalloc ((Num[0] + 1) * sizeof (DictWordData *));
336 discard[1].num_wds = 0;
337 discard[1].wd = (DictWordData **) Xmalloc ((Num[1] + 1) * sizeof (DictWordData *));
338}
339
340
341static int
342sort_comp (const void *a, const void *b)
343{
344 DictWordData *aa = *(DictWordData **) a;
345 DictWordData *bb = *(DictWordData **) b;
346 return casecompare (aa->word, bb->word); /* [RPAP - Jan 97: Stem Index Change] */
347}
348
349
350
351static void
352SortAndCount_DictData (DictData * dd)
353{
354 unsigned int i;
355 DictWordData **wd;
356 qsort (dd->wd, dd->num_wds, sizeof (DictWordData *), sort_comp);
357 dd->chars = 0;
358 wd = dd->wd;
359 for (i = 0; i < dd->num_wds; i++, wd++)
360 dd->chars += (*wd)->word[0];
361}
362
363
364static void
365Select_all (void)
366{
367 unsigned int i;
368 Alloc_keep_discard ();
369 keep[0].num_wds = Num[0];
370 for (i = 0; i < Num[0]; i++)
371 keep[0].wd[i] = Words[0] + i;
372 keep[1].num_wds = Num[1];
373 for (i = 0; i < Num[1]; i++)
374 keep[1].wd[i] = Words[1] + i;
375 SortAndCount_DictData (&keep[0]);
376 SortAndCount_DictData (&keep[1]);
377}
378
379
380
381
382static int
383DecFreqIncWL (void *a, void *b)
384{
385 DictWordData *aa = *(DictWordData **) a;
386 DictWordData *bb = *(DictWordData **) b;
387 if (aa->docCount > bb->docCount)
388 return -1;
389 else if (aa->docCount < bb->docCount)
390 return 1;
391 else
392 return bb->word[0] - aa->word[0];
393}
394
395
396static int
397OccuranceOrder (void *a, void *b)
398{
399 DictWordData *aa = *(DictWordData **) a;
400 DictWordData *bb = *(DictWordData **) b;
401 if (aa->words() > bb->words())
402 return 1;
403 else if (aa->words() < bb->words())
404 return -1;
405 else
406 return 0;
407}
408
409
410static void
411Select_on (int k, heap_comp hc)
412{
413 int i, num, mem;
414 DictWordData **wd;
415
416 Alloc_keep_discard ();
417
418 num = Num[0] + Num[1];
419 wd = (DictWordData **) Xmalloc (num * sizeof (DictWordData *));
420 for (i = 0; (unsigned int)i < Num[0]; i++)
421 wd[i] = Words[0] + i;
422 for (i = 0; (unsigned int)i < Num[1]; i++)
423 wd[i + Num[0]] = Words[1] + i;
424
425 heap_build (wd, sizeof (*wd), num, hc);
426
427 mem = 0;
428 while (k > mem && num)
429 {
430 int idx;
431 DictWordData *word = wd[0];
432#ifdef DEBUG
433 fprintf (stderr, "%4d:%4d:%8d :%8d :%8d : \"%s\"\n",
434 keep[0].num_wds, keep[1].num_wds,
435 mem, word->documents(), word->word, word2str (word->word));
436#endif
437 mem += sizeof (u_char *) + word->word[0];
438 heap_deletehead (wd, sizeof (*wd), &num, hc);
439 idx = KIND (word);
440 keep[idx].wd[keep[idx].num_wds++] = word;
441 }
442
443 for (i = 0; i < num; i++)
444 {
445 DictWordData *word = wd[i];
446 int idx = KIND (word);
447 discard[idx].wd[discard[idx].num_wds++] = word;
448 }
449 SortAndCount_DictData (&keep[0]);
450 SortAndCount_DictData (&keep[1]);
451 SortAndCount_DictData (&discard[0]);
452 SortAndCount_DictData (&discard[1]);
453
454 assert (keep[0].num_wds + discard[0].num_wds == Num[0]);
455 assert (keep[1].num_wds + discard[1].num_wds == Num[1]);
456
457}
458
459
460
461static int
462BigSaving (void *a, void *b)
463{
464 DictWordData *aa = *(DictWordData **) a;
465 DictWordData *bb = *(DictWordData **) b;
466 return (aa->saving > bb->saving) ? -1 : (aa->saving < bb->saving);
467}
468
469static int
470SmallSaving (void *a, void *b)
471{
472 DictWordData *aa = *(DictWordData **) a;
473 DictWordData *bb = *(DictWordData **) b;
474 return (aa->saving < bb->saving) ? -1 : (aa->saving > bb->saving);
475}
476
477
478static void
479CalcCharCounts (DictWordData ** wd, int num,
480 char *char_lens[2], char *len_lens[2],
481 u_long escape[2])
482{
483 long char_freqs[2][256];
484 long len_freqs[2][16];
485 huff_data hd;
486 int i;
487 escape[0] = 0;
488 escape[1] = 0;
489 bzero ((char *) char_freqs, sizeof (char_freqs));
490 bzero ((char *) len_freqs, sizeof (len_freqs));
491 for (i = 0; i < num; i++, wd++)
492 {
493 u_long freq = (*wd)->documents();
494 u_char *buf = (*wd)->word;
495 int len = *buf++;
496 int idx = KIND (*wd);
497 len_freqs[idx][len] += freq;
498 escape[idx] += freq;
499 for (; len; len--, buf++)
500 char_freqs[idx][(u_long) (*buf)] += freq;
501 }
502 Generate_Huffman_Data (256, char_freqs[0], &hd, NULL);
503 char_lens[0] = hd.clens;
504 Generate_Huffman_Data (256, char_freqs[1], &hd, NULL);
505 char_lens[1] = hd.clens;
506
507 Generate_Huffman_Data (16, len_freqs[0], &hd, NULL);
508 len_lens[0] = hd.clens;
509 Generate_Huffman_Data (16, len_freqs[1], &hd, NULL);
510 len_lens[1] = hd.clens;
511}
512
513
514
515
516
517inline void CalcBitCostForWordData(DictWordData **word, int num,
518 double freqs_trans[2], u_long escape[2],
519 char * char_lens[2], char *len_lens[2],
520 double esc[2], int num_trans)
521{
522 int j;
523
524 for (j = 0; j < num; j++, word++)
525 {
526 float cbc, wbc;
527 u_char *buf = (*word)->word;
528 int len = *buf++;
529 u_long freq = (*word)->documents();
530 int idx = KIND (*word);
531
532 cbc = len_lens[idx][len];
533 for (; len; len--, buf++)
534 cbc += char_lens[idx][(u_long) (*buf)];
535
536 (*word)->char_bit_cost = (cbc + esc[idx]) * freq;
537
538 wbc = -log2 (freq / (freqs_trans[idx] + escape[idx])) * freq;
539
540 (*word)->saving = ((*word)->char_bit_cost - wbc) /
541 (sizeof (char *) + (*word)->word[0]);
542
543 (*word)->num_trans = num_trans;
544 }
545}
546
547
548
549void
550CalcBitCost (DictWordData ** discard_word, int discard_num,
551 DictWordData ** keep_word, int keep_num, double freqs_trans[2],
552 u_long escape[2], int num_trans)
553{
554 char *char_lens[2];
555 char *len_lens[2];
556 double esc[2];
557 CalcCharCounts (discard_word, discard_num, char_lens, len_lens, escape);
558 esc[0] = -log2 (escape[0] / (freqs_trans[0] + escape[0]));
559 esc[1] = -log2 (escape[1] / (freqs_trans[1] + escape[1]));
560 CalcBitCostForWordData(discard_word, discard_num, freqs_trans, escape,
561 char_lens, len_lens, esc, num_trans);
562 CalcBitCostForWordData(keep_word, keep_num, freqs_trans, escape,
563 char_lens, len_lens, esc, num_trans);
564 Xfree (char_lens[0]);
565 Xfree (char_lens[1]);
566 Xfree (len_lens[0]);
567 Xfree (len_lens[1]);
568}
569
570inline void m3_transferWord(DictWordData **toHeap, int *toNum, heap_comp toSaving,
571 DictWordData **fromHeap, int *fromNum, heap_comp fromSaving,
572 int *num_trans, double freqs_trans[], int *mem, double *total,
573 int *count)
574{
575 /* Transfer the word at the top of the keep heap to the
576 discard heap. */
577 DictWordData *word = fromHeap[0];
578 int idx = KIND (word);
579 heap_deletehead (fromHeap, sizeof (word), fromNum, fromSaving);
580 toHeap[*toNum] = word;
581 heap_additem (toHeap, sizeof (word), toNum, toSaving);
582 freqs_trans[idx] -= word->documents();
583 *mem = (*mem) - sizeof (u_char *) + word->word[0];
584 *num_trans += 1;
585 *total += word->saving;
586 *count += 1;
587#ifdef DEBUG
588 fprintf (stderr,
589 "KEEP -> DISCARD %8d :%8d :%8.0f :%8.0f : \"%s\"\n",
590 *mem, word->documents(), word->char_bit_cost,
591 word->saving, word2str (word->word));
592#endif
593}
594
595inline void m3_storeWord(DictWordData **wordHeap, int wordNum, int num_trans,
596 double freqs_trans[], u_long escape[], heap_comp hc)
597{
598 DictWordData *word = wordHeap[0];
599 int idx = KIND (word);
600 float wbc;
601#ifdef DEBUG1
602 fprintf (stderr, "KEEP \"%s\" %.2f ->", word2str (word->word),
603 word->saving);
604#endif
605 wbc = -log2 (word->documents() / (freqs_trans[idx] + escape[idx])) *
606 word->documents();
607 word->saving = (word->char_bit_cost - wbc) /
608 (sizeof (char *) + word->word[0]);
609#ifdef DEBUG1
610 fprintf (stderr, " %.2f\n", word->saving);
611#endif
612 word->num_trans = num_trans;
613 heap_changedhead (wordHeap, sizeof (word), wordNum, hc);
614}
615
616
617static void
618Method3 (int k)
619{
620 int i, keep_num, discard_num, mem, num_trans, recalc_reqd;
621 int keep_to_discard = 0;
622 int discard_to_keep = 0;
623 int recalcs = 0;
624 double freqs_trans[2], total;
625 u_long escape[2];
626 DictWordData **keep_heap, **discard_heap;
627
628 freqs_trans[0] = freqs_trans[1] = 0;
629 num_trans = 0;
630
631 discard_num = Num[0] + Num[1];
632 discard_heap = (DictWordData **) Xmalloc (discard_num * sizeof (DictWordData *));
633
634 keep_num = 0;
635 keep_heap = (DictWordData **) Xmalloc (discard_num * sizeof (DictWordData *));
636
637
638 for (i = 0; (unsigned int)i < Num[0]; i++)
639 discard_heap[i] = Words[0] + i;
640 for (i = 0; (unsigned int)i < Num[1]; i++)
641 discard_heap[i + Num[0]] = Words[1] + i;
642
643 heap_build (discard_heap, sizeof (*discard_heap), discard_num, DecFreqIncWL);
644
645 mem = 0;
646 while (k > mem && discard_num)
647 {
648 DictWordData *word = discard_heap[0];
649 mem += sizeof (u_char *) + word->word[0];
650 heap_deletehead (discard_heap, sizeof (word), &discard_num, DecFreqIncWL);
651 keep_heap[keep_num++] = word;
652 freqs_trans[KIND (word)] += word->documents();
653 num_trans++;
654 }
655
656 CalcBitCost (discard_heap, discard_num, keep_heap, keep_num,
657 freqs_trans, escape, num_trans);
658 heap_build (discard_heap, sizeof (*discard_heap), discard_num, BigSaving);
659 heap_build (keep_heap, sizeof (*keep_heap), keep_num, SmallSaving);
660
661 total = 0;
662 recalc_reqd = 0;
663 while (keep_num && discard_num)
664 {
665 if ((keep_num && keep_heap[0]->num_trans < (unsigned)num_trans) ||
666 (discard_num && discard_heap[0]->num_trans < (unsigned)num_trans))
667 {
668 if (keep_num && keep_heap[0]->num_trans < (unsigned)num_trans)
669 {
670 m3_storeWord(keep_heap, keep_num, num_trans, freqs_trans, escape, SmallSaving);
671 }
672
673 if (discard_num && discard_heap[0]->num_trans < (unsigned)num_trans)
674 {
675 m3_storeWord(discard_heap, discard_num, num_trans, freqs_trans, escape, BigSaving);
676 }
677 }
678 else if (keep_heap[0]->saving < discard_heap[0]->saving)
679 {
680 assert (keep_num && discard_num);
681 if (keep_num && mem + sizeof (char *) + discard_heap[0]->word[0] > (unsigned)k)
682 {
683 /* Transfer the word at the top of the keep heap to the
684 discard heap. */
685 m3_transferWord(discard_heap, &discard_num, BigSaving,
686 keep_heap, &keep_num, SmallSaving,
687 &num_trans, freqs_trans, &mem, &total, &keep_to_discard);
688 }
689 else
690 {
691 /* Transfer the word at the top of the discard heap to the
692 keep heap. */
693 m3_transferWord(keep_heap, &keep_num, SmallSaving,
694 discard_heap, &discard_num, BigSaving,
695 &num_trans, freqs_trans, &mem, &total, &discard_to_keep);
696 }
697
698 recalc_reqd = 1;
699
700 }
701 else
702 {
703 if (recalc_reqd == 0)
704 break;
705#ifdef DEBUG1
706 fprintf (stderr, "--------------\n");
707#endif
708 if (recalcs == MAX_RECALCULATIONS)
709 break;
710 CalcBitCost (discard_heap, discard_num, keep_heap, keep_num,
711 freqs_trans, escape, num_trans);
712 heap_build (discard_heap, sizeof (*discard_heap),
713 discard_num, BigSaving);
714 heap_build (keep_heap, sizeof (keep_heap), keep_num, SmallSaving);
715 recalc_reqd = 0;
716 recalcs++;
717 }
718 }
719
720 Alloc_keep_discard ();
721
722 for (i = 0; i < discard_num; i++)
723 {
724 DictWordData *word = discard_heap[i];
725 int idx = KIND (word);
726 assert (IsWord (word) || IsNonWord (word));
727 discard[idx].wd[discard[idx].num_wds++] = word;
728 }
729 for (i = 0; i < keep_num; i++)
730 {
731 DictWordData *word = keep_heap[i];
732 int idx = KIND (word);
733 assert (IsWord (word) || IsNonWord (word));
734 keep[idx].wd[keep[idx].num_wds++] = word;
735 }
736 SortAndCount_DictData (&keep[0]);
737 SortAndCount_DictData (&keep[1]);
738 SortAndCount_DictData (&discard[0]);
739 SortAndCount_DictData (&discard[1]);
740
741 assert (keep[0].num_wds + discard[0].num_wds == Num[0]);
742 assert (keep[1].num_wds + discard[1].num_wds == Num[1]);
743#ifndef SILENT
744 Message ("Keep -> Discard : %8d", keep_to_discard);
745 Message ("Discard -> Keep : %8d", discard_to_keep);
746 Message ("Huffman Recalculations : %8d", recalcs);
747 if (recalcs == MAX_RECALCULATIONS)
748 Message ("WARNING: The number of recalculations == %d", MAX_RECALCULATIONS);
749#endif
750}
751
752
753
754
755
756
757
758
759
760
761/****************************************************************************
762 ***** *****
763 ***** Dictionary saving code *****
764 ***** *****
765 ****************************************************************************/
766
767
768
769static void
770Write_cdh (FILE * f, compression_dict_header * cdh)
771{
772 /* [RPAP - Jan 97: Endian Ordering] */
773 int i;
774 compression_dict_header tmp = *cdh;
775 HTONUL(tmp.dict_type);
776 HTONUL(tmp.novel_method);
777 for (i = 0; i < TEXT_PARAMS; i++)
778 HTONUL(tmp.params[i]);
779 HTONUL(tmp.num_words[0]);
780 HTONUL(tmp.num_words[1]);
781 HTONUL(tmp.num_word_chars[0]);
782 HTONUL(tmp.num_word_chars[1]);
783 HTONUL(tmp.lookback);
784
785 fwrite (&tmp, sizeof (tmp), 1, f);
786}
787
788
789static void
790Write_words (FILE * f, DictData * dd)
791{
792 unsigned int i;
793 u_char *curr, *prev = NULL;
794 for (i = 0; i < dd->num_wds; i++)
795 {
796 int len;
797 curr = dd->wd[i]->word;
798 if (prev)
799 /* look for prefix match with prev string */
800 len = prefixlen (prev, curr);
801 else
802 len = 0;
803 fputc ((len << 4) + (curr[0] - len), f);
804 fwrite (curr + len + 1, sizeof (u_char), curr[0] - len, f);
805 prev = curr;
806 }
807
808}
809
810
811static int
812Uncompressed_size (DictData * dd)
813{
814 unsigned int i, us;
815 for (us = i = 0; i < dd->num_wds; i++)
816 us += dd->wd[i]->word[0];
817 return us;
818}
819
820
821static u_long
822Write_data (FILE * f, DictData * dd, int lookback)
823{
824 u_long mem_reqd;
825 huff_data *hd;
826 int i;
827 u_long us = dd->num_wds;
828 long *freqs;
829 u_long huff_words_size[MAX_HUFFCODE_LEN + 1];
830 u_long lencounts[MAX_HUFFCODE_LEN + 1];
831 u_char *lastword[MAX_HUFFCODE_LEN + 1];
832
833 if (!(freqs = new long [dd->num_wds]))
834 FatalError (1, "Unable to allocate memory for freqs");
835
836 for (i = 0; (unsigned)i < dd->num_wds; i++)
837 {
838 freqs[i] = dd->wd[i]->documents();
839 us += dd->wd[i]->word[0];
840 }
841
842 if (!(hd = Generate_Huffman_Data (dd->num_wds, freqs, NULL, NULL)))
843 FatalError (1, "Unable to allocate memory for huffman data");
844
845 delete (freqs);
846 freqs = NULL;
847
848 if (Write_Huffman_Data (f, hd) == -1)
849 FatalError (1, "Unable to write huffman data");
850
851 HTONUL(us); /* [RPAP - Jan 97: Endian Ordering] */
852 fwrite (&us, sizeof (us), 1, f);
853 NTOHUL(us); /* [RPAP - Jan 97: Endian Ordering] */
854
855
856/* Calculate the amount of memory that will be required to store the text for
857 each different huffman code len. Every 1<<lookback words for each different
858 codelen length will not be prefixed by previous strings. */
859
860
861 bzero ((char *) &huff_words_size, sizeof (huff_words_size));
862 bzero ((char *) &lencounts, sizeof (lencounts));
863
864 mem_reqd = 0;
865
866 for (i = 0; (unsigned)i < dd->num_wds; i++)
867 {
868 int codelen = hd->clens[i];
869 u_char *word = dd->wd[i]->word;
870
871 if (!codelen)
872 FatalError (1, "The length of a code for a word was zero");
873
874 huff_words_size[codelen] += word[0] + 1;
875 mem_reqd += word[0] + (lookback != 0);
876#if 0
877 if ((lencounts[codelen] & ((1 << lookback) - 1)) == 0)
878 lastword[codelen] = word;
879 else
880 huff_words_size[codelen] -= prefixlen (lastword[codelen], word);
881#else
882 if ((lencounts[codelen] & ((1 << lookback) - 1)) != 0)
883 {
884 int save = prefixlen (lastword[codelen], word);
885 mem_reqd -= save;
886 huff_words_size[codelen] -= save;
887 }
888 else
889 {
890 mem_reqd += sizeof (u_char *);
891 }
892 lastword[codelen] = word;
893#endif
894 lencounts[codelen]++;
895 }
896
897 /* [RPAP - Jan 97: Endian Ordering] */
898 for (i = hd->mincodelen; i < hd->maxcodelen + 1; i++)
899 HTONUL(huff_words_size[i]);
900
901 fwrite (huff_words_size + hd->mincodelen, sizeof (*huff_words_size),
902 hd->maxcodelen - hd->mincodelen + 1, f);
903
904 /* [RPAP - Jan 97: Endian Ordering] */
905 for (i = hd->mincodelen; i < hd->maxcodelen + 1; i++)
906 NTOHUL(huff_words_size[i]);
907
908 Write_words (f, dd);
909
910 delete hd->clens;
911 delete hd;
912
913 return mem_reqd;
914}
915
916
917
918static void
919Write_charfreqs (FILE * f, DictData * dd, int words,
920 int zero_freq_permitted)
921{
922 unsigned int j;
923 long freqs[256];
924 DictWordData **wd = dd->wd;
925 huff_data *hd;
926
927 bzero ((char *) freqs, sizeof (freqs));
928
929 for (j = 0; j < dd->num_wds; j++, wd++)
930 {
931 u_char *buf = (*wd)->word;
932 int len = *buf++;
933 for (; len; len--, buf++)
934 freqs[(u_long) (*buf)] += (*wd)->documents();
935 }
936
937 if (!zero_freq_permitted)
938 for (j = 0; j < 256; j++)
939 if (!freqs[j] && PESINAWORD (j) == words)
940 freqs[j] = 1;
941
942 if (!(hd = Generate_Huffman_Data (256, freqs, NULL, NULL)))
943 FatalError (1, "Unable to allocate memory for huffman data");
944
945 if (Write_Huffman_Data (f, hd) == -1)
946 FatalError (1, "Unable to write huffman data");
947
948 delete hd->clens;
949 delete hd;
950}
951
952
953
954
955static void
956Write_wordlens (FILE * f, DictData * dd, int zero_freq_permitted)
957{
958 unsigned int j;
959 long freqs[16];
960 DictWordData **wd = dd->wd;
961 huff_data *hd;
962
963 bzero ((char *) freqs, sizeof (freqs));
964
965 for (j = 0; j < dd->num_wds; j++, wd++)
966 freqs[(*wd)->word[0]] += (*wd)->documents();
967
968 if (!zero_freq_permitted)
969 for (j = 0; j < 16; j++)
970 if (!freqs[j])
971 freqs[j] = 1;
972
973 if (!(hd = Generate_Huffman_Data (16, freqs, NULL, NULL)))
974 FatalError (1, "Unable to allocate memory for huffman data");
975
976 if (Write_Huffman_Data (f, hd) == -1)
977 FatalError (1, "Unable to write huffman data");
978
979
980 delete hd->clens;
981 delete hd;
982}
983
984
985
986static u_long
987WriteOutWords (char *file_name, u_long type, int lookback)
988{
989 FILE *f;
990 int i;
991 u_long mem_reqd = 0;
992
993 compression_dict_header cdh;
994 f = create_file (file_name, TEXT_DICT_SUFFIX, "w+b",
995 MAGIC_DICT, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
996
997 bzero((char *) &cdh, sizeof(cdh));
998
999 cdh.dict_type = type;
1000 cdh.novel_method = (type == MG_SEED_DICTIONARY) ? novel_method :
1001 MG_NOVEL_HUFFMAN_CHARS;
1002
1003 cdh.num_words[1] = keep[1].num_wds;
1004 cdh.num_words[0] = keep[0].num_wds;
1005 cdh.num_word_chars[1] = Uncompressed_size (&keep[1]);
1006 cdh.num_word_chars[0] = Uncompressed_size (&keep[0]);
1007 cdh.lookback = lookback;
1008
1009 Write_cdh (f, &cdh);
1010
1011 for (i = 0; i < 2; i++)
1012 switch (type)
1013 {
1014 case MG_COMPLETE_DICTIONARY:
1015 {
1016 mem_reqd += Write_data (f, &keep[i], lookback);
1017 }
1018 break;
1019 case MG_PARTIAL_DICTIONARY:
1020 {
1021 if (keep[i].num_wds)
1022 {
1023 int j;
1024 DictWordData esc;
1025 esc.docCount = 0;
1026 esc.word = (u_char *) "";
1027 keep[i].wd[keep[i].num_wds++] = &esc;
1028 for (j = 0; (unsigned)j < discard[i].num_wds; j++)
1029 esc.docCount += discard[i].wd[j]->documents();
1030 if (!esc.docCount)
1031 esc.docCount++;
1032 mem_reqd += Write_data (f, &keep[i], lookback);
1033 }
1034 Write_charfreqs (f, &discard[i], i, 1);
1035 Write_wordlens (f, &discard[i], 1);
1036 }
1037 break;
1038 case MG_SEED_DICTIONARY:
1039 {
1040 if (keep[i].num_wds)
1041 {
1042 int j;
1043 DictWordData esc;
1044 esc.docCount = 0;
1045 esc.word = (u_char *) "";
1046 keep[i].wd[keep[i].num_wds++] = &esc;
1047 for (j = 0; (unsigned)j < all[i].num_wds; j++)
1048 if (all[i].wd[j]->documents() == 1)
1049 esc.docCount++;
1050 if (!esc.docCount)
1051 esc.docCount++;
1052 mem_reqd += Write_data (f, &keep[i], lookback);
1053 }
1054 switch (novel_method)
1055 {
1056 case MG_NOVEL_HUFFMAN_CHARS:
1057 Write_charfreqs (f, &all[i], i, 0);
1058 Write_wordlens (f, &all[i], 0);
1059 break;
1060 case MG_NOVEL_DELTA:
1061 break;
1062 case MG_NOVEL_HYBRID:
1063 break;
1064 default:
1065 FatalError (1, "Bad novel method");
1066 }
1067 }
1068 break;
1069 }
1070 fclose (f);
1071 return mem_reqd;
1072}
Note: See TracBrowser for help on using the repository browser.