Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: trunk/indexers/mg/src/text/mg_compression_dict.c@ 13663

Last change on this file since 13663 was 3745, checked in by mdewsnip, 21 years ago
Addition of MG package for search and retrieval
Property svn:keywords set to `Author Date Id Revision`
File size: 26.8 KB

Line
1	/**************************************************************************
2	*
3	* mg_compression_dict.c -- Routines for reading the compression dictionary
4	* Copyright (C) 1994 Neil Sharman
5	*
6	* This program is free software; you can redistribute it and/or modify
7	* it under the terms of the GNU General Public License as published by
8	* the Free Software Foundation; either version 2 of the License, or
9	* (at your option) any later version.
10	*
11	* This program is distributed in the hope that it will be useful,
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	* GNU General Public License for more details.
15	*
16	* You should have received a copy of the GNU General Public License
17	* along with this program; if not, write to the Free Software
18	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19	*
20	* $Id: mg_compression_dict.c 3745 2003-02-20 21:20:24Z mdewsnip $
21	*
22	**************************************************************************/
23
24	#include "sysfuncs.h"
25
26	#include "memlib.h"
27	#include "messages.h"
28	#include "local_strings.h"
29	#include "bitio_gen.h"
30	#include "bitio_m.h"
31	#include "bitio_m_stdio.h"
32	#include "timing.h"
33	#include "mgheap.h"
34	#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
35
36	#include "mg_files.h"
37	#include "locallib.h"
38	#include "invf.h"
39	#include "text.h"
40	#include "words.h"
41	#include "mg.h"
42
43	#define MAXBITS (sizeof(unsigned long) * 8)
44
45	/*
46	$Log$
47	Revision 1.1 2003/02/20 21:18:24 mdewsnip
48	Addition of MG package for search and retrieval
49
50	Revision 1.2 1999/12/10 15:04:25 cs025
51	Incorrect use of NTOHUL on a double value (num_bytes).
52
53	Revision 1.1 1999/08/10 21:18:04 sjboddie
54	renamed mg-1.3d directory mg
55
56	Revision 1.4 1999/01/15 03:05:51 rjmcnab
57
58	Renamed lib/heap to be lib/mgheap (it was causing some problems with
59	some versions of the STL libraries which contained a heap.h)
60
61	Revision 1.3 1998/12/17 09:12:52 rjmcnab
62
63	Altered mg to process utf-8 encoded Unicode. The main changes
64	are in the parsing of the input, the casefolding, and the stemming.
65
66	Revision 1.2 1998/11/25 07:55:44 rjmcnab
67
68	Modified mg to that you can specify the stemmer you want
69	to use via a command line option. You specify it to
70	mg_passes during the build process. The number of the
71	stemmer that you used is stored within the inverted
72	dictionary header and the stemmed dictionary header so
73	the correct stemmer is used in later stages of building
74	and querying.
75
76	Revision 1.1 1998/11/17 09:34:52 rjmcnab
77	* empty log message *
78
79	* Revision 1.3 1994/10/20 03:56:54 tes
80	* I have rewritten the boolean query optimiser and abstracted out the
81	* components of the boolean query.
82	*
83	* Revision 1.2 1994/09/20 04:41:45 tes
84	* For version 1.1
85	*
86	*/
87
88	static char *RCSID = "$Id: mg_compression_dict.c 3745 2003-02-20 21:20:24Z mdewsnip $";
89
90	/* #define DEBUG1 */
91
92	/* #define DEBUG */
93
94	#define is_power_of_two(a) ((a) != 0 && (((a) & ((a)-1)) == 0))
95
96	#define MAX_RECALCULATIONS 100
97
98	typedef struct WordData
99	{
100	u_long freq, occur_num;
101	float saving;
102	float char_bit_cost;
103	u_long num_trans;
104	u_char *word;
105	}
106	WordData;
107
108	static WordData *Words[2];
109	static u_long Num[2];
110	static u_long chars[2];
111
112	#define KIND(p) (((p) >= Words[0] && (p) < Words[0] + Num[0]) ? 0 : 1)
113	#define IsWord(p) (((p) >= Words[1] && (p) < Words[1] + Num[1]) ? 1 : 0)
114	#define IsNonWord(p) (((p) >= Words[0] && (p) < Words[0] + Num[0]) ? 1 : 0)
115
116
117	typedef struct DictData
118	{
119	WordData **wd;
120	u_long num_wds;
121	u_long chars;
122	}
123	DictData;
124
125	typedef DictData DictInfo[2];
126
127	static DictInfo keep, discard, all;
128	static compression_stats_header csh;
129
130	static char *file_name = "";
131	static u_long novel_method = MG_NOVEL_HUFFMAN_CHARS;
132
133
134	static void ReadInWords (char *);
135	static void Select_on (int k, heap_comp hc);
136	static void Method3 (int k);
137	static void Select_all (void);
138	static u_long WriteOutWords (char *, u_long, int);
139	static int DecFreqIncWL (void a, void b);
140	static int OccuranceOrder (void a, void b);
141
142
143
144	int main (int argc, char **argv)
145	{
146	ProgTime StartTime;
147	int ch;
148	char type = 'C';
149	char mode = '0';
150	int lookback = 2;
151	double k = 0;
152	u_long mem_reqd;
153	opterr = 0;
154	msg_prefix = argv[0];
155	while ((ch = getopt (argc, argv, "0123CPSf:d:l:hk:HBDYM")) != -1)
156	switch (ch)
157	{
158	case 'H':
159	novel_method = MG_NOVEL_HUFFMAN_CHARS;
160	break;
161	case 'B':
162	novel_method = MG_NOVEL_BINARY;
163	break;
164	case 'D':
165	novel_method = MG_NOVEL_DELTA;
166	break;
167	case 'Y':
168	novel_method = MG_NOVEL_HYBRID;
169	break;
170	case 'M':
171	novel_method = MG_NOVEL_HYBRID_MTF;
172	break;
173	case 'f': /* input file */
174	file_name = optarg;
175	break;
176	case 'd':
177	set_basepath (optarg);
178	break;
179	case 'C':
180	case 'P':
181	case 'S':
182	type = ch;
183	break;
184	case '0':
185	case '1':
186	case '2':
187	case '3':
188	mode = ch;
189	break;
190	case 'l':
191	lookback = atoi (optarg);
192	if (!is_power_of_two (lookback))
193	FatalError (1, "The lookback value must be a power of 2");
194	lookback = floorlog_2 (lookback);
195	break;
196	case 'k':
197	k = atof (optarg) * 1024;
198	break;
199	case 'h':
200	case '?':
201	fprintf (stderr, "usage: %s [-l lookback] [-f input_file]"
202	"[-d data directory] [-h] [-0\|-1\|-2\|-3] [-C\|-P\|-S] [-k mem (Kb)]\n",
203	argv[0]);
204	exit (1);
205	}
206	GetTime (&StartTime);
207
208	ReadInWords (file_name);
209
210	if (type == 'C')
211	{
212	Select_all ();
213	mem_reqd = WriteOutWords (file_name, MG_COMPLETE_DICTIONARY, lookback);
214	}
215	else
216	{
217	switch (mode)
218	{
219	case '0':
220	Select_all ();
221	break;
222	case '1':
223	Message ("Dictionary limit of %.2f Kb", k / 1024);
224	Select_on ((int) k, OccuranceOrder);
225	break;
226	case '2':
227	Message ("Dictionary limit of %.2f Kb", k / 1024);
228	Select_on ((int) k, DecFreqIncWL);
229	break;
230	case '3':
231	Message ("Dictionary limit of %.2f Kb", k / 1024);
232	Method3 ((int) k);
233	break;
234	}
235	if (type == 'P')
236	{
237	mem_reqd = WriteOutWords (file_name, MG_PARTIAL_DICTIONARY, lookback);
238	}
239	else
240	{
241	mem_reqd = WriteOutWords (file_name, MG_SEED_DICTIONARY, lookback);
242	}
243	}
244
245	Message ("Num words : %8u -> %8u\n", Num[1], keep[1].num_wds);
246	Message ("Num non-words : %8u -> %8u\n", Num[0], keep[0].num_wds);
247	Message ("Chars of words : %8u -> %8u\n", chars[1], keep[1].chars);
248	Message ("Chars of non-words : %8u -> %8u\n", chars[0], keep[0].chars);
249	Message ("Mem usage : %8u -> %8u\n",
250	(Num[0] + Num[1]) * sizeof (char *) + chars[0] + chars[1],
251	(keep[0].num_wds + keep[1].num_wds) * sizeof (char *) +
252	keep[0].chars + keep[1].chars);
253	Message ("Actual mem required : %8u\n", mem_reqd);
254
255	Message ("%s", ElapsedTime (&StartTime, NULL));
256	return 0;
257	}
258
259
260
261
262	static void
263	ReadInWords (char *filename)
264	{
265	FILE *f;
266	unsigned long i;
267	f = open_file (filename, TEXT_STATS_DICT_SUFFIX, "rb",
268	MAGIC_STATS_DICT, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
269
270	fread (&csh, sizeof (csh), 1, f);
271
272	/* [RPAP - Jan 97: Endian Ordering] */
273	NTOHUL(csh.num_docs);
274	NTOHD(csh.num_bytes);
275
276	for (i = 0; i < 2; i++)
277	{
278	frags_stats_header fsh;
279	char *buf;
280	WordData *wd;
281	int j;
282	chars[i] = 0;
283
284	fread (&fsh, sizeof (fsh), 1, f);
285	/* [RPAP - Jan 97: Endian Ordering] */
286	NTOHUL(fsh.num_frags);
287	NTOHUL(fsh.mem_for_frags);
288
289	Num[i] = all[i].num_wds = fsh.num_frags;
290
291	/* The +1 on the following line is to leave room for the esc code. */
292	all[i].wd = Xmalloc (sizeof (WordData ) Num[i] + 1);
293
294	buf = Xmalloc (fsh.mem_for_frags);
295	wd = Words[i] = Xmalloc (sizeof (WordData) * Num[i]);
296	for (j = 0; j < Num[i]; j++, wd++)
297	{
298	int len;
299	fread (&wd->freq, sizeof (wd->freq), 1, f);
300	NTOHUL(wd->freq); /* [RPAP - Jan 97: Endian Ordering] */
301	fread (&wd->occur_num, sizeof (wd->occur_num), 1, f);
302	NTOHUL(wd->occur_num); /* [RPAP - Jan 97: Endian Ordering] */
303	len = fgetc (f);
304	wd->word = (u_char *) buf;
305	*buf++ = len;
306	fread (buf, len, 1, f);
307	buf += len;
308	all[i].wd[j] = wd;
309	}
310	chars[i] = fsh.mem_for_frags - fsh.num_frags;
311	}
312	fclose (f);
313	}
314
315
316	static void
317	Alloc_keep_discard (void)
318	{
319	keep[0].num_wds = 0;
320	keep[0].wd = Xmalloc ((Num[0] + 1) * sizeof (WordData *));
321	keep[1].num_wds = 0;
322	keep[1].wd = Xmalloc ((Num[1] + 1) * sizeof (WordData *));
323	discard[0].num_wds = 0;
324	discard[0].wd = Xmalloc ((Num[0] + 1) * sizeof (WordData *));
325	discard[1].num_wds = 0;
326	discard[1].wd = Xmalloc ((Num[1] + 1) * sizeof (WordData *));
327	}
328
329
330	static int
331	sort_comp (const void a, const void b)
332	{
333	WordData aa = (WordData **) a;
334	WordData bb = (WordData **) b;
335	return casecompare (aa->word, bb->word); /* [RPAP - Jan 97: Stem Index Change] */
336	}
337
338
339
340	static void
341	SortAndCount_DictData (DictData * dd)
342	{
343	int i;
344	WordData **wd;
345	qsort (dd->wd, dd->num_wds, sizeof (WordData *), sort_comp);
346	dd->chars = 0;
347	wd = dd->wd;
348	for (i = 0; i < dd->num_wds; i++, wd++)
349	dd->chars += (*wd)->word[0];
350	}
351
352
353	static void
354	Select_all (void)
355	{
356	int i;
357	Alloc_keep_discard ();
358	keep[0].num_wds = Num[0];
359	for (i = 0; i < Num[0]; i++)
360	keep[0].wd[i] = Words[0] + i;
361	keep[1].num_wds = Num[1];
362	for (i = 0; i < Num[1]; i++)
363	keep[1].wd[i] = Words[1] + i;
364	SortAndCount_DictData (&keep[0]);
365	SortAndCount_DictData (&keep[1]);
366	}
367
368
369
370
371	static int
372	DecFreqIncWL (void a, void b)
373	{
374	WordData aa = (WordData **) a;
375	WordData bb = (WordData **) b;
376	if (aa->freq > bb->freq)
377	return -1;
378	else if (aa->freq < bb->freq)
379	return 1;
380	else
381	return bb->word[0] - aa->word[0];
382	}
383
384
385	static int
386	OccuranceOrder (void a, void b)
387	{
388	WordData aa = (WordData **) a;
389	WordData bb = (WordData **) b;
390	if (aa->occur_num > bb->occur_num)
391	return 1;
392	else if (aa->occur_num < bb->occur_num)
393	return -1;
394	else
395	return 0;
396	}
397
398
399	static void
400	Select_on (int k, heap_comp hc)
401	{
402	int i, num, mem;
403	WordData **wd;
404
405	Alloc_keep_discard ();
406
407	num = Num[0] + Num[1];
408	wd = Xmalloc (num * sizeof (WordData *));
409	for (i = 0; i < Num[0]; i++)
410	wd[i] = Words[0] + i;
411	for (i = 0; i < Num[1]; i++)
412	wd[i + Num[0]] = Words[1] + i;
413
414	heap_build (wd, sizeof (*wd), num, hc);
415
416	mem = 0;
417	while (k > mem && num)
418	{
419	int idx;
420	WordData *word = wd[0];
421	#ifdef DEBUG
422	fprintf (stderr, "%4d:%4d:%8d :%8d :%8d : \"%s\"\n",
423	keep[0].num_wds, keep[1].num_wds,
424	mem, word->freq, word->occur_num, word2str (word->word));
425	#endif
426	mem += sizeof (u_char *) + word->word[0];
427	heap_deletehead (wd, sizeof (*wd), &num, hc);
428	idx = KIND (word);
429	keep[idx].wd[keep[idx].num_wds++] = word;
430	}
431
432	for (i = 0; i < num; i++)
433	{
434	WordData *word = wd[i];
435	int idx = KIND (word);
436	discard[idx].wd[discard[idx].num_wds++] = word;
437	}
438	SortAndCount_DictData (&keep[0]);
439	SortAndCount_DictData (&keep[1]);
440	SortAndCount_DictData (&discard[0]);
441	SortAndCount_DictData (&discard[1]);
442
443	assert (keep[0].num_wds + discard[0].num_wds == Num[0]);
444	assert (keep[1].num_wds + discard[1].num_wds == Num[1]);
445
446	}
447
448
449
450	static int
451	BigSaving (void a, void b)
452	{
453	WordData aa = (WordData **) a;
454	WordData bb = (WordData **) b;
455	return (aa->saving > bb->saving) ? -1 : (aa->saving < bb->saving);
456	}
457
458	static int
459	SmallSaving (void a, void b)
460	{
461	WordData aa = (WordData **) a;
462	WordData bb = (WordData **) b;
463	return (aa->saving < bb->saving) ? -1 : (aa->saving > bb->saving);
464	}
465
466
467	static void
468	CalcCharCounts (WordData ** wd, int num,
469	char char_lens[2], char len_lens[2],
470	u_long escape[2])
471	{
472	long char_freqs[2][256];
473	long len_freqs[2][16];
474	huff_data hd;
475	int i;
476	escape[0] = 0;
477	escape[1] = 0;
478	bzero ((char *) char_freqs, sizeof (char_freqs));
479	bzero ((char *) len_freqs, sizeof (len_freqs));
480	for (i = 0; i < num; i++, wd++)
481	{
482	u_long freq = (*wd)->freq;
483	u_char buf = (wd)->word;
484	int len = *buf++;
485	int idx = KIND (*wd);
486	len_freqs[idx][len] += freq;
487	escape[idx] += freq;
488	for (; len; len--, buf++)
489	char_freqs[idx][(u_long) (*buf)] += freq;
490	}
491	Generate_Huffman_Data (256, char_freqs[0], &hd, NULL);
492	char_lens[0] = hd.clens;
493	Generate_Huffman_Data (256, char_freqs[1], &hd, NULL);
494	char_lens[1] = hd.clens;
495
496	Generate_Huffman_Data (16, len_freqs[0], &hd, NULL);
497	len_lens[0] = hd.clens;
498	Generate_Huffman_Data (16, len_freqs[1], &hd, NULL);
499	len_lens[1] = hd.clens;
500	}
501
502
503
504
505
506
507
508
509
510	void
511	CalcBitCost (WordData ** discard_word, int discard_num,
512	WordData ** keep_word, int keep_num, double freqs_trans[2],
513	u_long escape[2], int num_trans)
514	{
515	char *char_lens[2];
516	char *len_lens[2];
517	double esc[2];
518	int j;
519	CalcCharCounts (discard_word, discard_num, char_lens, len_lens, escape);
520	esc[0] = -log2 (escape[0] / (freqs_trans[0] + escape[0]));
521	esc[1] = -log2 (escape[1] / (freqs_trans[1] + escape[1]));
522	for (j = 0; j < discard_num; j++, discard_word++)
523	{
524	float cbc, wbc;
525	u_char buf = (discard_word)->word;
526	int len = *buf++;
527	u_long freq = (*discard_word)->freq;
528	int idx = KIND (*discard_word);
529
530	cbc = len_lens[idx][len];
531	for (; len; len--, buf++)
532	cbc += char_lens[idx][(u_long) (*buf)];
533
534	(discard_word)->char_bit_cost = (cbc + esc[idx]) freq;
535
536	wbc = -log2 (freq / (freqs_trans[idx] + escape[idx])) * freq;
537
538	(discard_word)->saving = ((discard_word)->char_bit_cost - wbc) /
539	(sizeof (char ) + (discard_word)->word[0]);
540
541	(*discard_word)->num_trans = num_trans;
542	}
543	for (j = 0; j < keep_num; j++, keep_word++)
544	{
545	float cbc, wbc;
546	u_char buf = (keep_word)->word;
547	int len = *buf++;
548	u_long freq = (*keep_word)->freq;
549	int idx = KIND (*keep_word);
550
551	cbc = len_lens[idx][len];
552	for (; len; len--, buf++)
553	cbc += char_lens[idx][(u_long) (*buf)];
554
555	(keep_word)->char_bit_cost = (cbc + esc[idx]) freq;
556
557	wbc = -log2 (freq / (freqs_trans[idx] + escape[idx])) * freq;
558
559	(keep_word)->saving = ((keep_word)->char_bit_cost - wbc) /
560	(sizeof (char ) + (keep_word)->word[0]);
561
562	(*keep_word)->num_trans = num_trans;
563	}
564	Xfree (char_lens[0]);
565	Xfree (char_lens[1]);
566	Xfree (len_lens[0]);
567	Xfree (len_lens[1]);
568	}
569
570
571
572	static void
573	Method3 (int k)
574	{
575	int i, keep_num, discard_num, mem, num_trans, recalc_reqd;
576	int keep_to_discard = 0;
577	int discard_to_keep = 0;
578	int recalcs = 0;
579	double freqs_trans[2], total;
580	u_long escape[2];
581	WordData keep_heap, discard_heap;
582
583	freqs_trans[0] = freqs_trans[1] = 0;
584	num_trans = 0;
585
586	discard_num = Num[0] + Num[1];
587	discard_heap = Xmalloc (discard_num * sizeof (WordData *));
588
589	keep_num = 0;
590	keep_heap = Xmalloc (discard_num * sizeof (WordData *));
591
592
593	for (i = 0; i < Num[0]; i++)
594	discard_heap[i] = Words[0] + i;
595	for (i = 0; i < Num[1]; i++)
596	discard_heap[i + Num[0]] = Words[1] + i;
597
598	heap_build (discard_heap, sizeof (*discard_heap), discard_num, DecFreqIncWL);
599
600	mem = 0;
601	while (k > mem && discard_num)
602	{
603	WordData *word = discard_heap[0];
604	mem += sizeof (u_char *) + word->word[0];
605	heap_deletehead (discard_heap, sizeof (word), &discard_num, DecFreqIncWL);
606	keep_heap[keep_num++] = word;
607	freqs_trans[KIND (word)] += word->freq;
608	num_trans++;
609	}
610
611	CalcBitCost (discard_heap, discard_num, keep_heap, keep_num,
612	freqs_trans, escape, num_trans);
613	heap_build (discard_heap, sizeof (*discard_heap), discard_num, BigSaving);
614	heap_build (keep_heap, sizeof (*keep_heap), keep_num, SmallSaving);
615
616	total = 0;
617	recalc_reqd = 0;
618	while (keep_num && discard_num)
619	{
620	if ((keep_num && keep_heap[0]->num_trans < num_trans) \|\|
621	(discard_num && discard_heap[0]->num_trans < num_trans))
622	{
623	if (keep_num && keep_heap[0]->num_trans < num_trans)
624	{
625	WordData *word = keep_heap[0];
626	int idx = KIND (word);
627	float wbc;
628	#ifdef DEBUG1
629	fprintf (stderr, "KEEP \"%s\" %.2f ->", word2str (word->word),
630	word->saving);
631	#endif
632	wbc = -log2 (word->freq / (freqs_trans[idx] + escape[idx])) *
633	word->freq;
634	word->saving = (word->char_bit_cost - wbc) /
635	(sizeof (char *) + word->word[0]);
636	#ifdef DEBUG1
637	fprintf (stderr, " %.2f\n", word->saving);
638	#endif
639	word->num_trans = num_trans;
640	heap_changedhead (keep_heap, sizeof (word), keep_num, SmallSaving);
641	}
642
643	if (discard_num && discard_heap[0]->num_trans < num_trans)
644	{
645	WordData *word = discard_heap[0];
646	int idx = KIND (word);
647	float wbc;
648	#ifdef DEBUG1
649	fprintf (stderr, "DISCARD \"%s\" %.2f ->", word2str (word->word),
650	word->saving);
651	#endif
652	wbc = -log2 (word->freq / (freqs_trans[idx] + escape[idx])) *
653	word->freq;
654	word->saving = (word->char_bit_cost - wbc) /
655	(sizeof (char *) + word->word[0]);
656	#ifdef DEBUG1
657	fprintf (stderr, " %.2f\n", word->saving);
658	#endif
659	word->num_trans = num_trans;
660	heap_changedhead (discard_heap, sizeof (word),
661	discard_num, BigSaving);
662	}
663	}
664	else if (keep_heap[0]->saving < discard_heap[0]->saving)
665	{
666	assert (keep_num && discard_num);
667	if (keep_num && mem + sizeof (char *) + discard_heap[0]->word[0] > k)
668	{
669	/* Transfer the word at the top of the keep heap to the
670	discard heap. */
671	WordData *word = keep_heap[0];
672	int idx = KIND (word);
673	heap_deletehead (keep_heap, sizeof (word), &keep_num, SmallSaving);
674	discard_heap[discard_num] = word;
675	heap_additem (discard_heap, sizeof (word), &discard_num,
676	BigSaving);
677	freqs_trans[idx] -= word->freq;
678	mem -= sizeof (u_char *) + word->word[0];
679	num_trans++;
680	total += word->saving;
681	keep_to_discard++;
682	#ifdef DEBUG
683	fprintf (stderr,
684	"KEEP -> DISCARD %8d :%8d :%8.0f :%8.0f : \"%s\"\n",
685	mem, word->freq, word->char_bit_cost,
686	word->saving, word2str (word->word));
687	#endif
688	}
689	else
690	{
691	/* Transfer the word at the top of the discard heap to the
692	keep heap. */
693	WordData *word = discard_heap[0];
694	int idx = KIND (word);
695	heap_deletehead (discard_heap, sizeof (word),
696	&discard_num, BigSaving);
697	keep_heap[keep_num] = word;
698	heap_additem (keep_heap, sizeof (word), &keep_num, SmallSaving);
699	freqs_trans[idx] += word->freq;
700	mem += sizeof (u_char *) + word->word[0];
701	num_trans++;
702	total += word->saving;
703	discard_to_keep++;
704	#ifdef DEBUG
705	fprintf (stderr,
706	"DISCARD -> KEEP %8d :%8d :%8.0f :%8.0f : \"%s\"\n",
707	mem, word->freq, word->char_bit_cost,
708	word->saving, word2str (word->word));
709	#endif
710	}
711
712	recalc_reqd = 1;
713
714	}
715	else
716	{
717	if (recalc_reqd == 0)
718	break;
719	#ifdef DEBUG1
720	fprintf (stderr, "--------------\n");
721	#endif
722	if (recalcs == MAX_RECALCULATIONS)
723	break;
724	CalcBitCost (discard_heap, discard_num, keep_heap, keep_num,
725	freqs_trans, escape, num_trans);
726	heap_build (discard_heap, sizeof (*discard_heap),
727	discard_num, BigSaving);
728	heap_build (keep_heap, sizeof (keep_heap), keep_num, SmallSaving);
729	recalc_reqd = 0;
730	recalcs++;
731	}
732	}
733
734	Alloc_keep_discard ();
735
736	for (i = 0; i < discard_num; i++)
737	{
738	WordData *word = discard_heap[i];
739	int idx = KIND (word);
740	assert (IsWord (word) \|\| IsNonWord (word));
741	discard[idx].wd[discard[idx].num_wds++] = word;
742	}
743	for (i = 0; i < keep_num; i++)
744	{
745	WordData *word = keep_heap[i];
746	int idx = KIND (word);
747	assert (IsWord (word) \|\| IsNonWord (word));
748	keep[idx].wd[keep[idx].num_wds++] = word;
749	}
750	SortAndCount_DictData (&keep[0]);
751	SortAndCount_DictData (&keep[1]);
752	SortAndCount_DictData (&discard[0]);
753	SortAndCount_DictData (&discard[1]);
754
755	assert (keep[0].num_wds + discard[0].num_wds == Num[0]);
756	assert (keep[1].num_wds + discard[1].num_wds == Num[1]);
757	Message ("Keep -> Discard : %8d", keep_to_discard);
758	Message ("Discard -> Keep : %8d", discard_to_keep);
759	Message ("Huffman Recalculations : %8d", recalcs);
760	if (recalcs == MAX_RECALCULATIONS)
761	Message ("WARNING: The number of recalculations == %d", MAX_RECALCULATIONS);
762	}
763
764
765
766
767
768
769
770
771
772
773	/****************************************************************************
774	*** ***
775	*** Dictionary saving code ***
776	*** ***
777	****************************************************************************/
778
779
780
781	static void
782	Write_cdh (FILE * f, compression_dict_header * cdh)
783	{
784	/* [RPAP - Jan 97: Endian Ordering] */
785	int i;
786	compression_dict_header tmp = *cdh;
787	HTONUL(tmp.dict_type);
788	HTONUL(tmp.novel_method);
789	for (i = 0; i < TEXT_PARAMS; i++)
790	HTONUL(tmp.params[i]);
791	HTONUL(tmp.num_words[0]);
792	HTONUL(tmp.num_words[1]);
793	HTONUL(tmp.num_word_chars[0]);
794	HTONUL(tmp.num_word_chars[1]);
795	HTONUL(tmp.lookback);
796
797	fwrite (&tmp, sizeof (tmp), 1, f);
798	}
799
800
801	static void
802	Write_words (FILE * f, DictData * dd)
803	{
804	int i;
805	u_char curr, prev = NULL;
806	for (i = 0; i < dd->num_wds; i++)
807	{
808	int len;
809	curr = dd->wd[i]->word;
810	if (prev)
811	/* look for prefix match with prev string */
812	len = prefixlen (prev, curr);
813	else
814	len = 0;
815	fputc ((len << 4) + (curr[0] - len), f);
816	fwrite (curr + len + 1, sizeof (u_char), curr[0] - len, f);
817	prev = curr;
818	}
819
820	}
821
822
823	static int
824	Uncompressed_size (DictData * dd)
825	{
826	int i, us;
827	for (us = i = 0; i < dd->num_wds; i++)
828	us += dd->wd[i]->word[0];
829	return us;
830	}
831
832
833	static u_long
834	Write_data (FILE * f, DictData * dd, int lookback)
835	{
836	u_long mem_reqd;
837	huff_data *hd;
838	int i;
839	u_long us = dd->num_wds;
840	long *freqs;
841	u_long huff_words_size[MAX_HUFFCODE_LEN + 1];
842	u_long lencounts[MAX_HUFFCODE_LEN + 1];
843	u_char *lastword[MAX_HUFFCODE_LEN + 1];
844
845	if (!(freqs = Xmalloc ((dd->num_wds) * sizeof (*freqs))))
846	FatalError (1, "Unable to allocate memory for freqs");
847
848	for (i = 0; i < dd->num_wds; i++)
849	{
850	freqs[i] = dd->wd[i]->freq;
851	us += dd->wd[i]->word[0];
852	}
853
854	if (!(hd = Generate_Huffman_Data (dd->num_wds, freqs, NULL, NULL)))
855	FatalError (1, "Unable to allocate memory for huffman data");
856
857	Xfree (freqs), freqs = NULL;
858
859	if (Write_Huffman_Data (f, hd) == -1)
860	FatalError (1, "Unable to write huffman data");
861
862	HTONUL(us); /* [RPAP - Jan 97: Endian Ordering] */
863	fwrite (&us, sizeof (us), 1, f);
864	NTOHUL(us); /* [RPAP - Jan 97: Endian Ordering] */
865
866
867	/* Calculate the amount of memory that will be required to store the text for
868	each different huffman code len. Every 1<<lookback words for each different
869	codelen length will not be prefixed by previous strings. */
870
871
872	bzero ((char *) &huff_words_size, sizeof (huff_words_size));
873	bzero ((char *) &lencounts, sizeof (lencounts));
874
875	mem_reqd = 0;
876
877	for (i = 0; i < dd->num_wds; i++)
878	{
879	int codelen = hd->clens[i];
880	u_char *word = dd->wd[i]->word;
881
882	if (!codelen)
883	FatalError (1, "The length of a code for a word was zero");
884
885	huff_words_size[codelen] += word[0] + 1;
886	mem_reqd += word[0] + (lookback != 0);
887	#if 0
888	if ((lencounts[codelen] & ((1 << lookback) - 1)) == 0)
889	lastword[codelen] = word;
890	else
891	huff_words_size[codelen] -= prefixlen (lastword[codelen], word);
892	#else
893	if ((lencounts[codelen] & ((1 << lookback) - 1)) != 0)
894	{
895	int save = prefixlen (lastword[codelen], word);
896	mem_reqd -= save;
897	huff_words_size[codelen] -= save;
898	}
899	else
900	{
901	mem_reqd += sizeof (u_char *);
902	}
903	lastword[codelen] = word;
904	#endif
905	lencounts[codelen]++;
906	}
907
908	/* [RPAP - Jan 97: Endian Ordering] */
909	for (i = hd->mincodelen; i < hd->maxcodelen + 1; i++)
910	HTONUL(huff_words_size[i]);
911
912	fwrite (huff_words_size + hd->mincodelen, sizeof (*huff_words_size),
913	hd->maxcodelen - hd->mincodelen + 1, f);
914
915	/* [RPAP - Jan 97: Endian Ordering] */
916	for (i = hd->mincodelen; i < hd->maxcodelen + 1; i++)
917	NTOHUL(huff_words_size[i]);
918
919	Write_words (f, dd);
920
921	Xfree (hd->clens);
922	Xfree (hd);
923
924	return mem_reqd;
925	}
926
927
928
929	static void
930	Write_charfreqs (FILE * f, DictData * dd, int words,
931	int zero_freq_permitted)
932	{
933	int j;
934	long freqs[256];
935	WordData **wd = dd->wd;
936	huff_data *hd;
937
938	bzero ((char *) freqs, sizeof (freqs));
939
940	for (j = 0; j < dd->num_wds; j++, wd++)
941	{
942	u_char buf = (wd)->word;
943	int len = *buf++;
944	for (; len; len--, buf++)
945	freqs[(u_long) (buf)] += (wd)->freq;
946	}
947
948	if (!zero_freq_permitted)
949	for (j = 0; j < 256; j++)
950	if (!freqs[j] && PESINAWORD (j) == words)
951	freqs[j] = 1;
952
953	if (!(hd = Generate_Huffman_Data (256, freqs, NULL, NULL)))
954	FatalError (1, "Unable to allocate memory for huffman data");
955
956	if (Write_Huffman_Data (f, hd) == -1)
957	FatalError (1, "Unable to write huffman data");
958
959	Xfree (hd->clens);
960	Xfree (hd);
961	}
962
963
964
965
966	static void
967	Write_wordlens (FILE * f, DictData * dd, int zero_freq_permitted)
968	{
969	int j;
970	long freqs[16];
971	WordData **wd = dd->wd;
972	huff_data *hd;
973
974	bzero ((char *) freqs, sizeof (freqs));
975
976	for (j = 0; j < dd->num_wds; j++, wd++)
977	freqs[(wd)->word[0]] += (wd)->freq;
978
979	if (!zero_freq_permitted)
980	for (j = 0; j < 16; j++)
981	if (!freqs[j])
982	freqs[j] = 1;
983
984	if (!(hd = Generate_Huffman_Data (16, freqs, NULL, NULL)))
985	FatalError (1, "Unable to allocate memory for huffman data");
986
987	if (Write_Huffman_Data (f, hd) == -1)
988	FatalError (1, "Unable to write huffman data");
989
990
991	Xfree (hd->clens);
992	Xfree (hd);
993	}
994
995
996
997	static u_long
998	WriteOutWords (char *file_name, u_long type, int lookback)
999	{
1000	FILE *f;
1001	int i;
1002	u_long mem_reqd = 0;
1003
1004	compression_dict_header cdh;
1005	f = create_file (file_name, TEXT_DICT_SUFFIX, "w+b",
1006	MAGIC_DICT, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
1007
1008	bzero((char *) &cdh, sizeof(cdh));
1009
1010	cdh.dict_type = type;
1011	cdh.novel_method = (type == MG_SEED_DICTIONARY) ? novel_method :
1012	MG_NOVEL_HUFFMAN_CHARS;
1013
1014	cdh.num_words[1] = keep[1].num_wds;
1015	cdh.num_words[0] = keep[0].num_wds;
1016	cdh.num_word_chars[1] = Uncompressed_size (&keep[1]);
1017	cdh.num_word_chars[0] = Uncompressed_size (&keep[0]);
1018	cdh.lookback = lookback;
1019
1020	Write_cdh (f, &cdh);
1021
1022	for (i = 0; i < 2; i++)
1023	switch (type)
1024	{
1025	case MG_COMPLETE_DICTIONARY:
1026	{
1027	mem_reqd += Write_data (f, &keep[i], lookback);
1028	}
1029	break;
1030	case MG_PARTIAL_DICTIONARY:
1031	{
1032	if (keep[i].num_wds)
1033	{
1034	int j;
1035	WordData esc;
1036	esc.freq = 0;
1037	esc.word = (u_char *) "";
1038	keep[i].wd[keep[i].num_wds++] = &esc;
1039	for (j = 0; j < discard[i].num_wds; j++)
1040	esc.freq += discard[i].wd[j]->freq;
1041	if (!esc.freq)
1042	esc.freq++;
1043	mem_reqd += Write_data (f, &keep[i], lookback);
1044	}
1045	Write_charfreqs (f, &discard[i], i, 1);
1046	Write_wordlens (f, &discard[i], 1);
1047	}
1048	break;
1049	case MG_SEED_DICTIONARY:
1050	{
1051	if (keep[i].num_wds)
1052	{
1053	int j;
1054	WordData esc;
1055	esc.freq = 0;
1056	esc.word = (u_char *) "";
1057	keep[i].wd[keep[i].num_wds++] = &esc;
1058	for (j = 0; j < all[i].num_wds; j++)
1059	if (all[i].wd[j]->freq == 1)
1060	esc.freq++;
1061	if (!esc.freq)
1062	esc.freq++;
1063	mem_reqd += Write_data (f, &keep[i], lookback);
1064	}
1065	switch (novel_method)
1066	{
1067	case MG_NOVEL_HUFFMAN_CHARS:
1068	Write_charfreqs (f, &all[i], i, 0);
1069	Write_wordlens (f, &all[i], 0);
1070	break;
1071	case MG_NOVEL_BINARY:
1072	break;
1073	case MG_NOVEL_DELTA:
1074	break;
1075	case MG_NOVEL_HYBRID:
1076	break;
1077	case MG_NOVEL_HYBRID_MTF:
1078	break;
1079	default:
1080	FatalError (1, "Bad novel method");
1081	}
1082	}
1083	break;
1084	}
1085	fclose (f);
1086	return mem_reqd;
1087	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: