Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: trunk/gsdl/src/mgpp/text/mg_compression_dict.cpp@ 856

Last change on this file since 856 was 856, checked in by sjboddie, 24 years ago
Rodgers new C++ mg
Property svn:keywords set to `Author Date Id Revision`
File size: 26.5 KB

Line
1	/**************************************************************************
2	*
3	* mg_compression_dict.cpp -- Routines for creating compression dictionary
4	* Copyright (C) 1994 Neil Sharman
5	*
6	* This program is free software; you can redistribute it and/or modify
7	* it under the terms of the GNU General Public License as published by
8	* the Free Software Foundation; either version 2 of the License, or
9	* (at your option) any later version.
10	*
11	* This program is distributed in the hope that it will be useful,
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	* GNU General Public License for more details.
15	*
16	* You should have received a copy of the GNU General Public License
17	* along with this program; if not, write to the Free Software
18	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19	*
20	* $Id: mg_compression_dict.cpp 856 2000-01-14 02:26:25Z sjboddie $
21	*
22	**************************************************************************/
23
24	#include "sysfuncs.h"
25
26	#include "memlib.h"
27	#include "messages.h"
28	#include "local_strings.h"
29	#include "bitio_gen.h"
30	#include "bitio_m_stdio.h"
31	#include "mgheap.h"
32	#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
33
34	#include "mg_files.h"
35	#include "locallib.h"
36	#include "invf.h"
37	#include "text.h"
38	#include "words.h"
39	#include "mg.h"
40	#include "WordData.h"
41
42	#define MAXBITS (sizeof(unsigned long) * 8)
43
44	/*
45	$Log$
46	Revision 1.1 2000/01/14 02:26:11 sjboddie
47	Rodgers new C++ mg
48
49	Revision 1.3 1999/10/17 23:43:25 cs025
50	Changes to eradicate Xmalloc
51
52	Revision 1.2 1999/10/11 22:03:03 cs025
53	Changed mistaken use of NTOHUL on a double to NTOHD as per report
54	from Craig Neville-Manning of problems installing mg/gsdl.
55
56	Revision 1.1 1999/10/11 02:57:46 cs025
57	Base install of MG-PP
58
59	Revision 1.1 1999/08/10 21:18:04 sjboddie
60	renamed mg-1.3d directory mg
61
62	Revision 1.4 1999/01/15 03:05:51 rjmcnab
63
64	Renamed lib/heap to be lib/mgheap (it was causing some problems with
65	some versions of the STL libraries which contained a heap.h)
66
67	Revision 1.3 1998/12/17 09:12:52 rjmcnab
68
69	Altered mg to process utf-8 encoded Unicode. The main changes
70	are in the parsing of the input, the casefolding, and the stemming.
71
72	Revision 1.2 1998/11/25 07:55:44 rjmcnab
73
74	Modified mg to that you can specify the stemmer you want
75	to use via a command line option. You specify it to
76	mg_passes during the build process. The number of the
77	stemmer that you used is stored within the inverted
78	dictionary header and the stemmed dictionary header so
79	the correct stemmer is used in later stages of building
80	and querying.
81
82	Revision 1.1 1998/11/17 09:34:52 rjmcnab
83	* empty log message *
84
85	* Revision 1.3 1994/10/20 03:56:54 tes
86	* I have rewritten the boolean query optimiser and abstracted out the
87	* components of the boolean query.
88	*
89	* Revision 1.2 1994/09/20 04:41:45 tes
90	* For version 1.1
91	*
92	*/
93
94	/* #define DEBUG1 */
95
96	/* #define DEBUG */
97
98	#define is_power_of_two(a) ((a) != 0 && (((a) & ((a)-1)) == 0))
99
100	#define MAX_RECALCULATIONS 100
101
102	typedef struct DictWordData : public WordData
103	{
104	float saving;
105	float char_bit_cost;
106	u_long num_trans;
107	u_char *word;
108	}
109	DictWordData;
110
111	static DictWordData *Words[2];
112	static u_long Num[2];
113	static u_long chars[2];
114
115	#define KIND(p) (((p) >= Words[0] && (p) < Words[0] + Num[0]) ? 0 : 1)
116	#define IsWord(p) (((p) >= Words[1] && (p) < Words[1] + Num[1]) ? 1 : 0)
117	#define IsNonWord(p) (((p) >= Words[0] && (p) < Words[0] + Num[0]) ? 1 : 0)
118
119
120	typedef struct DictData
121	{
122	DictWordData **wd;
123	u_long num_wds;
124	u_long chars;
125	}
126	DictData;
127
128	typedef DictData DictInfo[2];
129
130	static DictInfo keep, discard, all;
131	static compression_stats_header csh;
132
133	static char *file_name = "";
134	static u_long novel_method = MG_NOVEL_HUFFMAN_CHARS;
135
136
137	static void ReadInWords (char *);
138	static void Select_on (int k, heap_comp hc);
139	static void Method3 (int k);
140	static void Select_all (void);
141	static u_long WriteOutWords (char *, u_long, int);
142	static int DecFreqIncWL (void a, void b);
143	static int OccuranceOrder (void a, void b);
144
145
146
147	int main (int argc, char **argv)
148	{
149	int ch;
150	char type = 'C';
151	char mode = '0';
152	int lookback = 2;
153	double k = 0;
154	u_long mem_reqd;
155	opterr = 0;
156	msg_prefix = argv[0];
157	while ((ch = getopt (argc, argv, "0123CPSf:d:l:hk:HDY")) != -1)
158	switch (ch)
159	{
160	case 'H':
161	novel_method = MG_NOVEL_HUFFMAN_CHARS;
162	break;
163	case 'D':
164	novel_method = MG_NOVEL_DELTA;
165	break;
166	case 'Y':
167	novel_method = MG_NOVEL_HYBRID;
168	break;
169	case 'f': /* input file */
170	file_name = optarg;
171	break;
172	case 'd':
173	set_basepath (optarg);
174	break;
175	case 'C':
176	case 'P':
177	case 'S':
178	type = ch;
179	break;
180	case '0':
181	case '1':
182	case '2':
183	case '3':
184	mode = ch;
185	break;
186	case 'l':
187	lookback = atoi (optarg);
188	if (!is_power_of_two (lookback))
189	FatalError (1, "The lookback value must be a power of 2");
190	lookback = floorlog_2 (lookback);
191	break;
192	case 'k':
193	k = atof (optarg) * 1024;
194	break;
195	case 'h':
196	case '?':
197	fprintf (stderr, "usage: %s [-l lookback] [-f input_file]"
198	"[-d data directory] [-h] [-0\|-1\|-2\|-3] [-C\|-P\|-S] [-k mem (Kb)]\n",
199	argv[0]);
200	exit (1);
201	}
202
203	ReadInWords (file_name);
204
205	if (type == 'C')
206	{
207	Select_all ();
208	mem_reqd = WriteOutWords (file_name, MG_COMPLETE_DICTIONARY, lookback);
209	}
210	else
211	{
212	switch (mode)
213	{
214	case '0':
215	Select_all ();
216	break;
217	case '1':
218	Message ("Dictionary limit of %.2f Kb", k / 1024);
219	Select_on ((int) k, OccuranceOrder);
220	break;
221	case '2':
222	Message ("Dictionary limit of %.2f Kb", k / 1024);
223	Select_on ((int) k, DecFreqIncWL);
224	break;
225	case '3':
226	Message ("Dictionary limit of %.2f Kb", k / 1024);
227	Method3 ((int) k);
228	break;
229	}
230	if (type == 'P')
231	{
232	mem_reqd = WriteOutWords (file_name, MG_PARTIAL_DICTIONARY, lookback);
233	}
234	else
235	{
236	mem_reqd = WriteOutWords (file_name, MG_SEED_DICTIONARY, lookback);
237	}
238	}
239
240	Message ("Num words : %8u -> %8u\n", Num[1], keep[1].num_wds);
241	Message ("Num non-words : %8u -> %8u\n", Num[0], keep[0].num_wds);
242	Message ("Chars of words : %8u -> %8u\n", chars[1], keep[1].chars);
243	Message ("Chars of non-words : %8u -> %8u\n", chars[0], keep[0].chars);
244	Message ("Mem usage : %8u -> %8u\n",
245	(Num[0] + Num[1]) * sizeof (char *) + chars[0] + chars[1],
246	(keep[0].num_wds + keep[1].num_wds) * sizeof (char *) +
247	keep[0].chars + keep[1].chars);
248	Message ("Actual mem required : %8u\n", mem_reqd);
249
250	exit (0);
251	}
252
253
254
255
256	static void
257	ReadInWords (char *filename)
258	{
259	FILE *f;
260	unsigned long i;
261	f = open_file (filename, TEXT_STATS_DICT_SUFFIX, "rb",
262	MAGIC_STATS_DICT, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
263
264	fread (&csh, sizeof (csh), 1, f);
265
266	/* [RPAP - Jan 97: Endian Ordering] */
267	NTOHUL(csh.num_docs);
268	NTOHD(csh.num_bytes);
269
270	for (i = 0; i < 2; i++)
271	{
272	frags_stats_header fsh;
273	char *buf;
274	DictWordData *wd;
275	chars[i] = 0;
276
277	fread (&fsh, sizeof (fsh), 1, f);
278	/* [RPAP - Jan 97: Endian Ordering] */
279	NTOHUL(fsh.num_frags);
280	NTOHUL(fsh.mem_for_frags);
281
282	Num[i] = all[i].num_wds = fsh.num_frags;
283
284	/* The +1 on the following line is to leave room for the esc code. */
285	all[i].wd = (DictWordData *) Xmalloc (sizeof (DictWordData ) * Num[i] + 1);
286
287	buf = (char *) Xmalloc (fsh.mem_for_frags);
288	wd = Words[i] = (DictWordData ) Xmalloc (sizeof (DictWordData) Num[i]);
289	unsigned int j;
290	for (j = 0; j < Num[i]; j++, wd++)
291	{
292	int len;
293
294	// read docCount and wordCount
295	wd->read(f);
296
297	// read an mgString
298	len = fgetc (f);
299	wd->word = (u_char *) buf;
300	*buf++ = len;
301	fread (buf, len, 1, f);
302	buf += len;
303	all[i].wd[j] = wd;
304	}
305	chars[i] = fsh.mem_for_frags - fsh.num_frags;
306	}
307	fclose (f);
308	}
309
310
311	static void
312	Alloc_keep_discard (void)
313	{
314	keep[0].num_wds = 0;
315	keep[0].wd = (DictWordData *) Xmalloc ((Num[0] + 1) sizeof (DictWordData *));
316	keep[1].num_wds = 0;
317	keep[1].wd = (DictWordData *) Xmalloc ((Num[1] + 1) sizeof (DictWordData *));
318	discard[0].num_wds = 0;
319	discard[0].wd = (DictWordData *) Xmalloc ((Num[0] + 1) sizeof (DictWordData *));
320	discard[1].num_wds = 0;
321	discard[1].wd = (DictWordData *) Xmalloc ((Num[1] + 1) sizeof (DictWordData *));
322	}
323
324
325	static int
326	sort_comp (const void a, const void b)
327	{
328	DictWordData aa = (DictWordData **) a;
329	DictWordData bb = (DictWordData **) b;
330	return casecompare (aa->word, bb->word); /* [RPAP - Jan 97: Stem Index Change] */
331	}
332
333
334
335	static void
336	SortAndCount_DictData (DictData * dd)
337	{
338	unsigned int i;
339	DictWordData **wd;
340	qsort (dd->wd, dd->num_wds, sizeof (DictWordData *), sort_comp);
341	dd->chars = 0;
342	wd = dd->wd;
343	for (i = 0; i < dd->num_wds; i++, wd++)
344	dd->chars += (*wd)->word[0];
345	}
346
347
348	static void
349	Select_all (void)
350	{
351	unsigned int i;
352	Alloc_keep_discard ();
353	keep[0].num_wds = Num[0];
354	for (i = 0; i < Num[0]; i++)
355	keep[0].wd[i] = Words[0] + i;
356	keep[1].num_wds = Num[1];
357	for (i = 0; i < Num[1]; i++)
358	keep[1].wd[i] = Words[1] + i;
359	SortAndCount_DictData (&keep[0]);
360	SortAndCount_DictData (&keep[1]);
361	}
362
363
364
365
366	static int
367	DecFreqIncWL (void a, void b)
368	{
369	DictWordData aa = (DictWordData **) a;
370	DictWordData bb = (DictWordData **) b;
371	if (aa->docCount > bb->docCount)
372	return -1;
373	else if (aa->docCount < bb->docCount)
374	return 1;
375	else
376	return bb->word[0] - aa->word[0];
377	}
378
379
380	static int
381	OccuranceOrder (void a, void b)
382	{
383	DictWordData aa = (DictWordData **) a;
384	DictWordData bb = (DictWordData **) b;
385	if (aa->words() > bb->words())
386	return 1;
387	else if (aa->words() < bb->words())
388	return -1;
389	else
390	return 0;
391	}
392
393
394	static void
395	Select_on (int k, heap_comp hc)
396	{
397	int i, num, mem;
398	DictWordData **wd;
399
400	Alloc_keep_discard ();
401
402	num = Num[0] + Num[1];
403	wd = (DictWordData *) Xmalloc (num sizeof (DictWordData *));
404	for (i = 0; (unsigned int)i < Num[0]; i++)
405	wd[i] = Words[0] + i;
406	for (i = 0; (unsigned int)i < Num[1]; i++)
407	wd[i + Num[0]] = Words[1] + i;
408
409	heap_build (wd, sizeof (*wd), num, hc);
410
411	mem = 0;
412	while (k > mem && num)
413	{
414	int idx;
415	DictWordData *word = wd[0];
416	#ifdef DEBUG
417	fprintf (stderr, "%4d:%4d:%8d :%8d :%8d : \"%s\"\n",
418	keep[0].num_wds, keep[1].num_wds,
419	mem, word->documents(), word->word, word2str (word->word));
420	#endif
421	mem += sizeof (u_char *) + word->word[0];
422	heap_deletehead (wd, sizeof (*wd), &num, hc);
423	idx = KIND (word);
424	keep[idx].wd[keep[idx].num_wds++] = word;
425	}
426
427	for (i = 0; i < num; i++)
428	{
429	DictWordData *word = wd[i];
430	int idx = KIND (word);
431	discard[idx].wd[discard[idx].num_wds++] = word;
432	}
433	SortAndCount_DictData (&keep[0]);
434	SortAndCount_DictData (&keep[1]);
435	SortAndCount_DictData (&discard[0]);
436	SortAndCount_DictData (&discard[1]);
437
438	assert (keep[0].num_wds + discard[0].num_wds == Num[0]);
439	assert (keep[1].num_wds + discard[1].num_wds == Num[1]);
440
441	}
442
443
444
445	static int
446	BigSaving (void a, void b)
447	{
448	DictWordData aa = (DictWordData **) a;
449	DictWordData bb = (DictWordData **) b;
450	return (aa->saving > bb->saving) ? -1 : (aa->saving < bb->saving);
451	}
452
453	static int
454	SmallSaving (void a, void b)
455	{
456	DictWordData aa = (DictWordData **) a;
457	DictWordData bb = (DictWordData **) b;
458	return (aa->saving < bb->saving) ? -1 : (aa->saving > bb->saving);
459	}
460
461
462	static void
463	CalcCharCounts (DictWordData ** wd, int num,
464	char char_lens[2], char len_lens[2],
465	u_long escape[2])
466	{
467	long char_freqs[2][256];
468	long len_freqs[2][16];
469	huff_data hd;
470	int i;
471	escape[0] = 0;
472	escape[1] = 0;
473	bzero ((char *) char_freqs, sizeof (char_freqs));
474	bzero ((char *) len_freqs, sizeof (len_freqs));
475	for (i = 0; i < num; i++, wd++)
476	{
477	u_long freq = (*wd)->documents();
478	u_char buf = (wd)->word;
479	int len = *buf++;
480	int idx = KIND (*wd);
481	len_freqs[idx][len] += freq;
482	escape[idx] += freq;
483	for (; len; len--, buf++)
484	char_freqs[idx][(u_long) (*buf)] += freq;
485	}
486	Generate_Huffman_Data (256, char_freqs[0], &hd, NULL);
487	char_lens[0] = hd.clens;
488	Generate_Huffman_Data (256, char_freqs[1], &hd, NULL);
489	char_lens[1] = hd.clens;
490
491	Generate_Huffman_Data (16, len_freqs[0], &hd, NULL);
492	len_lens[0] = hd.clens;
493	Generate_Huffman_Data (16, len_freqs[1], &hd, NULL);
494	len_lens[1] = hd.clens;
495	}
496
497
498
499
500
501	inline void CalcBitCostForWordData(DictWordData **word, int num,
502	double freqs_trans[2], u_long escape[2],
503	char * char_lens[2], char *len_lens[2],
504	double esc[2], int num_trans)
505	{
506	int j;
507
508	for (j = 0; j < num; j++, word++)
509	{
510	float cbc, wbc;
511	u_char buf = (word)->word;
512	int len = *buf++;
513	u_long freq = (*word)->documents();
514	int idx = KIND (*word);
515
516	cbc = len_lens[idx][len];
517	for (; len; len--, buf++)
518	cbc += char_lens[idx][(u_long) (*buf)];
519
520	(word)->char_bit_cost = (cbc + esc[idx]) freq;
521
522	wbc = -log2 (freq / (freqs_trans[idx] + escape[idx])) * freq;
523
524	(word)->saving = ((word)->char_bit_cost - wbc) /
525	(sizeof (char ) + (word)->word[0]);
526
527	(*word)->num_trans = num_trans;
528	}
529	}
530
531
532
533	void
534	CalcBitCost (DictWordData ** discard_word, int discard_num,
535	DictWordData ** keep_word, int keep_num, double freqs_trans[2],
536	u_long escape[2], int num_trans)
537	{
538	char *char_lens[2];
539	char *len_lens[2];
540	double esc[2];
541	CalcCharCounts (discard_word, discard_num, char_lens, len_lens, escape);
542	esc[0] = -log2 (escape[0] / (freqs_trans[0] + escape[0]));
543	esc[1] = -log2 (escape[1] / (freqs_trans[1] + escape[1]));
544	CalcBitCostForWordData(discard_word, discard_num, freqs_trans, escape,
545	char_lens, len_lens, esc, num_trans);
546	CalcBitCostForWordData(keep_word, keep_num, freqs_trans, escape,
547	char_lens, len_lens, esc, num_trans);
548	Xfree (char_lens[0]);
549	Xfree (char_lens[1]);
550	Xfree (len_lens[0]);
551	Xfree (len_lens[1]);
552	}
553
554	inline void m3_transferWord(DictWordData *toHeap, int toNum, heap_comp toSaving,
555	DictWordData *fromHeap, int fromNum, heap_comp fromSaving,
556	int num_trans, double freqs_trans[], int mem, double *total,
557	int *count)
558	{
559	/* Transfer the word at the top of the keep heap to the
560	discard heap. */
561	DictWordData *word = fromHeap[0];
562	int idx = KIND (word);
563	heap_deletehead (fromHeap, sizeof (word), fromNum, fromSaving);
564	toHeap[*toNum] = word;
565	heap_additem (toHeap, sizeof (word), toNum, toSaving);
566	freqs_trans[idx] -= word->documents();
567	mem = (mem) - sizeof (u_char *) + word->word[0];
568	*num_trans += 1;
569	*total += word->saving;
570	*count += 1;
571	#ifdef DEBUG
572	fprintf (stderr,
573	"KEEP -> DISCARD %8d :%8d :%8.0f :%8.0f : \"%s\"\n",
574	*mem, word->documents(), word->char_bit_cost,
575	word->saving, word2str (word->word));
576	#endif
577	}
578
579	inline void m3_storeWord(DictWordData **wordHeap, int wordNum, int num_trans,
580	double freqs_trans[], u_long escape[], heap_comp hc)
581	{
582	DictWordData *word = wordHeap[0];
583	int idx = KIND (word);
584	float wbc;
585	#ifdef DEBUG1
586	fprintf (stderr, "KEEP \"%s\" %.2f ->", word2str (word->word),
587	word->saving);
588	#endif
589	wbc = -log2 (word->documents() / (freqs_trans[idx] + escape[idx])) *
590	word->documents();
591	word->saving = (word->char_bit_cost - wbc) /
592	(sizeof (char *) + word->word[0]);
593	#ifdef DEBUG1
594	fprintf (stderr, " %.2f\n", word->saving);
595	#endif
596	word->num_trans = num_trans;
597	heap_changedhead (wordHeap, sizeof (word), wordNum, hc);
598	}
599
600
601	static void
602	Method3 (int k)
603	{
604	int i, keep_num, discard_num, mem, num_trans, recalc_reqd;
605	int keep_to_discard = 0;
606	int discard_to_keep = 0;
607	int recalcs = 0;
608	double freqs_trans[2], total;
609	u_long escape[2];
610	DictWordData keep_heap, discard_heap;
611
612	freqs_trans[0] = freqs_trans[1] = 0;
613	num_trans = 0;
614
615	discard_num = Num[0] + Num[1];
616	discard_heap = (DictWordData *) Xmalloc (discard_num sizeof (DictWordData *));
617
618	keep_num = 0;
619	keep_heap = (DictWordData *) Xmalloc (discard_num sizeof (DictWordData *));
620
621
622	for (i = 0; (unsigned int)i < Num[0]; i++)
623	discard_heap[i] = Words[0] + i;
624	for (i = 0; (unsigned int)i < Num[1]; i++)
625	discard_heap[i + Num[0]] = Words[1] + i;
626
627	heap_build (discard_heap, sizeof (*discard_heap), discard_num, DecFreqIncWL);
628
629	mem = 0;
630	while (k > mem && discard_num)
631	{
632	DictWordData *word = discard_heap[0];
633	mem += sizeof (u_char *) + word->word[0];
634	heap_deletehead (discard_heap, sizeof (word), &discard_num, DecFreqIncWL);
635	keep_heap[keep_num++] = word;
636	freqs_trans[KIND (word)] += word->documents();
637	num_trans++;
638	}
639
640	CalcBitCost (discard_heap, discard_num, keep_heap, keep_num,
641	freqs_trans, escape, num_trans);
642	heap_build (discard_heap, sizeof (*discard_heap), discard_num, BigSaving);
643	heap_build (keep_heap, sizeof (*keep_heap), keep_num, SmallSaving);
644
645	total = 0;
646	recalc_reqd = 0;
647	while (keep_num && discard_num)
648	{
649	if ((keep_num && keep_heap[0]->num_trans < (unsigned)num_trans) \|\|
650	(discard_num && discard_heap[0]->num_trans < (unsigned)num_trans))
651	{
652	if (keep_num && keep_heap[0]->num_trans < (unsigned)num_trans)
653	{
654	m3_storeWord(keep_heap, keep_num, num_trans, freqs_trans, escape, SmallSaving);
655	}
656
657	if (discard_num && discard_heap[0]->num_trans < (unsigned)num_trans)
658	{
659	m3_storeWord(discard_heap, discard_num, num_trans, freqs_trans, escape, BigSaving);
660	}
661	}
662	else if (keep_heap[0]->saving < discard_heap[0]->saving)
663	{
664	assert (keep_num && discard_num);
665	if (keep_num && mem + sizeof (char *) + discard_heap[0]->word[0] > (unsigned)k)
666	{
667	/* Transfer the word at the top of the keep heap to the
668	discard heap. */
669	m3_transferWord(discard_heap, &discard_num, BigSaving,
670	keep_heap, &keep_num, SmallSaving,
671	&num_trans, freqs_trans, &mem, &total, &keep_to_discard);
672	}
673	else
674	{
675	/* Transfer the word at the top of the discard heap to the
676	keep heap. */
677	m3_transferWord(keep_heap, &keep_num, SmallSaving,
678	discard_heap, &discard_num, BigSaving,
679	&num_trans, freqs_trans, &mem, &total, &discard_to_keep);
680	}
681
682	recalc_reqd = 1;
683
684	}
685	else
686	{
687	if (recalc_reqd == 0)
688	break;
689	#ifdef DEBUG1
690	fprintf (stderr, "--------------\n");
691	#endif
692	if (recalcs == MAX_RECALCULATIONS)
693	break;
694	CalcBitCost (discard_heap, discard_num, keep_heap, keep_num,
695	freqs_trans, escape, num_trans);
696	heap_build (discard_heap, sizeof (*discard_heap),
697	discard_num, BigSaving);
698	heap_build (keep_heap, sizeof (keep_heap), keep_num, SmallSaving);
699	recalc_reqd = 0;
700	recalcs++;
701	}
702	}
703
704	Alloc_keep_discard ();
705
706	for (i = 0; i < discard_num; i++)
707	{
708	DictWordData *word = discard_heap[i];
709	int idx = KIND (word);
710	assert (IsWord (word) \|\| IsNonWord (word));
711	discard[idx].wd[discard[idx].num_wds++] = word;
712	}
713	for (i = 0; i < keep_num; i++)
714	{
715	DictWordData *word = keep_heap[i];
716	int idx = KIND (word);
717	assert (IsWord (word) \|\| IsNonWord (word));
718	keep[idx].wd[keep[idx].num_wds++] = word;
719	}
720	SortAndCount_DictData (&keep[0]);
721	SortAndCount_DictData (&keep[1]);
722	SortAndCount_DictData (&discard[0]);
723	SortAndCount_DictData (&discard[1]);
724
725	assert (keep[0].num_wds + discard[0].num_wds == Num[0]);
726	assert (keep[1].num_wds + discard[1].num_wds == Num[1]);
727	Message ("Keep -> Discard : %8d", keep_to_discard);
728	Message ("Discard -> Keep : %8d", discard_to_keep);
729	Message ("Huffman Recalculations : %8d", recalcs);
730	if (recalcs == MAX_RECALCULATIONS)
731	Message ("WARNING: The number of recalculations == %d", MAX_RECALCULATIONS);
732	}
733
734
735
736
737
738
739
740
741
742
743	/****************************************************************************
744	*** ***
745	*** Dictionary saving code ***
746	*** ***
747	****************************************************************************/
748
749
750
751	static void
752	Write_cdh (FILE * f, compression_dict_header * cdh)
753	{
754	/* [RPAP - Jan 97: Endian Ordering] */
755	int i;
756	compression_dict_header tmp = *cdh;
757	HTONUL(tmp.dict_type);
758	HTONUL(tmp.novel_method);
759	for (i = 0; i < TEXT_PARAMS; i++)
760	HTONUL(tmp.params[i]);
761	HTONUL(tmp.num_words[0]);
762	HTONUL(tmp.num_words[1]);
763	HTONUL(tmp.num_word_chars[0]);
764	HTONUL(tmp.num_word_chars[1]);
765	HTONUL(tmp.lookback);
766
767	fwrite (&tmp, sizeof (tmp), 1, f);
768	}
769
770
771	static void
772	Write_words (FILE * f, DictData * dd)
773	{
774	unsigned int i;
775	u_char curr, prev = NULL;
776	for (i = 0; i < dd->num_wds; i++)
777	{
778	int len;
779	curr = dd->wd[i]->word;
780	if (prev)
781	/* look for prefix match with prev string */
782	len = prefixlen (prev, curr);
783	else
784	len = 0;
785	fputc ((len << 4) + (curr[0] - len), f);
786	fwrite (curr + len + 1, sizeof (u_char), curr[0] - len, f);
787	prev = curr;
788	}
789
790	}
791
792
793	static int
794	Uncompressed_size (DictData * dd)
795	{
796	unsigned int i, us;
797	for (us = i = 0; i < dd->num_wds; i++)
798	us += dd->wd[i]->word[0];
799	return us;
800	}
801
802
803	static u_long
804	Write_data (FILE * f, DictData * dd, int lookback)
805	{
806	u_long mem_reqd;
807	huff_data *hd;
808	int i;
809	u_long us = dd->num_wds;
810	long *freqs;
811	u_long huff_words_size[MAX_HUFFCODE_LEN + 1];
812	u_long lencounts[MAX_HUFFCODE_LEN + 1];
813	u_char *lastword[MAX_HUFFCODE_LEN + 1];
814
815	if (!(freqs = new long [dd->num_wds]))
816	FatalError (1, "Unable to allocate memory for freqs");
817
818	for (i = 0; (unsigned)i < dd->num_wds; i++)
819	{
820	freqs[i] = dd->wd[i]->documents();
821	us += dd->wd[i]->word[0];
822	}
823
824	if (!(hd = Generate_Huffman_Data (dd->num_wds, freqs, NULL, NULL)))
825	FatalError (1, "Unable to allocate memory for huffman data");
826
827	delete (freqs);
828	freqs = NULL;
829
830	if (Write_Huffman_Data (f, hd) == -1)
831	FatalError (1, "Unable to write huffman data");
832
833	HTONUL(us); /* [RPAP - Jan 97: Endian Ordering] */
834	fwrite (&us, sizeof (us), 1, f);
835	NTOHUL(us); /* [RPAP - Jan 97: Endian Ordering] */
836
837
838	/* Calculate the amount of memory that will be required to store the text for
839	each different huffman code len. Every 1<<lookback words for each different
840	codelen length will not be prefixed by previous strings. */
841
842
843	bzero ((char *) &huff_words_size, sizeof (huff_words_size));
844	bzero ((char *) &lencounts, sizeof (lencounts));
845
846	mem_reqd = 0;
847
848	for (i = 0; (unsigned)i < dd->num_wds; i++)
849	{
850	int codelen = hd->clens[i];
851	u_char *word = dd->wd[i]->word;
852
853	if (!codelen)
854	FatalError (1, "The length of a code for a word was zero");
855
856	huff_words_size[codelen] += word[0] + 1;
857	mem_reqd += word[0] + (lookback != 0);
858	#if 0
859	if ((lencounts[codelen] & ((1 << lookback) - 1)) == 0)
860	lastword[codelen] = word;
861	else
862	huff_words_size[codelen] -= prefixlen (lastword[codelen], word);
863	#else
864	if ((lencounts[codelen] & ((1 << lookback) - 1)) != 0)
865	{
866	int save = prefixlen (lastword[codelen], word);
867	mem_reqd -= save;
868	huff_words_size[codelen] -= save;
869	}
870	else
871	{
872	mem_reqd += sizeof (u_char *);
873	}
874	lastword[codelen] = word;
875	#endif
876	lencounts[codelen]++;
877	}
878
879	/* [RPAP - Jan 97: Endian Ordering] */
880	for (i = hd->mincodelen; i < hd->maxcodelen + 1; i++)
881	HTONUL(huff_words_size[i]);
882
883	fwrite (huff_words_size + hd->mincodelen, sizeof (*huff_words_size),
884	hd->maxcodelen - hd->mincodelen + 1, f);
885
886	/* [RPAP - Jan 97: Endian Ordering] */
887	for (i = hd->mincodelen; i < hd->maxcodelen + 1; i++)
888	NTOHUL(huff_words_size[i]);
889
890	Write_words (f, dd);
891
892	delete hd->clens;
893	delete hd;
894
895	return mem_reqd;
896	}
897
898
899
900	static void
901	Write_charfreqs (FILE * f, DictData * dd, int words,
902	int zero_freq_permitted)
903	{
904	unsigned int j;
905	long freqs[256];
906	DictWordData **wd = dd->wd;
907	huff_data *hd;
908
909	bzero ((char *) freqs, sizeof (freqs));
910
911	for (j = 0; j < dd->num_wds; j++, wd++)
912	{
913	u_char buf = (wd)->word;
914	int len = *buf++;
915	for (; len; len--, buf++)
916	freqs[(u_long) (buf)] += (wd)->documents();
917	}
918
919	if (!zero_freq_permitted)
920	for (j = 0; j < 256; j++)
921	if (!freqs[j] && PESINAWORD (j) == words)
922	freqs[j] = 1;
923
924	if (!(hd = Generate_Huffman_Data (256, freqs, NULL, NULL)))
925	FatalError (1, "Unable to allocate memory for huffman data");
926
927	if (Write_Huffman_Data (f, hd) == -1)
928	FatalError (1, "Unable to write huffman data");
929
930	delete hd->clens;
931	delete hd;
932	}
933
934
935
936
937	static void
938	Write_wordlens (FILE * f, DictData * dd, int zero_freq_permitted)
939	{
940	unsigned int j;
941	long freqs[16];
942	DictWordData **wd = dd->wd;
943	huff_data *hd;
944
945	bzero ((char *) freqs, sizeof (freqs));
946
947	for (j = 0; j < dd->num_wds; j++, wd++)
948	freqs[(wd)->word[0]] += (wd)->documents();
949
950	if (!zero_freq_permitted)
951	for (j = 0; j < 16; j++)
952	if (!freqs[j])
953	freqs[j] = 1;
954
955	if (!(hd = Generate_Huffman_Data (16, freqs, NULL, NULL)))
956	FatalError (1, "Unable to allocate memory for huffman data");
957
958	if (Write_Huffman_Data (f, hd) == -1)
959	FatalError (1, "Unable to write huffman data");
960
961
962	delete hd->clens;
963	delete hd;
964	}
965
966
967
968	static u_long
969	WriteOutWords (char *file_name, u_long type, int lookback)
970	{
971	FILE *f;
972	int i;
973	u_long mem_reqd = 0;
974
975	compression_dict_header cdh;
976	f = create_file (file_name, TEXT_DICT_SUFFIX, "w+b",
977	MAGIC_DICT, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
978
979	bzero((char *) &cdh, sizeof(cdh));
980
981	cdh.dict_type = type;
982	cdh.novel_method = (type == MG_SEED_DICTIONARY) ? novel_method :
983	MG_NOVEL_HUFFMAN_CHARS;
984
985	cdh.num_words[1] = keep[1].num_wds;
986	cdh.num_words[0] = keep[0].num_wds;
987	cdh.num_word_chars[1] = Uncompressed_size (&keep[1]);
988	cdh.num_word_chars[0] = Uncompressed_size (&keep[0]);
989	cdh.lookback = lookback;
990
991	Write_cdh (f, &cdh);
992
993	for (i = 0; i < 2; i++)
994	switch (type)
995	{
996	case MG_COMPLETE_DICTIONARY:
997	{
998	mem_reqd += Write_data (f, &keep[i], lookback);
999	}
1000	break;
1001	case MG_PARTIAL_DICTIONARY:
1002	{
1003	if (keep[i].num_wds)
1004	{
1005	int j;
1006	DictWordData esc;
1007	esc.docCount = 0;
1008	esc.word = (u_char *) "";
1009	keep[i].wd[keep[i].num_wds++] = &esc;
1010	for (j = 0; (unsigned)j < discard[i].num_wds; j++)
1011	esc.docCount += discard[i].wd[j]->documents();
1012	if (!esc.docCount)
1013	esc.docCount++;
1014	mem_reqd += Write_data (f, &keep[i], lookback);
1015	}
1016	Write_charfreqs (f, &discard[i], i, 1);
1017	Write_wordlens (f, &discard[i], 1);
1018	}
1019	break;
1020	case MG_SEED_DICTIONARY:
1021	{
1022	if (keep[i].num_wds)
1023	{
1024	int j;
1025	DictWordData esc;
1026	esc.docCount = 0;
1027	esc.word = (u_char *) "";
1028	keep[i].wd[keep[i].num_wds++] = &esc;
1029	for (j = 0; (unsigned)j < all[i].num_wds; j++)
1030	if (all[i].wd[j]->documents() == 1)
1031	esc.docCount++;
1032	if (!esc.docCount)
1033	esc.docCount++;
1034	mem_reqd += Write_data (f, &keep[i], lookback);
1035	}
1036	switch (novel_method)
1037	{
1038	case MG_NOVEL_HUFFMAN_CHARS:
1039	Write_charfreqs (f, &all[i], i, 0);
1040	Write_wordlens (f, &all[i], 0);
1041	break;
1042	case MG_NOVEL_DELTA:
1043	break;
1044	case MG_NOVEL_HYBRID:
1045	break;
1046	default:
1047	FatalError (1, "Bad novel method");
1048	}
1049	}
1050	break;
1051	}
1052	fclose (f);
1053	return mem_reqd;
1054	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: