source: trunk/gsdl/packages/mg/src/text/mg_text_estimate.c@ 1014

Last change on this file since 1014 was 439, checked in by sjboddie, 25 years ago

renamed mg-1.3d directory mg

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 14.1 KB
Line 
1/**************************************************************************
2 *
3 * mg_text_estimate.c -- Program to estimate the compressed text size
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: mg_text_estimate.c 439 1999-08-10 21:23:37Z sjboddie $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25
26#include "memlib.h"
27#include "messages.h"
28#include "local_strings.h"
29#include "bitio_gen.h"
30#include "bitio_m.h"
31#include "bitio_m_stdio.h"
32#include "timing.h"
33#include "mgheap.h"
34#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
35
36#include "mg_files.h"
37#include "locallib.h"
38#include "invf.h"
39#include "text.h"
40#include "words.h"
41#include "mg.h"
42#include "comp_dict.h"
43#include "hash.h"
44
45
46
47
48
49/*
50 $Log$
51 Revision 1.1 1999/08/10 21:18:14 sjboddie
52 renamed mg-1.3d directory mg
53
54 Revision 1.4 1999/01/15 03:05:51 rjmcnab
55
56 Renamed lib/heap to be lib/mgheap (it was causing some problems with
57 some versions of the STL libraries which contained a heap.h)
58
59 Revision 1.3 1998/12/17 09:12:53 rjmcnab
60
61 Altered mg to process utf-8 encoded Unicode. The main changes
62 are in the parsing of the input, the casefolding, and the stemming.
63
64 Revision 1.2 1998/11/25 07:55:48 rjmcnab
65
66 Modified mg to that you can specify the stemmer you want
67 to use via a command line option. You specify it to
68 mg_passes during the build process. The number of the
69 stemmer that you used is stored within the inverted
70 dictionary header and the stemmed dictionary header so
71 the correct stemmer is used in later stages of building
72 and querying.
73
74 Revision 1.1 1998/11/17 09:35:19 rjmcnab
75 *** empty log message ***
76
77 * Revision 1.3 1994/10/20 03:56:59 tes
78 * I have rewritten the boolean query optimiser and abstracted out the
79 * components of the boolean query.
80 *
81 * Revision 1.2 1994/09/20 04:41:54 tes
82 * For version 1.1
83 *
84 */
85
86static char *RCSID = "$Id: mg_text_estimate.c 439 1999-08-10 21:23:37Z sjboddie $";
87
/* Per-fragment statistics record used by ReadInWordsSpecial: one entry is
   kept for each novel word/non-word fragment (i.e. one not found in the
   compression dictionary), read from the stats-dict file. */
typedef struct stats_t
  {
    u_long freq, occur;		/* freq/occurrence counts read from the stats file */
    u_long len;			/* stored size: fragment bytes + 1 length byte */
  }
stats_t;
94
95static u_long huff_count = 0;
96static u_long escape_count = 0;
97
98
99char mode = 'H';
100
101/* [RJM 07/97: 4G limit] */
102static double ReadInWordsStandard (char *name, double *num_bytes);
103static double ReadInWordsSpecial (char *name, double *num_bytes,
104 double *num_extra_bytes,
105 u_long *aux_compressed);
106
107
108
109
110int main (int argc, char **argv)
111{
112 char *stats_dict = NULL, *comp_dict = NULL;
113 ProgTime StartTime;
114 double num_bytes, num_extra_bytes = 0; /* [RJM 07/97: 4G limit] */
115 u_long aux_compressed;
116 int ch;
117 double bytes;
118 opterr = 0;
119 msg_prefix = argv[0];
120 while ((ch = getopt (argc, argv, "BMDYHh")) != -1)
121 switch (ch)
122 {
123 case 'B':
124 Message ("The compressed text size for this method cannot be\n"
125 "estimated exactly, so an upper bound will be calculated.");
126 mode = ch;
127 break;
128 case 'M':
129 Message ("The compressed text size for this method cannot be\n"
130 "estimated exactly, so an upper bound will be calculated.");
131 mode = 'Y';
132 break;
133 case 'D':
134 case 'Y':
135 case 'H':
136 mode = ch;
137 break;
138 case 'h':
139 case '?':
140 fprintf (stderr, "usage: %s [-h] [B|M|D|Y|H] stats_dict comp_dict\n",
141 argv[0]);
142 exit (1);
143 }
144
145 if (optind < argc)
146 stats_dict = argv[optind++];
147 else
148 stats_dict = NULL;
149 if (optind < argc)
150 comp_dict = argv[optind++];
151 else
152 {
153 fprintf (stderr, "usage: %s [-h] [B|M|D|Y|H] stats_dict comp_dict\n",
154 argv[0]);
155 exit (1);
156 }
157
158
159
160 GetTime (&StartTime);
161
162 if (LoadCompressionDictionary (comp_dict) == COMPERROR)
163 exit (1);
164
165 if (mode == 'H')
166 bytes = ReadInWordsStandard (stats_dict, &num_bytes);
167 else
168 bytes = ReadInWordsSpecial (stats_dict, &num_bytes, &num_extra_bytes,
169 &aux_compressed);
170
171
172 Message ("Compressed Text %.2f Mb ( %.0f bytes %0.3f %%)",
173 bytes / 1024.0 / 1024, bytes, bytes * 100 / num_bytes);
174 Message ("Words portion of the dictionary %.2f Mb ( %.0f bytes %0.3f %%)",
175 Words_disk / 1024.0 / 1024, Words_disk * 1.0,
176 (Words_disk * 100.0) / num_bytes);
177 if (mode == 'H')
178 Message ("Huffman info for chars in dictionary %.2f Mb ( %.0f bytes %0.3f %%)",
179 Chars_disk / 1024.0 / 1024, Chars_disk * 1.0,
180 (Chars_disk * 100.0) / num_bytes);
181 else
182 {
183 Message ("Aux dictionary (Uncompressed) %.2f Mb ( %.0f bytes %0.3f %%)",
184 num_extra_bytes / 1024.0 / 1024, num_extra_bytes * 1.0,
185 (num_extra_bytes * 100.0) / num_bytes);
186 Message ("Aux dictionary (Compressed) %.2f Mb ( %.0f bytes %0.3f %%)",
187 aux_compressed / 1024.0 / 1024, aux_compressed * 1.0,
188 (aux_compressed * 100.0) / num_bytes);
189 }
190 Message ("Words and non-words in the text %u", huff_count + escape_count);
191 Message ("Words and non-words coded by escapes %u ( %0.3f %%)", escape_count,
192 escape_count * 100.0 / huff_count);
193 Message ("%s", ElapsedTime (&StartTime, NULL));
194 return 0;
195}
196
197
198
199
200
/*
 * ReadInWordsStandard -- estimate the compressed text size for the plain
 * Huffman method (mode 'H').
 *
 * Reads the stats-dict file `name` (opened via open_named_file with magic
 * MAGIC_STATS_DICT, aborting on failure), then for each of the two lexicons
 * (which = 0/1: non-words and words) looks every fragment up in the global
 * hash tables ht[] built by LoadCompressionDictionary.  Found fragments add
 * their Huffman code length * freq to the running bit total; fragments
 * missing from a partial/seed dictionary are costed as an escape code plus
 * length- and character-Huffman codes (lens_huff/char_huff).
 *
 * On return *num_bytes holds the raw text size from the stats header, and
 * the function's return value is the estimated compressed size in bytes.
 *
 * NOTE(review): fread/fgetc results are unchecked, and Word[] is 16 bytes
 * while len comes from a byte in the file -- presumably fragment lengths
 * are bounded at 15 by the build pipeline; verify against mg_passes.
 */
static double
ReadInWordsStandard (char *name, double *num_bytes)
{
  compression_stats_header csh;
  FILE *f;
  u_long which;
  double bytes, bits = 0;

  f = open_named_file (name, "rb", MAGIC_STATS_DICT, MG_ABORT);	/* [RPAP - Feb 97: WIN32 Port] */

  fread (&csh, sizeof (csh), 1, f);

  /* [RPAP - Jan 97: Endian Ordering] */
  NTOHUL (csh.num_docs);
  NTOHD (csh.num_bytes);	/* [RJM 07/97: 4G limit] */

  /* Two passes over the file: one fragment table per lexicon. */
  for (which = 0; which < 2; which++)
    {
      frags_stats_header fsh;
      int j;
      fread (&fsh, sizeof (fsh), 1, f);

      /* [RPAP - Jan 97: Endian Ordering] */
      NTOHUL (fsh.num_frags);
      NTOHUL (fsh.mem_for_frags);

      for (j = 0; j < fsh.num_frags; j++)
	{
	  int len, freq, res, occur_num;
	  u_char Word[16];
	  fread (&freq, sizeof (freq), 1, f);
	  fread (&occur_num, sizeof (occur_num), 1, f);
	  len = fgetc (f);

	  /* [RPAP - Jan 97: Endian Ordering] */
	  NTOHSI (freq);
	  NTOHSI (occur_num);

	  /* Word is length-prefixed: Word[0] = len, bytes follow. */
	  Word[0] = len;
	  fread (Word + 1, len, 1, f);

	  /* Search the hash table for Word */
	  if (ht[which])
	    {
	      /* Open addressing with double hashing: HASH yields the
	         initial slot and the probe step. */
	      register unsigned long hashval, step;
	      register int tsize = ht[which]->size;
	      register u_char **wptr;
	      HASH (hashval, step, Word, tsize);
	      for (;;)
		{
		  register u_char *s1;
		  register u_char *s2;
		  register int len;
		  wptr = ht[which]->table[hashval];
		  if (wptr == NULL)
		    {
		      /* Empty slot: Word is not in the dictionary. */
		      res = COMPERROR;
		      break;
		    }

		  /* Compare the words (length byte plus len data bytes) */
		  s1 = Word;
		  s2 = *wptr;
		  len = *s1 + 1;
		  for (; len; len--)
		    if (*s1++ != *s2++)
		      break;

		  if (len)
		    {
		      /* Mismatch: advance to the next probe slot, wrapping
		         modulo the table size. */
		      hashval += step;
		      if (hashval >= tsize)
			hashval -= tsize;
		    }
		  else
		    {
		      /* Match: code index = offset of the slot's word
		         pointer within the words array. */
		      res = ht[which]->table[hashval] - ht[which]->words;
		      break;
		    }
		}
	    }
	  else
	    res = COMPERROR;

	  /* Check that the word was found in the dictionary */
	  if (res == COMPERROR)
	    {
	      escape_count += freq;
	      if (cdh.dict_type == MG_COMPLETE_DICTIONARY)
		Message ("Unknown word \"%.*s\"\n", *Word, Word + 1);
	      if (cdh.dict_type == MG_PARTIAL_DICTIONARY)
		{
		  /* Escape cost: the dictionary's escape code (last code)
		     plus a length code plus one character code per byte,
		     all multiplied by the fragment frequency. */
		  unsigned i;
		  if (ht[which])
		    {
		      res = ht[which]->hd->num_codes - 1;
		      bits += ht[which]->hd->clens[res] * freq;
		    }
		  bits += lens_huff[which].clens[Word[0]] * freq;
		  for (i = 0; i < Word[0]; i++)
		    bits += char_huff[which].clens[Word[i + 1]] * freq;
		}
	      if (cdh.dict_type == MG_SEED_DICTIONARY)
		{
		  /* Same escape costing as the partial-dictionary case. */
		  unsigned i;
		  if (ht[which])
		    {
		      res = ht[which]->hd->num_codes - 1;
		      bits += ht[which]->hd->clens[res] * freq;
		    }
		  bits += lens_huff[which].clens[Word[0]] * freq;
		  for (i = 0; i < Word[0]; i++)
		    bits += char_huff[which].clens[Word[i + 1]] * freq;
		}
	    }
	  else
	    {
	      /* In-dictionary fragment: cost is its Huffman code length. */
	      bits += ht[which]->hd->clens[res] * freq;
	      huff_count += freq;
	    }
	}
    }
  fclose (f);

  /*
   * The 5.5 bits added on per document consist of
   *   1   bit for the word/non-word flag at the start of the document.
   *   1   bit on average for document termination.
   *   3.5 bits on average for padding to a byte boundary.
   */
  bits += 5.5 * csh.num_docs;

  bytes = bits / 8 + sizeof (compressed_text_header) + sizeof (u_long);

  *num_bytes = csh.num_bytes;

  return bytes;
}
340
341
342static int
343stats_comp (const void *a, const void *b)
344{
345 return ((stats_t *) a)->occur - ((stats_t *) b)->occur;
346}
347
348
349
350
351
/*
 * ReadInWordsSpecial -- estimate the compressed text size for the
 * auxiliary-dictionary methods (modes 'B', 'D', 'Y'; 'M' is mapped to
 * 'Y' in main).
 *
 * Like ReadInWordsStandard, it walks the stats-dict file and looks each
 * fragment up in the global hash tables ht[].  In-dictionary fragments
 * are costed by their Huffman code length.  Novel fragments are recorded
 * in a stats[] array, costed with an escape code, and later coded by
 * their rank in the (occurrence-sorted) novel-word list using delta
 * ('D'), binary ('B') or bucketed gamma+binary ('Y') codes.
 *
 * Outputs: *num_bytes = raw text size from the stats header;
 * *num_extra_bytes = uncompressed size of the novel-word auxiliary
 * dictionary; *aux_compressed = its Huffman-compressed size estimate;
 * return value = estimated compressed text size in bytes.
 *
 * NOTE(review): fread results after the magic check are unchecked;
 * Word[] is 16 bytes while len comes from a file byte, and the 'Y'
 * bucket arrays are fixed at 10 entries with no bound check on
 * num_buckets -- presumably both are bounded by the build pipeline
 * (fragment length <= 15, doubling buckets cover num quickly); verify.
 */
static double
ReadInWordsSpecial (char *name, double *num_bytes,
		    double *num_extra_bytes,
		    u_long *aux_compressed)
{
  compression_stats_header csh;
  FILE *f;
  u_long magic, which;
  double bytes, bits = 0;
  double size[2];		/* compressed aux-dictionary bits per lexicon */

  if (!(f = fopen (name, "rb")))	/* [RPAP - Feb 97: WIN32 Port] */
    FatalError (1, "Unable to open file \"%s\"\n", name);

  if (fread (&magic, sizeof (magic), 1, f) != 1)
    FatalError (1, "Unable to read magic number");

  NTOHUL (magic);		/* [RPAP - Jan 97: Endian Ordering] */

  if (magic != MAGIC_STATS_DICT)
    FatalError (1, "Bad magic number");

  fread (&csh, sizeof (csh), 1, f);

  /* [RPAP - Jan 97: Endian Ordering] */
  NTOHUL (csh.num_docs);
  NTOHD (csh.num_bytes);	/* [RJM 07/97: 4G limit] */

  /* Two passes: one fragment table per lexicon (non-words, words). */
  for (which = 0; which < 2; which++)
    {
      stats_t *stats;
      frags_stats_header fsh;
      int j, num = 0;		/* num = count of novel fragments kept in stats[] */
      long lens[16], chars[256];	/* length/character histograms of novel fragments */

      bzero ((char *) lens, sizeof (lens));
      bzero ((char *) chars, sizeof (chars));

      fread (&fsh, sizeof (fsh), 1, f);

      /* [RPAP - Jan 97: Endian Ordering] */
      NTOHUL (fsh.num_frags);
      NTOHUL (fsh.mem_for_frags);

      stats = Xmalloc (fsh.num_frags * sizeof (*stats));
      if (!stats)
	FatalError (1, "Unable to allocate memory for stats");
      for (j = 0; j < fsh.num_frags; j++)
	{
	  int len, res;
	  u_char Word[16];
	  fread (&stats[num].freq, sizeof (stats[num].freq), 1, f);
	  fread (&stats[num].occur, sizeof (stats[num].occur), 1, f);
	  len = fgetc (f);

	  /* [RPAP - Jan 97: Endian Ordering] */
	  NTOHSI (stats[num].freq);
	  NTOHSI (stats[num].occur);

	  /* Stored size of the fragment: data bytes + length prefix. */
	  stats[num].len = len + 1;

	  /* Word is length-prefixed: Word[0] = len, bytes follow. */
	  Word[0] = len;
	  fread (Word + 1, len, 1, f);

	  /* Search the hash table for Word */
	  if (ht[which])
	    {
	      /* Open addressing with double hashing, as in
	         ReadInWordsStandard. */
	      register unsigned long hashval, step;
	      register int tsize = ht[which]->size;
	      register u_char **wptr;
	      HASH (hashval, step, Word, tsize);
	      for (;;)
		{
		  register u_char *s1;
		  register u_char *s2;
		  register int len;
		  wptr = ht[which]->table[hashval];
		  if (wptr == NULL)
		    {
		      /* Empty slot: Word is not in the dictionary. */
		      res = COMPERROR;
		      break;
		    }

		  /* Compare the words (length byte plus data bytes) */
		  s1 = Word;
		  s2 = *wptr;
		  len = *s1 + 1;
		  for (; len; len--)
		    if (*s1++ != *s2++)
		      break;

		  if (len)
		    {
		      /* Mismatch: step to the next probe slot, wrapping. */
		      hashval += step;
		      if (hashval >= tsize)
			hashval -= tsize;
		    }
		  else
		    {
		      /* Match: code index = slot offset in the words array. */
		      res = ht[which]->table[hashval] - ht[which]->words;
		      break;
		    }
		}
	    }
	  else
	    res = COMPERROR;

	  assert (stats[num].freq > 0);

	  /* Check that the word was found in the dictionary */
	  if (res == COMPERROR)
	    {
	      if (cdh.dict_type == MG_COMPLETE_DICTIONARY)
		Message ("Unknown word \"%.*s\"\n", *Word, Word + 1);
	      if ((cdh.dict_type == MG_PARTIAL_DICTIONARY ||
		   cdh.dict_type == MG_SEED_DICTIONARY) &&
		  ht[which])
		{
		  /* Escape cost plus histogram updates for the aux
		     dictionary's length/character Huffman estimate. */
		  int k;
		  res = ht[which]->hd->num_codes - 1;
		  bits += ht[which]->hd->clens[res] * stats[num].freq;
		  lens[Word[0]]++;
		  for (k = 0; k < (int) Word[0]; k++)
		    chars[Word[k + 1]]++;
		}
	      /* Keep this novel fragment's record for the rank coding below. */
	      num++;
	    }
	  else
	    {
	      /* In-dictionary fragment: cost is its Huffman code length. */
	      bits += ht[which]->hd->clens[res] * stats[num].freq;
	    }
	}

      {
	int j, num_buckets = 0;
	int start[10], end[10];	/* mode 'Y' bucket rank ranges */
	long flens[16], fchars[256];

	/* Huffman-size inputs must have no zero frequencies for symbols
	   that can occur, so zero counts are floored at 1. */
	for (j = 0; j < 256; j++)
	  if (!chars[j] && PESINAWORD (j) == which)
	    fchars[j] = 1;
	  else
	    fchars[j] = chars[j];
	for (j = 0; j < 16; j++)
	  if (!lens[j])
	    flens[j] = 1;
	  else
	    flens[j] = lens[j];

	/* Estimated compressed bits for this lexicon's aux dictionary:
	   length model + character model. */
	size[which] = Calculate_Huffman_Size (16, flens, lens) +
	  Calculate_Huffman_Size (256, fchars, chars);

	/* Rank novel fragments by occurrence count (stats_comp). */
	qsort (stats, num, sizeof (*stats), stats_comp);

	if (mode == 'Y')
	  {
	    /* Build doubling buckets: bucket 0 spans the dictionary
	       size, each later bucket is twice as wide as the previous. */
	    int k;
	    num_buckets = 1;
	    start[0] = 0;
	    end[0] = cdh.num_words[which] - 1;
	    Message ("There are %d novel %swords", num, which ? "" : "non-");
	    while (end[num_buckets - 1] < num)
	      {
		start[num_buckets] = end[num_buckets - 1] + 1;
		end[num_buckets] = start[num_buckets] +
		  (end[num_buckets - 1] - start[num_buckets - 1]) * 2;
		num_buckets++;
	      }
	    for (k = 0; k < num_buckets; k++)
	      Message ("Bucket %d %d <= x <= %d\n", k + 1, start[k], end[k]);
	  }

	/* Cost each novel fragment by its rank j under the chosen code. */
	for (j = 0; j < num; j++)
	  {
	    (*num_extra_bytes) += stats[j].len;
	    switch (mode)
	      {
	      case 'D':
		bits += BIO_Delta_Length (j + 1) * stats[j].freq;
		break;
	      case 'B':
		bits += BIO_Binary_Length (j + 1, num) * stats[j].freq;
		break;
	      case 'Y':
		{
		  /* Gamma-code the bucket number, binary-code the offset
		     within the bucket. */
		  int k = 0;
		  while (j > end[k])
		    k++;
		  if (!(j - start[k] + 1 >= 1 &&
			j - start[k] + 1 <= end[k] - start[k] + 1))
		    {
		      Message ("k = %d, j = %d", k, j);
		      assert (j - start[k] + 1 >= 1 &&
			      j - start[k] + 1 <= end[k] - start[k] + 1);
		    }
		  else
		    {
		      bits += (BIO_Gamma_Length (k + 1) +
			       BIO_Binary_Length (j - start[k] + 1,
						  end[k] - start[k] + 1)) *
			stats[j].freq;
		    }
		}
	      }
	  }
	/* Mode 'B' also transmits the novel-word count per document. */
	if (mode == 'B' && num)
	  bits += BIO_Delta_Length (num) * csh.num_docs;
      }
      Xfree (stats);
    }
  fclose (f);

  /* Combined compressed aux-dictionary size, converted from bits to bytes. */
  *aux_compressed = (size[0] + size[1]) / 8;

  /*
   * The 5.5 bits added on per document consist of
   *   1   bit for the word/non-word flag at the start of the document.
   *   1   bit on average for document termination.
   *   3.5 bits on average for padding to a byte boundary.
   */
  bits += 5.5 * csh.num_docs;

  bytes = bits / 8 + sizeof (compressed_text_header) + sizeof (u_long);

  *num_bytes = csh.num_bytes;

  return bytes;
}
Note: See TracBrowser for help on using the repository browser.