Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: trunk/indexers/mg/src/text/mg_text_estimate.c@ 3745

Last change on this file since 3745 was 3745, checked in by mdewsnip, 21 years ago
Addition of MG package for search and retrieval
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 14.2 KB

Line
1	/**************************************************************************
2	*
3	* mg_text_estimate.c -- Program to estimate the compressed text size
4	* Copyright (C) 1994 Neil Sharman
5	*
6	* This program is free software; you can redistribute it and/or modify
7	* it under the terms of the GNU General Public License as published by
8	* the Free Software Foundation; either version 2 of the License, or
9	* (at your option) any later version.
10	*
11	* This program is distributed in the hope that it will be useful,
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	* GNU General Public License for more details.
15	*
16	* You should have received a copy of the GNU General Public License
17	* along with this program; if not, write to the Free Software
18	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19	*
20	* $Id: mg_text_estimate.c 3745 2003-02-20 21:20:24Z mdewsnip $
21	*
22	**************************************************************************/
23
24	#include "sysfuncs.h"
25
26	#include "memlib.h"
27	#include "messages.h"
28	#include "local_strings.h"
29	#include "bitio_gen.h"
30	#include "bitio_m.h"
31	#include "bitio_m_stdio.h"
32	#include "timing.h"
33	#include "mgheap.h"
34	#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
35
36	#include "mg_files.h"
37	#include "locallib.h"
38	#include "invf.h"
39	#include "text.h"
40	#include "words.h"
41	#include "mg.h"
42	#include "comp_dict.h"
43	#include "hash.h"
44
45
46
47
48
49	/*
50	$Log$
51	Revision 1.1 2003/02/20 21:18:24 mdewsnip
52	Addition of MG package for search and retrieval
53
54	Revision 1.1 1999/08/10 21:18:14 sjboddie
55	renamed mg-1.3d directory mg
56
57	Revision 1.4 1999/01/15 03:05:51 rjmcnab
58
59	Renamed lib/heap to be lib/mgheap (it was causing some problems with
60	some versions of the STL libraries which contained a heap.h)
61
62	Revision 1.3 1998/12/17 09:12:53 rjmcnab
63
64	Altered mg to process utf-8 encoded Unicode. The main changes
65	are in the parsing of the input, the casefolding, and the stemming.
66
67	Revision 1.2 1998/11/25 07:55:48 rjmcnab
68
69	Modified mg to that you can specify the stemmer you want
70	to use via a command line option. You specify it to
71	mg_passes during the build process. The number of the
72	stemmer that you used is stored within the inverted
73	dictionary header and the stemmed dictionary header so
74	the correct stemmer is used in later stages of building
75	and querying.
76
77	Revision 1.1 1998/11/17 09:35:19 rjmcnab
78	* empty log message *
79
80	* Revision 1.3 1994/10/20 03:56:59 tes
81	* I have rewritten the boolean query optimiser and abstracted out the
82	* components of the boolean query.
83	*
84	* Revision 1.2 1994/09/20 04:41:54 tes
85	* For version 1.1
86	*
87	*/
88
89	static char *RCSID = "$Id: mg_text_estimate.c 3745 2003-02-20 21:20:24Z mdewsnip $";
90
91	typedef struct stats_t
92	{
93	u_long freq, occur;
94	u_long len;
95	}
96	stats_t;
97
98	static u_long huff_count = 0;
99	static u_long escape_count = 0;
100
101
102	char mode = 'H';
103
104	/* [RJM 07/97: 4G limit] */
105	static double ReadInWordsStandard (char name, double num_bytes);
106	static double ReadInWordsSpecial (char name, double num_bytes,
107	double *num_extra_bytes,
108	u_long *aux_compressed);
109
110
111
112
113	int main (int argc, char **argv)
114	{
115	char stats_dict = NULL, comp_dict = NULL;
116	ProgTime StartTime;
117	double num_bytes, num_extra_bytes = 0; /* [RJM 07/97: 4G limit] */
118	u_long aux_compressed;
119	int ch;
120	double bytes;
121	opterr = 0;
122	msg_prefix = argv[0];
123	while ((ch = getopt (argc, argv, "BMDYHh")) != -1)
124	switch (ch)
125	{
126	case 'B':
127	Message ("The compressed text size for this method cannot be\n"
128	"estimated exactly, so an upper bound will be calculated.");
129	mode = ch;
130	break;
131	case 'M':
132	Message ("The compressed text size for this method cannot be\n"
133	"estimated exactly, so an upper bound will be calculated.");
134	mode = 'Y';
135	break;
136	case 'D':
137	case 'Y':
138	case 'H':
139	mode = ch;
140	break;
141	case 'h':
142	case '?':
143	fprintf (stderr, "usage: %s [-h] [B\|M\|D\|Y\|H] stats_dict comp_dict\n",
144	argv[0]);
145	exit (1);
146	}
147
148	if (optind < argc)
149	stats_dict = argv[optind++];
150	else
151	stats_dict = NULL;
152	if (optind < argc)
153	comp_dict = argv[optind++];
154	else
155	{
156	fprintf (stderr, "usage: %s [-h] [B\|M\|D\|Y\|H] stats_dict comp_dict\n",
157	argv[0]);
158	exit (1);
159	}
160
161
162
163	GetTime (&StartTime);
164
165	if (LoadCompressionDictionary (comp_dict) == COMPERROR)
166	exit (1);
167
168	if (mode == 'H')
169	bytes = ReadInWordsStandard (stats_dict, &num_bytes);
170	else
171	bytes = ReadInWordsSpecial (stats_dict, &num_bytes, &num_extra_bytes,
172	&aux_compressed);
173
174
175	Message ("Compressed Text %.2f Mb ( %.0f bytes %0.3f %%)",
176	bytes / 1024.0 / 1024, bytes, bytes * 100 / num_bytes);
177	Message ("Words portion of the dictionary %.2f Mb ( %.0f bytes %0.3f %%)",
178	Words_disk / 1024.0 / 1024, Words_disk * 1.0,
179	(Words_disk * 100.0) / num_bytes);
180	if (mode == 'H')
181	Message ("Huffman info for chars in dictionary %.2f Mb ( %.0f bytes %0.3f %%)",
182	Chars_disk / 1024.0 / 1024, Chars_disk * 1.0,
183	(Chars_disk * 100.0) / num_bytes);
184	else
185	{
186	Message ("Aux dictionary (Uncompressed) %.2f Mb ( %.0f bytes %0.3f %%)",
187	num_extra_bytes / 1024.0 / 1024, num_extra_bytes * 1.0,
188	(num_extra_bytes * 100.0) / num_bytes);
189	Message ("Aux dictionary (Compressed) %.2f Mb ( %.0f bytes %0.3f %%)",
190	aux_compressed / 1024.0 / 1024, aux_compressed * 1.0,
191	(aux_compressed * 100.0) / num_bytes);
192	}
193	Message ("Words and non-words in the text %u", huff_count + escape_count);
194	Message ("Words and non-words coded by escapes %u ( %0.3f %%)", escape_count,
195	escape_count * 100.0 / huff_count);
196	Message ("%s", ElapsedTime (&StartTime, NULL));
197	return 0;
198	}
199
200
201
202
203
204	static double
205	ReadInWordsStandard (char name, double num_bytes)
206	{
207	compression_stats_header csh;
208	FILE *f;
209	u_long which;
210	double bytes, bits = 0;
211
212	f = open_named_file (name, "rb", MAGIC_STATS_DICT, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
213
214	fread (&csh, sizeof (csh), 1, f);
215
216	/* [RPAP - Jan 97: Endian Ordering] */
217	NTOHUL(csh.num_docs);
218	NTOHD(csh.num_bytes); /* [RJM 07/97: 4G limit] */
219
220	for (which = 0; which < 2; which++)
221	{
222	frags_stats_header fsh;
223	int j;
224	fread (&fsh, sizeof (fsh), 1, f);
225
226	/* [RPAP - Jan 97: Endian Ordering] */
227	NTOHUL(fsh.num_frags);
228	NTOHUL(fsh.mem_for_frags);
229
230	for (j = 0; j < fsh.num_frags; j++)
231	{
232	int len, freq, res, occur_num;
233	u_char Word[16];
234	fread (&freq, sizeof (freq), 1, f);
235	fread (&occur_num, sizeof (occur_num), 1, f);
236	len = fgetc (f);
237
238	/* [RPAP - Jan 97: Endian Ordering] */
239	NTOHSI(freq);
240	NTOHSI(occur_num);
241
242	Word[0] = len;
243	fread (Word + 1, len, 1, f);
244
245	/* Search the hash table for Word */
246	if (ht[which])
247	{
248	register unsigned long hashval, step;
249	register int tsize = ht[which]->size;
250	register u_char **wptr;
251	HASH (hashval, step, Word, tsize);
252	for (;;)
253	{
254	register u_char *s1;
255	register u_char *s2;
256	register int len;
257	wptr = ht[which]->table[hashval];
258	if (wptr == NULL)
259	{
260	res = COMPERROR;
261	break;
262	}
263
264	/* Compare the words */
265	s1 = Word;
266	s2 = *wptr;
267	len = *s1 + 1;
268	for (; len; len--)
269	if (s1++ != s2++)
270	break;
271
272	if (len)
273	{
274	hashval += step;
275	if (hashval >= tsize)
276	hashval -= tsize;
277	}
278	else
279	{
280	res = ht[which]->table[hashval] - ht[which]->words;
281	break;
282	}
283	}
284	}
285	else
286	res = COMPERROR;
287
288	/* Check that the word was found in the dictionary */
289	if (res == COMPERROR)
290	{
291	escape_count += freq;
292	if (cdh.dict_type == MG_COMPLETE_DICTIONARY)
293	Message ("Unknown word \"%.s\"\n", Word, Word + 1);
294	if (cdh.dict_type == MG_PARTIAL_DICTIONARY)
295	{
296	unsigned i;
297	if (ht[which])
298	{
299	res = ht[which]->hd->num_codes - 1;
300	bits += ht[which]->hd->clens[res] * freq;
301	}
302	bits += lens_huff[which].clens[Word[0]] * freq;
303	for (i = 0; i < Word[0]; i++)
304	bits += char_huff[which].clens[Word[i + 1]] * freq;
305	}
306	if (cdh.dict_type == MG_SEED_DICTIONARY)
307	{
308	unsigned i;
309	if (ht[which])
310	{
311	res = ht[which]->hd->num_codes - 1;
312	bits += ht[which]->hd->clens[res] * freq;
313	}
314	bits += lens_huff[which].clens[Word[0]] * freq;
315	for (i = 0; i < Word[0]; i++)
316	bits += char_huff[which].clens[Word[i + 1]] * freq;
317	}
318	}
319	else
320	{
321	bits += ht[which]->hd->clens[res] * freq;
322	huff_count += freq;
323	}
324	}
325	}
326	fclose (f);
327
328
329	/*
330	* The 5.5 bits added on per document consist of
331	* 1 bit for the word/non-word flag at the start of the document.
332	* 1 bit on average for document termination.
333	* 3.5 bits on average for padding to a byte boundary.
334	*/
335	bits += 5.5 * csh.num_docs;
336
337	bytes = bits / 8 + sizeof (compressed_text_header) + sizeof (u_long);
338
339	*num_bytes = csh.num_bytes;
340
341	return bytes;
342	}
343
344
345	static int
346	stats_comp (const void a, const void b)
347	{
348	return ((stats_t ) a)->occur - ((stats_t ) b)->occur;
349	}
350
351
352
353
354
355	static double
356	ReadInWordsSpecial (char name, double num_bytes,
357	double *num_extra_bytes,
358	u_long *aux_compressed)
359	{
360	compression_stats_header csh;
361	FILE *f;
362	u_long magic, which;
363	double bytes, bits = 0;
364	double size[2];
365
366	if (!(f = fopen (name, "rb"))) /* [RPAP - Feb 97: WIN32 Port] */
367	FatalError (1, "Unable to open file \"%s\"\n", name);
368
369	if (fread (&magic, sizeof (magic), 1, f) != 1)
370	FatalError (1, "Unable to read magic number");
371
372	NTOHUL(magic); /* [RPAP - Jan 97: Endian Ordering] */
373
374	if (magic != MAGIC_STATS_DICT)
375	FatalError (1, "Bad magic number");
376
377	fread (&csh, sizeof (csh), 1, f);
378
379	/* [RPAP - Jan 97: Endian Ordering] */
380	NTOHUL(csh.num_docs);
381	NTOHD(csh.num_bytes); /* [RJM 07/97: 4G limit] */
382
383	for (which = 0; which < 2; which++)
384	{
385	stats_t *stats;
386	frags_stats_header fsh;
387	int j, num = 0;
388	long lens[16], chars[256];
389
390	bzero ((char *) lens, sizeof (lens));
391	bzero ((char *) chars, sizeof (chars));
392
393	fread (&fsh, sizeof (fsh), 1, f);
394
395	/* [RPAP - Jan 97: Endian Ordering] */
396	NTOHUL(fsh.num_frags);
397	NTOHUL(fsh.mem_for_frags);
398
399	stats = Xmalloc (fsh.num_frags * sizeof (*stats));
400	if (!stats)
401	FatalError (1, "Unable to allocate memory for stats");
402	for (j = 0; j < fsh.num_frags; j++)
403	{
404	int len, res;
405	u_char Word[16];
406	fread (&stats[num].freq, sizeof (stats[num].freq), 1, f);
407	fread (&stats[num].occur, sizeof (stats[num].occur), 1, f);
408	len = fgetc (f);
409
410	/* [RPAP - Jan 97: Endian Ordering] */
411	NTOHSI(stats[num].freq);
412	NTOHSI(stats[num].occur);
413
414	stats[num].len = len + 1;
415
416	Word[0] = len;
417	fread (Word + 1, len, 1, f);
418
419	/* Search the hash table for Word */
420	if (ht[which])
421	{
422	register unsigned long hashval, step;
423	register int tsize = ht[which]->size;
424	register u_char **wptr;
425	HASH (hashval, step, Word, tsize);
426	for (;;)
427	{
428	register u_char *s1;
429	register u_char *s2;
430	register int len;
431	wptr = ht[which]->table[hashval];
432	if (wptr == NULL)
433	{
434	res = COMPERROR;
435	break;
436	}
437
438	/* Compare the words */
439	s1 = Word;
440	s2 = *wptr;
441	len = *s1 + 1;
442	for (; len; len--)
443	if (s1++ != s2++)
444	break;
445
446	if (len)
447	{
448	hashval += step;
449	if (hashval >= tsize)
450	hashval -= tsize;
451	}
452	else
453	{
454	res = ht[which]->table[hashval] - ht[which]->words;
455	break;
456	}
457	}
458	}
459	else
460	res = COMPERROR;
461
462	assert (stats[num].freq > 0);
463
464	/* Check that the word was found in the dictionary */
465	if (res == COMPERROR)
466	{
467	if (cdh.dict_type == MG_COMPLETE_DICTIONARY)
468	Message ("Unknown word \"%.s\"\n", Word, Word + 1);
469	if ((cdh.dict_type == MG_PARTIAL_DICTIONARY \|\|
470	cdh.dict_type == MG_SEED_DICTIONARY) &&
471	ht[which])
472	{
473	int k;
474	res = ht[which]->hd->num_codes - 1;
475	bits += ht[which]->hd->clens[res] * stats[num].freq;
476	lens[Word[0]]++;
477	for (k = 0; k < (int) Word[0]; k++)
478	chars[Word[k + 1]]++;
479	}
480	num++;
481	}
482	else
483	{
484	bits += ht[which]->hd->clens[res] * stats[num].freq;
485	}
486	}
487
488	{
489	int j, num_buckets = 0;
490	int start[10], end[10];
491	long flens[16], fchars[256];
492
493	for (j = 0; j < 256; j++)
494	if (!chars[j] && PESINAWORD (j) == which)
495	fchars[j] = 1;
496	else
497	fchars[j] = chars[j];
498	for (j = 0; j < 16; j++)
499	if (!lens[j])
500	flens[j] = 1;
501	else
502	flens[j] = lens[j];
503
504	size[which] = Calculate_Huffman_Size (16, flens, lens) +
505	Calculate_Huffman_Size (256, fchars, chars);
506
507
508	qsort (stats, num, sizeof (*stats), stats_comp);
509
510
511	if (mode == 'Y')
512	{
513	int k;
514	num_buckets = 1;
515	start[0] = 0;
516	end[0] = cdh.num_words[which] - 1;
517	Message ("There are %d novel %swords", num, which ? "" : "non-");
518	while (end[num_buckets - 1] < num)
519	{
520	start[num_buckets] = end[num_buckets - 1] + 1;
521	end[num_buckets] = start[num_buckets] +
522	(end[num_buckets - 1] - start[num_buckets - 1]) * 2;
523	num_buckets++;
524	}
525	for (k = 0; k < num_buckets; k++)
526	Message ("Bucket %d %d <= x <= %d\n", k + 1, start[k], end[k]);
527	}
528
529	for (j = 0; j < num; j++)
530	{
531	(*num_extra_bytes) += stats[j].len;
532	switch (mode)
533	{
534	case 'D':
535	bits += BIO_Delta_Length (j + 1) * stats[j].freq;
536	break;
537	case 'B':
538	bits += BIO_Binary_Length (j + 1, num) * stats[j].freq;
539	break;
540	case 'Y':
541	{
542	int k = 0;
543	while (j > end[k])
544	k++;
545	if (!(j - start[k] + 1 >= 1 &&
546	j - start[k] + 1 <= end[k] - start[k] + 1))
547	{
548	Message ("k = %d, j = %d", k, j);
549	assert (j - start[k] + 1 >= 1 &&
550	j - start[k] + 1 <= end[k] - start[k] + 1);
551	}
552	else
553	{
554	bits += (BIO_Gamma_Length (k + 1) +
555	BIO_Binary_Length (j - start[k] + 1,
556	end[k] - start[k] + 1)) *
557	stats[j].freq;
558	}
559	}
560	}
561	}
562	if (mode == 'B' && num)
563	bits += BIO_Delta_Length (num) * csh.num_docs;
564	}
565	Xfree (stats);
566	}
567	fclose (f);
568
569	*aux_compressed = (size[0] + size[1]) / 8;
570
571	/*
572	* The 5.5 bits added on per document consist of
573	* 1 bit for the word/non-word flag at the start of the document.
574	* 1 bit on average for document termination.
575	* 3.5 bits on average for padding to a byte boundary.
576	*/
577	bits += 5.5 * csh.num_docs;
578
579	bytes = bits / 8 + sizeof (compressed_text_header) + sizeof (u_long);
580
581	*num_bytes = csh.num_bytes;
582
583	return bytes;
584	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: