source: trunk/indexers/mg/src/text/mg_text_estimate.c@ 3745

Last change on this file since 3745 was 3745, checked in by mdewsnip, 21 years ago

Addition of MG package for search and retrieval

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 14.2 KB
Line 
1/**************************************************************************
2 *
3 * mg_text_estimate.c -- Program to estimate the compressed text size
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: mg_text_estimate.c 3745 2003-02-20 21:20:24Z mdewsnip $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25
26#include "memlib.h"
27#include "messages.h"
28#include "local_strings.h"
29#include "bitio_gen.h"
30#include "bitio_m.h"
31#include "bitio_m_stdio.h"
32#include "timing.h"
33#include "mgheap.h"
34#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
35
36#include "mg_files.h"
37#include "locallib.h"
38#include "invf.h"
39#include "text.h"
40#include "words.h"
41#include "mg.h"
42#include "comp_dict.h"
43#include "hash.h"
44
45
46
47
48
49/*
50 $Log$
51 Revision 1.1 2003/02/20 21:18:24 mdewsnip
52 Addition of MG package for search and retrieval
53
54 Revision 1.1 1999/08/10 21:18:14 sjboddie
55 renamed mg-1.3d directory mg
56
57 Revision 1.4 1999/01/15 03:05:51 rjmcnab
58
59 Renamed lib/heap to be lib/mgheap (it was causing some problems with
60 some versions of the STL libraries which contained a heap.h)
61
62 Revision 1.3 1998/12/17 09:12:53 rjmcnab
63
64 Altered mg to process utf-8 encoded Unicode. The main changes
65 are in the parsing of the input, the casefolding, and the stemming.
66
67 Revision 1.2 1998/11/25 07:55:48 rjmcnab
68
69 Modified mg to that you can specify the stemmer you want
70 to use via a command line option. You specify it to
71 mg_passes during the build process. The number of the
72 stemmer that you used is stored within the inverted
73 dictionary header and the stemmed dictionary header so
74 the correct stemmer is used in later stages of building
75 and querying.
76
77 Revision 1.1 1998/11/17 09:35:19 rjmcnab
78 *** empty log message ***
79
80 * Revision 1.3 1994/10/20 03:56:59 tes
81 * I have rewritten the boolean query optimiser and abstracted out the
82 * components of the boolean query.
83 *
84 * Revision 1.2 1994/09/20 04:41:54 tes
85 * For version 1.1
86 *
87 */
88
89static char *RCSID = "$Id: mg_text_estimate.c 3745 2003-02-20 21:20:24Z mdewsnip $";
90
91typedef struct stats_t
92 {
93 u_long freq, occur;
94 u_long len;
95 }
96stats_t;
97
98static u_long huff_count = 0;
99static u_long escape_count = 0;
100
101
102char mode = 'H';
103
104/* [RJM 07/97: 4G limit] */
105static double ReadInWordsStandard (char *name, double *num_bytes);
106static double ReadInWordsSpecial (char *name, double *num_bytes,
107 double *num_extra_bytes,
108 u_long *aux_compressed);
109
110
111
112
113int main (int argc, char **argv)
114{
115 char *stats_dict = NULL, *comp_dict = NULL;
116 ProgTime StartTime;
117 double num_bytes, num_extra_bytes = 0; /* [RJM 07/97: 4G limit] */
118 u_long aux_compressed;
119 int ch;
120 double bytes;
121 opterr = 0;
122 msg_prefix = argv[0];
123 while ((ch = getopt (argc, argv, "BMDYHh")) != -1)
124 switch (ch)
125 {
126 case 'B':
127 Message ("The compressed text size for this method cannot be\n"
128 "estimated exactly, so an upper bound will be calculated.");
129 mode = ch;
130 break;
131 case 'M':
132 Message ("The compressed text size for this method cannot be\n"
133 "estimated exactly, so an upper bound will be calculated.");
134 mode = 'Y';
135 break;
136 case 'D':
137 case 'Y':
138 case 'H':
139 mode = ch;
140 break;
141 case 'h':
142 case '?':
143 fprintf (stderr, "usage: %s [-h] [B|M|D|Y|H] stats_dict comp_dict\n",
144 argv[0]);
145 exit (1);
146 }
147
148 if (optind < argc)
149 stats_dict = argv[optind++];
150 else
151 stats_dict = NULL;
152 if (optind < argc)
153 comp_dict = argv[optind++];
154 else
155 {
156 fprintf (stderr, "usage: %s [-h] [B|M|D|Y|H] stats_dict comp_dict\n",
157 argv[0]);
158 exit (1);
159 }
160
161
162
163 GetTime (&StartTime);
164
165 if (LoadCompressionDictionary (comp_dict) == COMPERROR)
166 exit (1);
167
168 if (mode == 'H')
169 bytes = ReadInWordsStandard (stats_dict, &num_bytes);
170 else
171 bytes = ReadInWordsSpecial (stats_dict, &num_bytes, &num_extra_bytes,
172 &aux_compressed);
173
174
175 Message ("Compressed Text %.2f Mb ( %.0f bytes %0.3f %%)",
176 bytes / 1024.0 / 1024, bytes, bytes * 100 / num_bytes);
177 Message ("Words portion of the dictionary %.2f Mb ( %.0f bytes %0.3f %%)",
178 Words_disk / 1024.0 / 1024, Words_disk * 1.0,
179 (Words_disk * 100.0) / num_bytes);
180 if (mode == 'H')
181 Message ("Huffman info for chars in dictionary %.2f Mb ( %.0f bytes %0.3f %%)",
182 Chars_disk / 1024.0 / 1024, Chars_disk * 1.0,
183 (Chars_disk * 100.0) / num_bytes);
184 else
185 {
186 Message ("Aux dictionary (Uncompressed) %.2f Mb ( %.0f bytes %0.3f %%)",
187 num_extra_bytes / 1024.0 / 1024, num_extra_bytes * 1.0,
188 (num_extra_bytes * 100.0) / num_bytes);
189 Message ("Aux dictionary (Compressed) %.2f Mb ( %.0f bytes %0.3f %%)",
190 aux_compressed / 1024.0 / 1024, aux_compressed * 1.0,
191 (aux_compressed * 100.0) / num_bytes);
192 }
193 Message ("Words and non-words in the text %u", huff_count + escape_count);
194 Message ("Words and non-words coded by escapes %u ( %0.3f %%)", escape_count,
195 escape_count * 100.0 / huff_count);
196 Message ("%s", ElapsedTime (&StartTime, NULL));
197 return 0;
198}
199
200
201
202
203
204static double
205ReadInWordsStandard (char *name, double *num_bytes)
206{
207 compression_stats_header csh;
208 FILE *f;
209 u_long which;
210 double bytes, bits = 0;
211
212 f = open_named_file (name, "rb", MAGIC_STATS_DICT, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
213
214 fread (&csh, sizeof (csh), 1, f);
215
216 /* [RPAP - Jan 97: Endian Ordering] */
217 NTOHUL(csh.num_docs);
218 NTOHD(csh.num_bytes); /* [RJM 07/97: 4G limit] */
219
220 for (which = 0; which < 2; which++)
221 {
222 frags_stats_header fsh;
223 int j;
224 fread (&fsh, sizeof (fsh), 1, f);
225
226 /* [RPAP - Jan 97: Endian Ordering] */
227 NTOHUL(fsh.num_frags);
228 NTOHUL(fsh.mem_for_frags);
229
230 for (j = 0; j < fsh.num_frags; j++)
231 {
232 int len, freq, res, occur_num;
233 u_char Word[16];
234 fread (&freq, sizeof (freq), 1, f);
235 fread (&occur_num, sizeof (occur_num), 1, f);
236 len = fgetc (f);
237
238 /* [RPAP - Jan 97: Endian Ordering] */
239 NTOHSI(freq);
240 NTOHSI(occur_num);
241
242 Word[0] = len;
243 fread (Word + 1, len, 1, f);
244
245 /* Search the hash table for Word */
246 if (ht[which])
247 {
248 register unsigned long hashval, step;
249 register int tsize = ht[which]->size;
250 register u_char **wptr;
251 HASH (hashval, step, Word, tsize);
252 for (;;)
253 {
254 register u_char *s1;
255 register u_char *s2;
256 register int len;
257 wptr = ht[which]->table[hashval];
258 if (wptr == NULL)
259 {
260 res = COMPERROR;
261 break;
262 }
263
264 /* Compare the words */
265 s1 = Word;
266 s2 = *wptr;
267 len = *s1 + 1;
268 for (; len; len--)
269 if (*s1++ != *s2++)
270 break;
271
272 if (len)
273 {
274 hashval += step;
275 if (hashval >= tsize)
276 hashval -= tsize;
277 }
278 else
279 {
280 res = ht[which]->table[hashval] - ht[which]->words;
281 break;
282 }
283 }
284 }
285 else
286 res = COMPERROR;
287
288 /* Check that the word was found in the dictionary */
289 if (res == COMPERROR)
290 {
291 escape_count += freq;
292 if (cdh.dict_type == MG_COMPLETE_DICTIONARY)
293 Message ("Unknown word \"%.*s\"\n", *Word, Word + 1);
294 if (cdh.dict_type == MG_PARTIAL_DICTIONARY)
295 {
296 unsigned i;
297 if (ht[which])
298 {
299 res = ht[which]->hd->num_codes - 1;
300 bits += ht[which]->hd->clens[res] * freq;
301 }
302 bits += lens_huff[which].clens[Word[0]] * freq;
303 for (i = 0; i < Word[0]; i++)
304 bits += char_huff[which].clens[Word[i + 1]] * freq;
305 }
306 if (cdh.dict_type == MG_SEED_DICTIONARY)
307 {
308 unsigned i;
309 if (ht[which])
310 {
311 res = ht[which]->hd->num_codes - 1;
312 bits += ht[which]->hd->clens[res] * freq;
313 }
314 bits += lens_huff[which].clens[Word[0]] * freq;
315 for (i = 0; i < Word[0]; i++)
316 bits += char_huff[which].clens[Word[i + 1]] * freq;
317 }
318 }
319 else
320 {
321 bits += ht[which]->hd->clens[res] * freq;
322 huff_count += freq;
323 }
324 }
325 }
326 fclose (f);
327
328
329 /*
330 * The 5.5 bits added on per document consist of
331 * 1 bit for the word/non-word flag at the start of the document.
332 * 1 bit on average for document termination.
333 * 3.5 bits on average for padding to a byte boundary.
334 */
335 bits += 5.5 * csh.num_docs;
336
337 bytes = bits / 8 + sizeof (compressed_text_header) + sizeof (u_long);
338
339 *num_bytes = csh.num_bytes;
340
341 return bytes;
342}
343
344
345static int
346stats_comp (const void *a, const void *b)
347{
348 return ((stats_t *) a)->occur - ((stats_t *) b)->occur;
349}
350
351
352
353
354
355static double
356ReadInWordsSpecial (char *name, double *num_bytes,
357 double *num_extra_bytes,
358 u_long *aux_compressed)
359{
360 compression_stats_header csh;
361 FILE *f;
362 u_long magic, which;
363 double bytes, bits = 0;
364 double size[2];
365
366 if (!(f = fopen (name, "rb"))) /* [RPAP - Feb 97: WIN32 Port] */
367 FatalError (1, "Unable to open file \"%s\"\n", name);
368
369 if (fread (&magic, sizeof (magic), 1, f) != 1)
370 FatalError (1, "Unable to read magic number");
371
372 NTOHUL(magic); /* [RPAP - Jan 97: Endian Ordering] */
373
374 if (magic != MAGIC_STATS_DICT)
375 FatalError (1, "Bad magic number");
376
377 fread (&csh, sizeof (csh), 1, f);
378
379 /* [RPAP - Jan 97: Endian Ordering] */
380 NTOHUL(csh.num_docs);
381 NTOHD(csh.num_bytes); /* [RJM 07/97: 4G limit] */
382
383 for (which = 0; which < 2; which++)
384 {
385 stats_t *stats;
386 frags_stats_header fsh;
387 int j, num = 0;
388 long lens[16], chars[256];
389
390 bzero ((char *) lens, sizeof (lens));
391 bzero ((char *) chars, sizeof (chars));
392
393 fread (&fsh, sizeof (fsh), 1, f);
394
395 /* [RPAP - Jan 97: Endian Ordering] */
396 NTOHUL(fsh.num_frags);
397 NTOHUL(fsh.mem_for_frags);
398
399 stats = Xmalloc (fsh.num_frags * sizeof (*stats));
400 if (!stats)
401 FatalError (1, "Unable to allocate memory for stats");
402 for (j = 0; j < fsh.num_frags; j++)
403 {
404 int len, res;
405 u_char Word[16];
406 fread (&stats[num].freq, sizeof (stats[num].freq), 1, f);
407 fread (&stats[num].occur, sizeof (stats[num].occur), 1, f);
408 len = fgetc (f);
409
410 /* [RPAP - Jan 97: Endian Ordering] */
411 NTOHSI(stats[num].freq);
412 NTOHSI(stats[num].occur);
413
414 stats[num].len = len + 1;
415
416 Word[0] = len;
417 fread (Word + 1, len, 1, f);
418
419 /* Search the hash table for Word */
420 if (ht[which])
421 {
422 register unsigned long hashval, step;
423 register int tsize = ht[which]->size;
424 register u_char **wptr;
425 HASH (hashval, step, Word, tsize);
426 for (;;)
427 {
428 register u_char *s1;
429 register u_char *s2;
430 register int len;
431 wptr = ht[which]->table[hashval];
432 if (wptr == NULL)
433 {
434 res = COMPERROR;
435 break;
436 }
437
438 /* Compare the words */
439 s1 = Word;
440 s2 = *wptr;
441 len = *s1 + 1;
442 for (; len; len--)
443 if (*s1++ != *s2++)
444 break;
445
446 if (len)
447 {
448 hashval += step;
449 if (hashval >= tsize)
450 hashval -= tsize;
451 }
452 else
453 {
454 res = ht[which]->table[hashval] - ht[which]->words;
455 break;
456 }
457 }
458 }
459 else
460 res = COMPERROR;
461
462 assert (stats[num].freq > 0);
463
464 /* Check that the word was found in the dictionary */
465 if (res == COMPERROR)
466 {
467 if (cdh.dict_type == MG_COMPLETE_DICTIONARY)
468 Message ("Unknown word \"%.*s\"\n", *Word, Word + 1);
469 if ((cdh.dict_type == MG_PARTIAL_DICTIONARY ||
470 cdh.dict_type == MG_SEED_DICTIONARY) &&
471 ht[which])
472 {
473 int k;
474 res = ht[which]->hd->num_codes - 1;
475 bits += ht[which]->hd->clens[res] * stats[num].freq;
476 lens[Word[0]]++;
477 for (k = 0; k < (int) Word[0]; k++)
478 chars[Word[k + 1]]++;
479 }
480 num++;
481 }
482 else
483 {
484 bits += ht[which]->hd->clens[res] * stats[num].freq;
485 }
486 }
487
488 {
489 int j, num_buckets = 0;
490 int start[10], end[10];
491 long flens[16], fchars[256];
492
493 for (j = 0; j < 256; j++)
494 if (!chars[j] && PESINAWORD (j) == which)
495 fchars[j] = 1;
496 else
497 fchars[j] = chars[j];
498 for (j = 0; j < 16; j++)
499 if (!lens[j])
500 flens[j] = 1;
501 else
502 flens[j] = lens[j];
503
504 size[which] = Calculate_Huffman_Size (16, flens, lens) +
505 Calculate_Huffman_Size (256, fchars, chars);
506
507
508 qsort (stats, num, sizeof (*stats), stats_comp);
509
510
511 if (mode == 'Y')
512 {
513 int k;
514 num_buckets = 1;
515 start[0] = 0;
516 end[0] = cdh.num_words[which] - 1;
517 Message ("There are %d novel %swords", num, which ? "" : "non-");
518 while (end[num_buckets - 1] < num)
519 {
520 start[num_buckets] = end[num_buckets - 1] + 1;
521 end[num_buckets] = start[num_buckets] +
522 (end[num_buckets - 1] - start[num_buckets - 1]) * 2;
523 num_buckets++;
524 }
525 for (k = 0; k < num_buckets; k++)
526 Message ("Bucket %d %d <= x <= %d\n", k + 1, start[k], end[k]);
527 }
528
529 for (j = 0; j < num; j++)
530 {
531 (*num_extra_bytes) += stats[j].len;
532 switch (mode)
533 {
534 case 'D':
535 bits += BIO_Delta_Length (j + 1) * stats[j].freq;
536 break;
537 case 'B':
538 bits += BIO_Binary_Length (j + 1, num) * stats[j].freq;
539 break;
540 case 'Y':
541 {
542 int k = 0;
543 while (j > end[k])
544 k++;
545 if (!(j - start[k] + 1 >= 1 &&
546 j - start[k] + 1 <= end[k] - start[k] + 1))
547 {
548 Message ("k = %d, j = %d", k, j);
549 assert (j - start[k] + 1 >= 1 &&
550 j - start[k] + 1 <= end[k] - start[k] + 1);
551 }
552 else
553 {
554 bits += (BIO_Gamma_Length (k + 1) +
555 BIO_Binary_Length (j - start[k] + 1,
556 end[k] - start[k] + 1)) *
557 stats[j].freq;
558 }
559 }
560 }
561 }
562 if (mode == 'B' && num)
563 bits += BIO_Delta_Length (num) * csh.num_docs;
564 }
565 Xfree (stats);
566 }
567 fclose (f);
568
569 *aux_compressed = (size[0] + size[1]) / 8;
570
571 /*
572 * The 5.5 bits added on per document consist of
573 * 1 bit for the word/non-word flag at the start of the document.
574 * 1 bit on average for document termination.
575 * 3.5 bits on average for padding to a byte boundary.
576 */
577 bits += 5.5 * csh.num_docs;
578
579 bytes = bits / 8 + sizeof (compressed_text_header) + sizeof (u_long);
580
581 *num_bytes = csh.num_bytes;
582
583 return bytes;
584}
Note: See TracBrowser for help on using the repository browser.