source: trunk/gsdl/packages/mg/src/text/mg_text_estimate.c@ 1014

Last change on this file since 1014 was 439, checked in by sjboddie, 25 years ago

renamed mg-1.3d directory mg

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 14.1 KB
Line 
1/**************************************************************************
2 *
3 * mg_text_estimate.c -- Program to estimate the compressed text size
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: mg_text_estimate.c 439 1999-08-10 21:23:37Z sjboddie $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25
26#include "memlib.h"
27#include "messages.h"
28#include "local_strings.h"
29#include "bitio_gen.h"
30#include "bitio_m.h"
31#include "bitio_m_stdio.h"
32#include "timing.h"
33#include "mgheap.h"
34#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
35
36#include "mg_files.h"
37#include "locallib.h"
38#include "invf.h"
39#include "text.h"
40#include "words.h"
41#include "mg.h"
42#include "comp_dict.h"
43#include "hash.h"
44
45
46
47
48
49/*
50 $Log$
51 Revision 1.1 1999/08/10 21:18:14 sjboddie
52 renamed mg-1.3d directory mg
53
54 Revision 1.4 1999/01/15 03:05:51 rjmcnab
55
56 Renamed lib/heap to be lib/mgheap (it was causing some problems with
57 some versions of the STL libraries which contained a heap.h)
58
59 Revision 1.3 1998/12/17 09:12:53 rjmcnab
60
61 Altered mg to process utf-8 encoded Unicode. The main changes
62 are in the parsing of the input, the casefolding, and the stemming.
63
64 Revision 1.2 1998/11/25 07:55:48 rjmcnab
65
66 Modified mg to that you can specify the stemmer you want
67 to use via a command line option. You specify it to
68 mg_passes during the build process. The number of the
69 stemmer that you used is stored within the inverted
70 dictionary header and the stemmed dictionary header so
71 the correct stemmer is used in later stages of building
72 and querying.
73
74 Revision 1.1 1998/11/17 09:35:19 rjmcnab
75 *** empty log message ***
76
77 * Revision 1.3 1994/10/20 03:56:59 tes
78 * I have rewritten the boolean query optimiser and abstracted out the
79 * components of the boolean query.
80 *
81 * Revision 1.2 1994/09/20 04:41:54 tes
82 * For version 1.1
83 *
84 */
85
86static char *RCSID = "$Id: mg_text_estimate.c 439 1999-08-10 21:23:37Z sjboddie $";
87
/* Per-fragment statistics record used by ReadInWordsSpecial: one entry is
   kept for each novel word/non-word fragment (i.e. one not found in the
   compression dictionary), read from the stats-dict file. */
typedef struct stats_t
  {
    u_long freq, occur;		/* freq/occurrence counts read from the stats file */
    u_long len;			/* stored size: fragment bytes + 1 length byte */
  }
stats_t;
94
95static u_long huff_count = 0;
96static u_long escape_count = 0;
97
98
99char mode = 'H';
100
101/* [RJM 07/97: 4G limit] */
102static double ReadInWordsStandard (char *name, double *num_bytes);
103static double ReadInWordsSpecial (char *name, double *num_bytes,
104 double *num_extra_bytes,
105 u_long *aux_compressed);
106
107
108
109
110int main (int argc, char **argv)
111{
112 char *stats_dict = NULL, *comp_dict = NULL;
113 ProgTime StartTime;
114 double num_bytes, num_extra_bytes = 0; /* [RJM 07/97: 4G limit] */
115 u_long aux_compressed;
116 int ch;
117 double bytes;
118 opterr = 0;
119 msg_prefix = argv[0];
120 while ((ch = getopt (argc, argv, "BMDYHh")) != -1)
121 switch (ch)
122 {
123 case 'B':
124 Message ("The compressed text size for this method cannot be\n"
125 "estimated exactly, so an upper bound will be calculated.");
126 mode = ch;
127 break;
128 case 'M':
129 Message ("The compressed text size for this method cannot be\n"
130 "estimated exactly, so an upper bound will be calculated.");
131 mode = 'Y';
132 break;
133 case 'D':
134 case 'Y':
135 case 'H':
136 mode = ch;
137 break;
138 case 'h':
139 case '?':
140 fprintf (stderr, "usage: %s [-h] [B|M|D|Y|H] stats_dict comp_dict\n",
141 argv[0]);
142 exit (1);
143 }
144
145 if (optind < argc)
146 stats_dict = argv[optind++];
147 else
148 stats_dict = NULL;
149 if (optind < argc)
150 comp_dict = argv[optind++];
151 else
152 {
153 fprintf (stderr, "usage: %s [-h] [B|M|D|Y|H] stats_dict comp_dict\n",
154 argv[0]);
155 exit (1);
156 }
157
158
159
160 GetTime (&StartTime);
161
162 if (LoadCompressionDictionary (comp_dict) == COMPERROR)
163 exit (1);
164
165 if (mode == 'H')
166 bytes = ReadInWordsStandard (stats_dict, &num_bytes);
167 else
168 bytes = ReadInWordsSpecial (stats_dict, &num_bytes, &num_extra_bytes,
169 &aux_compressed);
170
171
172 Message ("Compressed Text %.2f Mb ( %.0f bytes %0.3f %%)",
173 bytes / 1024.0 / 1024, bytes, bytes * 100 / num_bytes);
174 Message ("Words portion of the dictionary %.2f Mb ( %.0f bytes %0.3f %%)",
175 Words_disk / 1024.0 / 1024, Words_disk * 1.0,
176 (Words_disk * 100.0) / num_bytes);
177 if (mode == 'H')
178 Message ("Huffman info for chars in dictionary %.2f Mb ( %.0f bytes %0.3f %%)",
179 Chars_disk / 1024.0 / 1024, Chars_disk * 1.0,
180 (Chars_disk * 100.0) / num_bytes);
181 else
182 {
183 Message ("Aux dictionary (Uncompressed) %.2f Mb ( %.0f bytes %0.3f %%)",
184 num_extra_bytes / 1024.0 / 1024, num_extra_bytes * 1.0,
185 (num_extra_bytes * 100.0) / num_bytes);
186 Message ("Aux dictionary (Compressed) %.2f Mb ( %.0f bytes %0.3f %%)",
187 aux_compressed / 1024.0 / 1024, aux_compressed * 1.0,
188 (aux_compressed * 100.0) / num_bytes);
189 }
190 Message ("Words and non-words in the text %u", huff_count + escape_count);
191 Message ("Words and non-words coded by escapes %u ( %0.3f %%)", escape_count,
192 escape_count * 100.0 / huff_count);
193 Message ("%s", ElapsedTime (&StartTime, NULL));
194 return 0;
195}
196
197
198
199
200
/*
 * ReadInWordsStandard -- estimate the compressed text size for the plain
 * Huffman method (mode 'H').
 *
 * Reads the stats-dict file `name` (opened via open_named_file with magic
 * MAGIC_STATS_DICT, aborting on failure), then for each of the two lexicons
 * (which = 0/1: non-words and words) looks every fragment up in the global
 * hash tables ht[] built by LoadCompressionDictionary.  Found fragments add
 * their Huffman code length * freq to the running bit total; fragments
 * missing from a partial/seed dictionary are costed as an escape code plus
 * length- and character-Huffman codes (lens_huff/char_huff).
 *
 * On return *num_bytes holds the raw text size from the stats header, and
 * the function's return value is the estimated compressed size in bytes.
 *
 * NOTE(review): fread/fgetc results are unchecked, and Word[] is 16 bytes
 * while len comes from a byte in the file -- presumably fragment lengths
 * are bounded at 15 by the build pipeline; verify against mg_passes.
 */
static double
ReadInWordsStandard (char *name, double *num_bytes)
{
  compression_stats_header csh;
  FILE *f;
  u_long which;
  double bytes, bits = 0;

  f = open_named_file (name, "rb", MAGIC_STATS_DICT, MG_ABORT);	/* [RPAP - Feb 97: WIN32 Port] */

  fread (&csh, sizeof (csh), 1, f);

  /* [RPAP - Jan 97: Endian Ordering] */
  NTOHUL (csh.num_docs);
  NTOHD (csh.num_bytes);	/* [RJM 07/97: 4G limit] */

  /* Two passes over the file: one fragment table per lexicon. */
  for (which = 0; which < 2; which++)
    {
      frags_stats_header fsh;
      int j;
      fread (&fsh, sizeof (fsh), 1, f);

      /* [RPAP - Jan 97: Endian Ordering] */
      NTOHUL (fsh.num_frags);
      NTOHUL (fsh.mem_for_frags);

      for (j = 0; j < fsh.num_frags; j++)
	{
	  int len, freq, res, occur_num;
	  u_char Word[16];
	  fread (&freq, sizeof (freq), 1, f);
	  fread (&occur_num, sizeof (occur_num), 1, f);
	  len = fgetc (f);

	  /* [RPAP - Jan 97: Endian Ordering] */
	  NTOHSI (freq);
	  NTOHSI (occur_num);

	  /* Word is length-prefixed: Word[0] = len, bytes follow. */
	  Word[0] = len;
	  fread (Word + 1, len, 1, f);

	  /* Search the hash table for Word */
	  if (ht[which])
	    {
	      /* Open addressing with double hashing: HASH yields the
	         initial slot and the probe step. */
	      register unsigned long hashval, step;
	      register int tsize = ht[which]->size;
	      register u_char **wptr;
	      HASH (hashval, step, Word, tsize);
	      for (;;)
		{
		  register u_char *s1;
		  register u_char *s2;
		  register int len;
		  wptr = ht[which]->table[hashval];
		  if (wptr == NULL)
		    {
		      /* Empty slot: Word is not in the dictionary. */
		      res = COMPERROR;
		      break;
		    }

		  /* Compare the words (length byte plus len data bytes) */
		  s1 = Word;
		  s2 = *wptr;
		  len = *s1 + 1;
		  for (; len; len--)
		    if (*s1++ != *s2++)
		      break;

		  if (len)
		    {
		      /* Mismatch: advance to the next probe slot, wrapping
		         modulo the table size. */
		      hashval += step;
		      if (hashval >= tsize)
			hashval -= tsize;
		    }
		  else
		    {
		      /* Match: code index = offset of the slot's word
		         pointer within the words array. */
		      res = ht[which]->table[hashval] - ht[which]->words;
		      break;
		    }
		}
	    }
	  else
	    res = COMPERROR;

	  /* Check that the word was found in the dictionary */
	  if (res == COMPERROR)
	    {
	      escape_count += freq;
	      if (cdh.dict_type == MG_COMPLETE_DICTIONARY)
		Message ("Unknown word \"%.*s\"\n", *Word, Word + 1);
	      if (cdh.dict_type == MG_PARTIAL_DICTIONARY)
		{
		  /* Escape cost: the dictionary's escape code (last code)
		     plus a length code plus one character code per byte,
		     all multiplied by the fragment frequency. */
		  unsigned i;
		  if (ht[which])
		    {
		      res = ht[which]->hd->num_codes - 1;
		      bits += ht[which]->hd->clens[res] * freq;
		    }
		  bits += lens_huff[which].clens[Word[0]] * freq;
		  for (i = 0; i < Word[0]; i++)
		    bits += char_huff[which].clens[Word[i + 1]] * freq;
		}
	      if (cdh.dict_type == MG_SEED_DICTIONARY)
		{
		  /* Same escape costing as the partial-dictionary case. */
		  unsigned i;
		  if (ht[which])
		    {
		      res = ht[which]->hd->num_codes - 1;
		      bits += ht[which]->hd->clens[res] * freq;
		    }
		  bits += lens_huff[which].clens[Word[0]] * freq;
		  for (i = 0; i < Word[0]; i++)
		    bits += char_huff[which].clens[Word[i + 1]] * freq;
		}
	    }
	  else
	    {
	      /* In-dictionary fragment: cost is its Huffman code length. */
	      bits += ht[which]->hd->clens[res] * freq;
	      huff_count += freq;
	    }
	}
    }
  fclose (f);

  /*
   * The 5.5 bits added on per document consist of
   *   1   bit for the word/non-word flag at the start of the document.
   *   1   bit on average for document termination.
   *   3.5 bits on average for padding to a byte boundary.
   */
  bits += 5.5 * csh.num_docs;

  bytes = bits / 8 + sizeof (compressed_text_header) + sizeof (u_long);

  *num_bytes = csh.num_bytes;

  return bytes;
}
340
341
342static int
343stats_comp (const void *a, const void *b)
344{
345 return ((stats_t *) a)->occur - ((stats_t *) b)->occur;
346}
347
348
349
350
351
/*
 * ReadInWordsSpecial -- estimate the compressed text size for the
 * auxiliary-dictionary methods (modes 'B', 'D', 'Y'; 'M' is mapped to
 * 'Y' in main).
 *
 * Like ReadInWordsStandard, it walks the stats-dict file and looks each
 * fragment up in the global hash tables ht[].  In-dictionary fragments
 * are costed by their Huffman code length.  Novel fragments are recorded
 * in a stats[] array, costed with an escape code, and later coded by
 * their rank in the (occurrence-sorted) novel-word list using delta
 * ('D'), binary ('B') or bucketed gamma+binary ('Y') codes.
 *
 * Outputs: *num_bytes = raw text size from the stats header;
 * *num_extra_bytes = uncompressed size of the novel-word auxiliary
 * dictionary; *aux_compressed = its Huffman-compressed size estimate;
 * return value = estimated compressed text size in bytes.
 *
 * NOTE(review): fread results after the magic check are unchecked;
 * Word[] is 16 bytes while len comes from a file byte, and the 'Y'
 * bucket arrays are fixed at 10 entries with no bound check on
 * num_buckets -- presumably both are bounded by the build pipeline
 * (fragment length <= 15, doubling buckets cover num quickly); verify.
 */
static double
ReadInWordsSpecial (char *name, double *num_bytes,
		    double *num_extra_bytes,
		    u_long *aux_compressed)
{
  compression_stats_header csh;
  FILE *f;
  u_long magic, which;
  double bytes, bits = 0;
  double size[2];		/* compressed aux-dictionary bits per lexicon */

  if (!(f = fopen (name, "rb")))	/* [RPAP - Feb 97: WIN32 Port] */
    FatalError (1, "Unable to open file \"%s\"\n", name);

  if (fread (&magic, sizeof (magic), 1, f) != 1)
    FatalError (1, "Unable to read magic number");

  NTOHUL (magic);		/* [RPAP - Jan 97: Endian Ordering] */

  if (magic != MAGIC_STATS_DICT)
    FatalError (1, "Bad magic number");

  fread (&csh, sizeof (csh), 1, f);

  /* [RPAP - Jan 97: Endian Ordering] */
  NTOHUL (csh.num_docs);
  NTOHD (csh.num_bytes);	/* [RJM 07/97: 4G limit] */

  /* Two passes: one fragment table per lexicon (non-words, words). */
  for (which = 0; which < 2; which++)
    {
      stats_t *stats;
      frags_stats_header fsh;
      int j, num = 0;		/* num = count of novel fragments kept in stats[] */
      long lens[16], chars[256];	/* length/character histograms of novel fragments */

      bzero ((char *) lens, sizeof (lens));
      bzero ((char *) chars, sizeof (chars));

      fread (&fsh, sizeof (fsh), 1, f);

      /* [RPAP - Jan 97: Endian Ordering] */
      NTOHUL (fsh.num_frags);
      NTOHUL (fsh.mem_for_frags);

      stats = Xmalloc (fsh.num_frags * sizeof (*stats));
      if (!stats)
	FatalError (1, "Unable to allocate memory for stats");
      for (j = 0; j < fsh.num_frags; j++)
	{
	  int len, res;
	  u_char Word[16];
	  fread (&stats[num].freq, sizeof (stats[num].freq), 1, f);
	  fread (&stats[num].occur, sizeof (stats[num].occur), 1, f);
	  len = fgetc (f);

	  /* [RPAP - Jan 97: Endian Ordering] */
	  NTOHSI (stats[num].freq);
	  NTOHSI (stats[num].occur);

	  /* Stored size of the fragment: data bytes + length prefix. */
	  stats[num].len = len + 1;

	  /* Word is length-prefixed: Word[0] = len, bytes follow. */
	  Word[0] = len;
	  fread (Word + 1, len, 1, f);

	  /* Search the hash table for Word */
	  if (ht[which])
	    {
	      /* Open addressing with double hashing, as in
	         ReadInWordsStandard. */
	      register unsigned long hashval, step;
	      register int tsize = ht[which]->size;
	      register u_char **wptr;
	      HASH (hashval, step, Word, tsize);
	      for (;;)
		{
		  register u_char *s1;
		  register u_char *s2;
		  register int len;
		  wptr = ht[which]->table[hashval];
		  if (wptr == NULL)
		    {
		      /* Empty slot: Word is not in the dictionary. */
		      res = COMPERROR;
		      break;
		    }

		  /* Compare the words (length byte plus data bytes) */
		  s1 = Word;
		  s2 = *wptr;
		  len = *s1 + 1;
		  for (; len; len--)
		    if (*s1++ != *s2++)
		      break;

		  if (len)
		    {
		      /* Mismatch: step to the next probe slot, wrapping. */
		      hashval += step;
		      if (hashval >= tsize)
			hashval -= tsize;
		    }
		  else
		    {
		      /* Match: code index = slot offset in the words array. */
		      res = ht[which]->table[hashval] - ht[which]->words;
		      break;
		    }
		}
	    }
	  else
	    res = COMPERROR;

	  assert (stats[num].freq > 0);

	  /* Check that the word was found in the dictionary */
	  if (res == COMPERROR)
	    {
	      if (cdh.dict_type == MG_COMPLETE_DICTIONARY)
		Message ("Unknown word \"%.*s\"\n", *Word, Word + 1);
	      if ((cdh.dict_type == MG_PARTIAL_DICTIONARY ||
		   cdh.dict_type == MG_SEED_DICTIONARY) &&
		  ht[which])
		{
		  /* Escape cost plus histogram updates for the aux
		     dictionary's length/character Huffman estimate. */
		  int k;
		  res = ht[which]->hd->num_codes - 1;
		  bits += ht[which]->hd->clens[res] * stats[num].freq;
		  lens[Word[0]]++;
		  for (k = 0; k < (int) Word[0]; k++)
		    chars[Word[k + 1]]++;
		}
	      /* Keep this novel fragment's record for the rank coding below. */
	      num++;
	    }
	  else
	    {
	      /* In-dictionary fragment: cost is its Huffman code length. */
	      bits += ht[which]->hd->clens[res] * stats[num].freq;
	    }
	}

      {
	int j, num_buckets = 0;
	int start[10], end[10];	/* mode 'Y' bucket rank ranges */
	long flens[16], fchars[256];

	/* Huffman-size inputs must have no zero frequencies for symbols
	   that can occur, so zero counts are floored at 1. */
	for (j = 0; j < 256; j++)
	  if (!chars[j] && PESINAWORD (j) == which)
	    fchars[j] = 1;
	  else
	    fchars[j] = chars[j];
	for (j = 0; j < 16; j++)
	  if (!lens[j])
	    flens[j] = 1;
	  else
	    flens[j] = lens[j];

	/* Estimated compressed bits for this lexicon's aux dictionary:
	   length model + character model. */
	size[which] = Calculate_Huffman_Size (16, flens, lens) +
	  Calculate_Huffman_Size (256, fchars, chars);

	/* Rank novel fragments by occurrence count (stats_comp). */
	qsort (stats, num, sizeof (*stats), stats_comp);

	if (mode == 'Y')
	  {
	    /* Build doubling buckets: bucket 0 spans the dictionary
	       size, each later bucket is twice as wide as the previous. */
	    int k;
	    num_buckets = 1;
	    start[0] = 0;
	    end[0] = cdh.num_words[which] - 1;
	    Message ("There are %d novel %swords", num, which ? "" : "non-");
	    while (end[num_buckets - 1] < num)
	      {
		start[num_buckets] = end[num_buckets - 1] + 1;
		end[num_buckets] = start[num_buckets] +
		  (end[num_buckets - 1] - start[num_buckets - 1]) * 2;
		num_buckets++;
	      }
	    for (k = 0; k < num_buckets; k++)
	      Message ("Bucket %d %d <= x <= %d\n", k + 1, start[k], end[k]);
	  }

	/* Cost each novel fragment by its rank j under the chosen code. */
	for (j = 0; j < num; j++)
	  {
	    (*num_extra_bytes) += stats[j].len;
	    switch (mode)
	      {
	      case 'D':
		bits += BIO_Delta_Length (j + 1) * stats[j].freq;
		break;
	      case 'B':
		bits += BIO_Binary_Length (j + 1, num) * stats[j].freq;
		break;
	      case 'Y':
		{
		  /* Gamma-code the bucket number, binary-code the offset
		     within the bucket. */
		  int k = 0;
		  while (j > end[k])
		    k++;
		  if (!(j - start[k] + 1 >= 1 &&
			j - start[k] + 1 <= end[k] - start[k] + 1))
		    {
		      Message ("k = %d, j = %d", k, j);
		      assert (j - start[k] + 1 >= 1 &&
			      j - start[k] + 1 <= end[k] - start[k] + 1);
		    }
		  else
		    {
		      bits += (BIO_Gamma_Length (k + 1) +
			       BIO_Binary_Length (j - start[k] + 1,
						  end[k] - start[k] + 1)) *
			stats[j].freq;
		    }
		}
	      }
	  }
	/* Mode 'B' also transmits the novel-word count per document. */
	if (mode == 'B' && num)
	  bits += BIO_Delta_Length (num) * csh.num_docs;
      }
      Xfree (stats);
    }
  fclose (f);

  /* Combined compressed aux-dictionary size, converted from bits to bytes. */
  *aux_compressed = (size[0] + size[1]) / 8;

  /*
   * The 5.5 bits added on per document consist of
   *   1   bit for the word/non-word flag at the start of the document.
   *   1   bit on average for document termination.
   *   3.5 bits on average for padding to a byte boundary.
   */
  bits += 5.5 * csh.num_docs;

  bytes = bits / 8 + sizeof (compressed_text_header) + sizeof (u_long);

  *num_bytes = csh.num_bytes;

  return bytes;
}
Note: See TracBrowser for help on using the repository browser.