source: main/trunk/greenstone2/common-src/indexers/mg/src/text/text.pass2.c@ 25147

Last change on this file since 25147 was 25147, checked in by kjdon, 12 years ago

merged 64_bit_Greenstone branch into trunk, rev 25139

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 19.1 KB
Line 
1/**************************************************************************
2 *
3 * text.pass2.c -- Text compression (Pass 2)
4 * Copyright (C) 1994 Neil Sharman, Gary Eddy and Alistair Moffat
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: text.pass2.c 25147 2012-02-28 00:59:00Z kjdon $
21 *
22 **************************************************************************/
23
24
25#include "sysfuncs.h"
26
27#include "memlib.h"
28#include "messages.h"
29#include "local_strings.h"
30#include "bitio_m_mem.h"
31#include "bitio_m.h"
32#include "huffman.h"
33#include "bitio_stdio.h"
34#include "huffman_stdio.h"
35#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
36
37#include "mg.h"
38#include "mg_files.h"
39#include "build.h"
40#include "words.h"
41#include "text.h"
42#include "hash.h"
43#include "locallib.h"
44#include "comp_dict.h"
45
46
47
48
49/*
50 $Log$
51 Revision 1.1 2003/02/20 21:18:24 mdewsnip
52 Addition of MG package for search and retrieval
53
54 Revision 1.2 2002/01/22 02:41:14 jrm21
55 Try to print out an error if the aux file couldn't be written to.
56
57 Revision 1.1 1999/08/10 21:18:25 sjboddie
58 renamed mg-1.3d directory mg
59
60 Revision 1.2 1998/12/17 09:12:54 rjmcnab
61
62 Altered mg to process utf-8 encoded Unicode. The main changes
63 are in the parsing of the input, the casefolding, and the stemming.
64
65 Revision 1.1 1998/11/17 09:35:47 rjmcnab
66 *** empty log message ***
67
68 * Revision 1.3 1994/10/20 03:57:10 tes
69 * I have rewritten the boolean query optimiser and abstracted out the
70 * components of the boolean query.
71 *
72 * Revision 1.2 1994/09/20 04:42:14 tes
73 * For version 1.1
74 *
75 */
76
77static char *RCSID = "$Id: text.pass2.c 25147 2012-02-28 00:59:00Z kjdon $";
78
/* Size in bytes of each string pool used to store novel-word spellings. */
#define POOL_SIZE 1024*256

/* A fixed-size arena for novel word strings.  Pools are chained into a
   singly linked list; a new pool is appended when the current one cannot
   hold the next word (see new_pool / process_text_2). */
typedef struct char_pool
  {
    struct char_pool *next;	/* next pool in the chain, NULL at the tail */
    mg_u_long left;		/* bytes still unused in pool[] */
    u_char *ptr;		/* next free byte within pool[] */
    u_char pool[POOL_SIZE];	/* raw storage for length-prefixed words */
  }
char_pool;

/* One entry of the novel-word hash table. */
typedef struct novel_hash_rec
  {
    mg_u_long ordinal_num;	/* 1-based order of first appearance */
    u_char *word;		/* length-prefixed spelling, stored in a char_pool */
  }
novel_hash_rec;


/* Initial (prime) bucket count for the novel-word hash tables. */
#define INITIAL_HASH_SIZE 7927
/* Maximum number of MTF swaps recorded per document (MG_NOVEL_HYBRID_MTF). */
#define MAX_SWAPS 10000

/* Open-addressed hash table of novel (out-of-dictionary) words, one per
   word/non-word stream. */
typedef struct novel_hash_table
  {
    novel_hash_rec *HashTable;	/* bucket array of HashSize entries */
    mg_u_long HashSize, HashUsed;	/* capacity and occupied-entry count */
    char_pool *first_pool;	/* head of the string-pool chain */
    char_pool *pool;		/* current (tail) pool, where new words go */
    mg_u_long next_num, binary_start;	/* next ordinal to assign; MG_NOVEL_BINARY range start */
    novel_hash_rec **code_to_nhr;	/* ordinal -> entry map, only for MG_NOVEL_HYBRID_MTF */
  }
novel_hash_table;


/* Output streams: compressed text and its document-offset index. */
static FILE *text, *text_idx;

/* Per-document bit-encoding buffer, allocated in init_text_2 (buf_size bytes). */
static u_char *comp_buffer;

/* Running byte offset into the compressed text file; written to text_idx
   once per document. */
static mg_u_long text_length;

/* [RJM 07/97: 4G limit] */
/* Compression-statistics accumulators (doubles to avoid 32-bit overflow). */
static double stats_in_tot_bytes = 0.0;
static double stats_in_bytes = 0.0;
static double stats_out_bytes = 0.0;


/* Novel-word tables: nht[0] for non-words, nht[1] for words. */
static novel_hash_table nht[2];

/* State carried across documents by DELTA_ENCODE_L for binary_start values. */
static mg_u_long prefix_len = 0;

/* Block boundaries for the hybrid novel-word codes: block k covers ordinals
   blk_start[w][k]..blk_end[w][k], each block twice the size of the last. */
int blk_start[2][33], blk_end[2][33];
131
132static char_pool *
133new_pool (char_pool * pool)
134{
135 char_pool *p = Xmalloc (sizeof (char_pool));
136 if (!p)
137 FatalError (1, "Unable to allocate memory for pool");
138 if (pool)
139 pool->next = p;
140 p->next = NULL;
141 p->left = POOL_SIZE;
142 p->ptr = p->pool;
143 return p;
144}
145
146
147
/* Initialise pass 2 of text compression for collection `file_name`:
   load the compression dictionary, create the compressed-text (.text)
   and text-index (.text.idx) files with placeholder headers, allocate
   the per-document encoding buffer, and (unless novel words are coded
   as Huffman chars) set up the novel-word hash tables and the hybrid
   block boundaries.  Returns COMPALLOK on success, COMPERROR on any
   failure (dictionary load, file creation, short write, or OOM). */
int
init_text_2 (char *file_name)
{
  char path[512];
  int i;

  if (LoadCompressionDictionary (make_name (file_name, TEXT_DICT_SUFFIX,
					    path)) == COMPERROR)
    return COMPERROR;

  if (!(text = create_file (file_name, TEXT_SUFFIX, "w+b",
			    MAGIC_TEXT, MG_MESSAGE)))	/* [RPAP - Feb 97: WIN32 Port] */
    return COMPERROR;

  /* Write a zeroed header now; done_text_2 seeks back and rewrites it
     with the real totals once all documents are processed. */
  bzero ((char *) &cth, sizeof (cth));

  if (fwrite (&cth, sizeof (cth), 1, text) != 1)
    return COMPERROR;

  /* Offset of the first document's data: magic number + header. */
  text_length = sizeof (mg_u_long) + sizeof (cth);

  if (!(text_idx = create_file (file_name, TEXT_IDX_SUFFIX, "w+b",
				MAGIC_TEXI, MG_MESSAGE)))	/* [RPAP - Feb 97: WIN32 Port] */
    return COMPERROR;

  if (fwrite (&cth, sizeof (cth), 1, text_idx) != 1)
    return COMPERROR;

  if (!(comp_buffer = Xmalloc (sizeof (u_char) * buf_size)))
    {
      Message ("No memory for compression buffer");
      return (COMPERROR);
    }

#if 0
  MaxMemInUse += sizeof (u_char) * buf_size;
#endif

  /* Novel-word bookkeeping is needed for every novel method except
     MG_NOVEL_HUFFMAN_CHARS (which spells novel words char by char). */
  if (cdh.novel_method != MG_NOVEL_HUFFMAN_CHARS)
    for (i = 0; i <= 1; i++)
      {
	nht[i].HashSize = INITIAL_HASH_SIZE;
	nht[i].HashTable = Xmalloc (sizeof (novel_hash_rec) * nht[i].HashSize);
	/* NOTE(review): this Xmalloc result is used unchecked — confirm
	   Xmalloc aborts on failure or add a NULL check. */
	bzero ((char *) nht[i].HashTable,
	       sizeof (novel_hash_rec) * nht[i].HashSize);
	nht[i].HashUsed = 0;
	nht[i].HashSize = INITIAL_HASH_SIZE;
	nht[i].pool = nht[i].first_pool = new_pool (NULL);
	nht[i].next_num = 1;
	nht[i].binary_start = 1;
	if (cdh.novel_method == MG_NOVEL_HYBRID_MTF)
	  /* code_to_nhr must cover every ordinal the table can hold before
	     it is rehashed, i.e. half the bucket count (plus slack). */
	  nht[i].code_to_nhr = Xmalloc (sizeof (novel_hash_rec *) *
					((nht[i].HashSize >> 1) + 2));
	else
	  nht[i].code_to_nhr = NULL;
	if (cdh.novel_method == MG_NOVEL_HYBRID ||
	    cdh.novel_method == MG_NOVEL_HYBRID_MTF)
	  {
	    /* Block 0 spans the in-dictionary word count; each later block
	       is twice the width of its predecessor (33 blocks total). */
	    int num;
	    num = 1;
	    blk_start[i][0] = 0;
	    blk_end[i][0] = cdh.num_words[i] - 1;
	    while (num < 33)
	      {
		blk_start[i][num] = blk_end[i][num - 1] + 1;
		blk_end[i][num] = blk_start[i][num] +
		  (blk_end[i][num - 1] - blk_start[i][num - 1]) * 2;
		num++;
	      }
	  }
      }

  return (COMPALLOK);
}
222
223
224
/* qsort-style comparator ordering ints ascending.
   Returns a negative value, zero, or a positive value as *a is less
   than, equal to, or greater than *b.  Implemented with explicit
   comparisons rather than `*a - *b`: the subtraction overflows (UB)
   when the operands are more than INT_MAX apart (e.g. INT_MAX vs a
   negative value), while the sign of the result is all callers need. */
int
ic (void *a, void *b)
{
  int x = *((int *) a);
  int y = *((int *) b);
  return (x > y) - (x < y);
}
230
231
232
233/* #define DOCDUMP 477 */
234
/* Compress one document (pass 2).
   `s_in` points at the document text, `l_in` is its length in bytes.
   The text is parsed into an alternating sequence of non-words and
   words; each token is Huffman-coded against the dictionary loaded in
   init_text_2, and out-of-dictionary ("novel") tokens are escaped and
   coded according to cdh.novel_method.  The encoded bits go into
   comp_buffer, which is then appended to the text file; the document's
   starting offset is appended to the index file.  Also updates the
   global header stats in cth.  Returns COMPALLOK, or COMPERROR on an
   unknown word (complete dictionary) or buffer overrun. */
int
process_text_2 (u_char * s_in, int l_in)
{
  int which, byte_length;
  u_char *end = s_in + l_in - 1;
  int novels_used[2];		/* per-stream count of MTF swaps this document */
  int swaps[2][MAX_SWAPS];	/* ordinals swapped, undone after encoding */

  /* First bit of the document records whether it starts with a word
     or a non-word; thereafter the parser strictly alternates. */
  which = inaword (s_in, end);

  ENCODE_START (comp_buffer, buf_size)

  ENCODE_BIT (which);

  if (cdh.novel_method == MG_NOVEL_BINARY)
    {
      /* Record the current novel-word counts so the decoder knows the
	 range for BINARY_ENCODE below. */
      DELTA_ENCODE_L (nht[0].binary_start, prefix_len);
      DELTA_ENCODE_L (nht[1].binary_start, prefix_len);
    }

  novels_used[0] = novels_used[1] = 0;

#ifdef DOCDUMP
  if (cth.num_of_docs == DOCDUMP)
    {
      printf ("---------------------------------------------------\n");
      printf ("which = %d\n", which);
    }
#endif

  for (; s_in <= end; which = !which)
    {
      u_char Word[MAXWORDLEN + 1];	/* length-prefixed token: Word[0] = length */
      int res;

      if (which)
	cth.num_of_words++;

      /* First parse a word or non-word out of the string */
      if (which)
	PARSE_WORD (Word, s_in, end);
      else
	PARSE_NON_WORD (Word, s_in, end);

#ifdef DOCDUMP
      if (cth.num_of_docs == DOCDUMP)
	{
	  printf ("%sword : \"%.*s\"", which ? " " : "non-", Word[0], Word + 1);
	}
#endif

      /* Search the hash table for Word */
      if (ht[which])
	{
	  register mg_u_long hashval, step;
	  register int tsize = ht[which]->size;
	  register u_char **wptr;
	  HASH (hashval, step, Word, tsize);
	  for (;;)
	    {
	      register u_char *s1;
	      register u_char *s2;
	      register int len;
	      wptr = ht[which]->table[hashval];
	      if (wptr == NULL)
		{
		  /* Empty slot: Word is not in the dictionary. */
		  res = COMPERROR;
		  break;
		}

	      /* Compare the words (length byte plus that many chars). */
	      s1 = Word;
	      s2 = *wptr;
	      len = *s1 + 1;
	      for (; len; len--)
		if (*s1++ != *s2++)
		  break;

	      if (len)
		{
		  /* Mismatch: probe the next slot (double hashing). */
		  hashval += step;
		  if (hashval >= tsize)
		    hashval -= tsize;
		}
	      else
		{
		  /* Found: res is the word's index in the dictionary. */
		  res = ht[which]->table[hashval] - ht[which]->words;
		  break;
		}
	    }
	}
      else
	res = COMPERROR;
      /* Check that the word was found in the dictionary */
      if (res == COMPERROR)
	{
	  if (cdh.dict_type == MG_COMPLETE_DICTIONARY)
	    {
	      /* A complete dictionary must contain every token. */
	      Message ("Unknown word \"%.*s\"\n", *Word, Word + 1);
	      return (COMPERROR);
	    }
	  if (cdh.dict_type == MG_PARTIAL_DICTIONARY)
	    {
	      /* Emit the escape code, then spell the word out with the
		 length/character Huffman codes. */
	      mg_u_long i;
	      if (ht[which])
		{
		  res = ht[which]->hd->num_codes - 1;	/* escape symbol */
		  HUFF_ENCODE (res, ht[which]->codes, ht[which]->hd->clens);
		}
	      HUFF_ENCODE (Word[0], lens_codes[which], lens_huff[which].clens);
	      for (i = 0; i < Word[0]; i++)
		HUFF_ENCODE (Word[i + 1], char_codes[which],
			     char_huff[which].clens);
	    }
	  if (cdh.dict_type == MG_SEED_DICTIONARY)
	    {
	      if (ht[which])
		{
		  res = ht[which]->hd->num_codes - 1;	/* escape symbol */
		  HUFF_ENCODE (res, ht[which]->codes, ht[which]->hd->clens);
		}
	      switch (cdh.novel_method)
		{
		case MG_NOVEL_HUFFMAN_CHARS:
		  {
		    /* Spell the novel word character by character. */
		    mg_u_long i;
		    HUFF_ENCODE (Word[0], lens_codes[which],
				 lens_huff[which].clens);
		    for (i = 0; i < Word[0]; i++)
		      HUFF_ENCODE (Word[i + 1], char_codes[which],
				   char_huff[which].clens);
		  }
		  break;
		case MG_NOVEL_BINARY:
		case MG_NOVEL_DELTA:
		case MG_NOVEL_HYBRID:
		case MG_NOVEL_HYBRID_MTF:
		  {
		    /* Look the word up in (or insert it into) the
		       novel-word table, then encode its ordinal. */
		    register mg_u_long hashval, step;
		    register novel_hash_table *h = &nht[which];
		    register int hsize = h->HashSize;
		    register novel_hash_rec *ent;
		    HASH (hashval, step, Word, hsize);
		    for (;;)
		      {
			register u_char *s1, *s2;
			register int len;
			ent = h->HashTable + hashval;
			if (!ent->word)
			  {
			    /* Empty slot: first sighting — copy the word
			       into the string pool and assign the next
			       ordinal number. */
			    int len = *Word + 1;
			    if (len > h->pool->left)
			      h->pool = new_pool (h->pool);
			    ent->word = h->pool->ptr;
			    ent->ordinal_num = h->next_num++;
			    if (cdh.novel_method == MG_NOVEL_HYBRID_MTF)
			      h->code_to_nhr[ent->ordinal_num - 1] = ent;
			    memcpy (h->pool->ptr, Word, len);
			    h->pool->ptr += len;
			    h->pool->left -= len;
			    h->HashUsed++;
			    break;
			  }
			/* Compare the words */
			s1 = Word;
			s2 = ent->word;
			len = *s1 + 1;
			for (; len; len--)
			  if (*s1++ != *s2++)
			    break;

			if (!len)
			  break;	/* match: ent is this word's entry */

			hashval = (hashval + step);
			if (hashval >= hsize)
			  hashval -= hsize;
		      }

		    switch (cdh.novel_method)
		      {
		      case MG_NOVEL_BINARY:
			{
			  /* Binary code in the range [1, binary_start];
			     a brand-new word equals binary_start, which
			     then grows. */
			  BINARY_ENCODE (ent->ordinal_num, h->binary_start);
			  if (ent->ordinal_num == h->binary_start)
			    h->binary_start++;
			}
			break;
		      case MG_NOVEL_DELTA:
			{
			  DELTA_ENCODE (ent->ordinal_num);
			}
			break;
		      case MG_NOVEL_HYBRID:
			{
			  /* Gamma-code the block number, then binary-code
			     the position within the block. */
			  int k = 0;
			  int j = ent->ordinal_num - 1;
			  while (j > blk_end[which][k])
			    k++;
			  assert (j - blk_start[which][k] + 1 >= 1 &&
				  j - blk_start[which][k] + 1 <=
				  blk_end[which][k] - blk_start[which][k] + 1);

			  GAMMA_ENCODE (k + 1);
			  BINARY_ENCODE (j - blk_start[which][k] + 1,
					 blk_end[which][k] -
					 blk_start[which][k] + 1);
			}
			break;
		      case MG_NOVEL_HYBRID_MTF:
			{
			  /* As MG_NOVEL_HYBRID, but afterwards move the
			     word to the front of the ordinal ordering,
			     remembering the swap so it can be undone at
			     the end of the document. */
			  int k = 0;
			  int j = ent->ordinal_num - 1;
			  while (j > blk_end[which][k])
			    k++;
			  assert (j - blk_start[which][k] + 1 >= 1 &&
				  j - blk_start[which][k] + 1 <=
				  blk_end[which][k] - blk_start[which][k] + 1);
			  GAMMA_ENCODE (k + 1);
			  BINARY_ENCODE (j - blk_start[which][k] + 1,
					 blk_end[which][k] -
					 blk_start[which][k] + 1);

			  if (ent->ordinal_num - 1 >= novels_used[which])
			    {
			      int a = novels_used[which];
			      int b = ent->ordinal_num - 1;
			      novel_hash_rec *temp;

/* fprintf(stderr, "a = %d , b = %d\n", a, b);
 */
			      temp = h->code_to_nhr[a];
			      h->code_to_nhr[a] = h->code_to_nhr[b];
			      h->code_to_nhr[b] = temp;
			      h->code_to_nhr[a]->ordinal_num = a + 1;
			      h->code_to_nhr[b]->ordinal_num = b + 1;
			      if (novels_used[which] == MAX_SWAPS)
				FatalError (1, "Not enough mem for swapping");
			      swaps[which][novels_used[which]] = b;
			      novels_used[which]++;
			    }
			}
			break;
		      }
		    /* Keep the load factor below 1/2: grow to the next
		       prime above double the size and rehash. */
		    if (h->HashUsed >= h->HashSize >> 1)
		      {
			novel_hash_rec *ht;
			mg_u_long size;
			mg_u_long i;
			size = prime (h->HashSize * 2);
			if (cdh.novel_method == MG_NOVEL_HYBRID_MTF)
			  {
			    Xfree (h->code_to_nhr);
			    h->code_to_nhr = Xmalloc (sizeof (novel_hash_rec *) *
						      ((size >> 1) + 2));
			  }
			if (!(ht = Xmalloc (sizeof (novel_hash_rec) * size)))
			  {
			    Message ("Unable to allocate memory for table");
			    return (COMPERROR);
			  }
			bzero ((char *) ht, sizeof (novel_hash_rec) * size);

			for (i = 0; i < h->HashSize; i++)
			  if (h->HashTable[i].word)
			    {
			      register u_char *wptr;
			      register mg_u_long hashval, step;

			      wptr = h->HashTable[i].word;
			      HASH (hashval, step, wptr, size);
			      wptr = (ht + hashval)->word;
			      while (wptr)
				{
				  /* Linear double-hash probe for a free slot. */
				  hashval += step;
				  if (hashval >= size)
				    hashval -= size;
				  wptr = (ht + hashval)->word;
				}
			      ht[hashval] = h->HashTable[i];
			      if (cdh.novel_method == MG_NOVEL_HYBRID_MTF)
				h->code_to_nhr[ht[hashval].ordinal_num - 1] =
				  &ht[hashval];
			    }
			Xfree (h->HashTable);
			h->HashTable = ht;
			h->HashSize = size;
		      }
		  }
		  break;
		}
	    }
	}
      else
	{
	  /* In-dictionary token: emit its Huffman code directly. */
	  HUFF_ENCODE (res, ht[which]->codes, ht[which]->hd->clens);
#ifdef DOCDUMP
	  if (cth.num_of_docs == DOCDUMP)
	    {
	      printf (" %d %d\n", ht[which]->hd->clens[res],
		      ht[which]->codes[res]);
	    }
#endif
	}
    }


  /* Add a 1 bit onto the end of the buffer the remaining bits in the last
     byte will all be zero */

  ENCODE_BIT (1);

  ENCODE_FLUSH;

  byte_length = __pos - __base;	/* __pos/__base come from ENCODE_START */
  if (!__remaining)
    {
      Message ("The end of the buffer was probably overrun");
      return COMPERROR;
    }

  ENCODE_DONE

#ifdef DOCDUMP
  if (cth.num_of_docs == DOCDUMP)
    {
      printf ("unused bits = %d\n", bits_unused);
    }
#endif

  /* Append this document's starting offset to the index (big-endian on
     disk), then advance the running offset. */
  HTONUL(text_length);		/* [RPAP - Jan 97: Endian Ordering] */
  fwrite (&text_length, sizeof (text_length), 1, text_idx);
  NTOHUL(text_length);		/* [RPAP - Jan 97: Endian Ordering] */
  text_length += byte_length;

#ifdef DOCDUMP
  if (cth.num_of_docs == DOCDUMP)
    {
      int i;
      for (i = 0; i < byte_length; i++)
	printf ("%02x ", comp_buffer[i]);
      printf ("\n");
    }
#endif

  /* Undo this document's MTF swaps in reverse order so the global
     ordinal numbering is restored for the next document. */
  if (cdh.novel_method == MG_NOVEL_HYBRID_MTF)
    for (which = 0; which <= 1; which++)
      for (novels_used[which]--; novels_used[which] >= 0; novels_used[which]--)
	{
	  int a = novels_used[which];
	  int b = swaps[which][novels_used[which]];
	  novel_hash_rec *temp;
	  temp = nht[which].code_to_nhr[a];
	  nht[which].code_to_nhr[a] = nht[which].code_to_nhr[b];
	  nht[which].code_to_nhr[b] = temp;
	  nht[which].code_to_nhr[a]->ordinal_num = a + 1;
	  nht[which].code_to_nhr[b]->ordinal_num = b + 1;
	}


  fwrite (comp_buffer, sizeof (*comp_buffer), byte_length, text);

  /* Track the best compression ratio and per-document maxima for the
     header rewritten in done_text_2. */
  if ((double) l_in / (double) byte_length > cth.ratio)
    cth.ratio = (double) l_in / (double) byte_length;

  cth.num_of_docs++;
  if (l_in > cth.length_of_longest_doc)
    cth.length_of_longest_doc = l_in;

  cth.num_of_bytes += l_in;

  if (Comp_Stats)
    {
      stats_in_tot_bytes += l_in;
      stats_in_bytes += l_in;
      stats_out_bytes += byte_length;
      if (stats_in_bytes >= comp_stat_point)
	{
	  fprintf (Comp_Stats, "%10.0f %10.0f %10.0f %f\n", stats_in_tot_bytes,
		   stats_in_bytes, stats_out_bytes,
		   (double) stats_out_bytes / (double) stats_in_bytes);
	  stats_in_bytes = 0.0;
	  stats_out_bytes = 0.0;
	}
    }

  return COMPALLOK;
}
624
625
626
627
628
629
630int
631write_aux_dict (char *FileName)
632{
633 int i;
634 FILE *aux;
635 if (!(aux = create_file (FileName, TEXT_DICT_AUX_SUFFIX, "wb",
636 MAGIC_AUX_DICT, MG_MESSAGE))) /* [RPAP - Feb 97: WIN32 Port] */
637 {
638 fprintf(stderr,"Couldn't create file %s%s:%s\n",
639 FileName, TEXT_DICT_AUX_SUFFIX,
640#if defined(HAVE_STRERROR) || defined(__WIN32__)
641 strerror(errno)
642#else
643 " "
644#endif
645 );
646 return COMPERROR;
647 }
648 for (i = 0; i <= 1; i++)
649 {
650 aux_frags_header afh;
651 char_pool *cp;
652
653 afh.num_frags = nht[i].HashUsed;
654 afh.mem_for_frags = 0;
655 for (cp = nht[i].first_pool; cp; cp = cp->next)
656 afh.mem_for_frags += POOL_SIZE - cp->left;
657
658 /* [RPAP - Jan 97: Endian Ordering] */
659 HTONUL(afh.num_frags);
660 HTONUL(afh.mem_for_frags);
661
662 fwrite (&afh, sizeof (afh), 1, aux);
663
664 for (cp = nht[i].first_pool; cp; cp = cp->next)
665 fwrite (cp->pool, POOL_SIZE - cp->left, sizeof (u_char), aux);
666 }
667 fclose (aux);
668 return COMPALLOK;
669}
670
671
/* Estimate and report how large the auxiliary (novel-word) dictionary
   would be both uncompressed and Huffman-compressed.  Walks the string
   pools of both streams gathering character and length frequency
   counts, then uses Calculate_Huffman_Size to estimate the compressed
   bit count.  Purely informational: prints two Message lines, changes
   no state. */
void
estimate_compressed_aux_dict (void)
{
  int i;
  mg_u_long aux_compressed = 0, total_uncomp = 0;
  for (i = 0; i <= 1; i++)
    {
      int j;
      mg_s_long chars[256], fchars[256];	/* character frequencies (f* = smoothed) */
      mg_s_long lens[16], flens[16];	/* word-length frequencies (f* = smoothed) */
      char_pool *cp;
      bzero ((char *) chars, sizeof (chars));
      bzero ((char *) lens, sizeof (lens));
      for (cp = nht[i].first_pool; cp; cp = cp->next)
	{
	  u_char *buf = cp->pool;
	  while (buf != cp->ptr)
	    {
	      /* Each fragment is a length byte followed by that many chars. */
	      int len = *buf++;
	      lens[len]++;
	      /* +4 per fragment: presumably per-word overhead in the
		 uncompressed representation — TODO confirm. */
	      total_uncomp += len + 4;
	      for (; len; len--)
		chars[*buf++]++;
	    }
	}
      /* Smooth zero counts (only for characters belonging to this
	 stream's class) so the Huffman estimate has no zero-frequency
	 symbols. */
      for (j = 0; j < 256; j++)
	if (!chars[j] && PESINAWORD (j) == i)
	  fchars[j] = 1;
	else
	  fchars[j] = chars[j];
      for (j = 0; j < 16; j++)
	if (!lens[j])
	  flens[j] = 1;
	else
	  flens[j] = lens[j];

      /* Bits for lengths plus bits for characters, converted to bytes. */
      aux_compressed += (Calculate_Huffman_Size (16, flens, lens) +
			 Calculate_Huffman_Size (256, fchars, chars)) / 8;

    }

  /* NOTE(review): "%u" is paired with total_uncomp (mg_u_long); on LP64
     platforms mg_u_long may be wider than unsigned int — confirm the
     typedef or switch to "%lu". */
  Message ("Aux dictionary (Uncompressed) %.2f Mb ( %u bytes %0.3f %%)",
	   total_uncomp / 1024.0 / 1024, total_uncomp,
	   (total_uncomp * 100.0) / bytes_processed);
  Message ("Aux dictionary (Compressed) %.2f Mb ( %.0f bytes %0.3f %%)",
	   aux_compressed / 1024.0 / 1024, aux_compressed * 1.0,
	   (aux_compressed * 100.0) / bytes_processed);
}
720
721
722
723
724
725
/* Finish pass 2 of text compression.  Flushes final statistics, writes
   the terminating offset to the index, seeks back to rewrite the real
   compressed-text header (cth) in both the text and index files (fields
   byte-swapped to disk order and then restored), reports sizes, and —
   for partial/seed dictionaries with a pooled novel method — writes the
   auxiliary dictionary.  Returns COMPALLOK, or COMPERROR on a failed
   seek/write. */
int
done_text_2 (char *FileName)
{
  if (Comp_Stats)
    fprintf (Comp_Stats, "%10.0f %10.0f %10.0f %f\n", stats_in_tot_bytes,
	     stats_in_bytes, stats_out_bytes,
	     (double) stats_out_bytes / (double) stats_in_bytes);

  /* One final offset entry: the end of the last document. */
  HTONUL(text_length);		/* [RPAP - Jan 97: Endian Ordering] */
  fwrite (&text_length, sizeof (text_length), 1, text_idx);
  NTOHUL(text_length);		/* [RPAP - Jan 97: Endian Ordering] */

  /* [RPAP - Jan 97: Endian Ordering] */
  /* Swap the header fields to network (disk) order before writing ... */
  HTONUL(cth.num_of_docs);
  HTOND(cth.num_of_bytes); /* [RJM 07/97: 4G limit] */
  HTONUL(cth.num_of_words);
  HTONUL(cth.length_of_longest_doc);
  HTOND(cth.ratio);

  /* Overwrite the placeholder headers written by init_text_2 (they sit
     just past the leading magic number). */
  if (fseek (text_idx, sizeof (mg_u_long), SEEK_SET) == -1 ||
      fwrite (&cth, sizeof (cth), 1, text_idx) != 1)
    return COMPERROR;
  fclose (text_idx);

  if (fseek (text, sizeof (mg_u_long), SEEK_SET) == -1 ||
      fwrite (&cth, sizeof (cth), 1, text) != 1)
    return COMPERROR;
  fclose (text);

  /* [RPAP - Jan 97: Endian Ordering] */
  /* ... and swap them back so in-memory cth remains usable. */
  NTOHUL(cth.num_of_docs);
  NTOHD(cth.num_of_bytes); /* [RJM 07/97: 4G limit] */
  NTOHUL(cth.num_of_words);
  NTOHUL(cth.length_of_longest_doc);
  NTOHD(cth.ratio);


  /* NOTE(review): "%u" paired with text_length (mg_u_long) may mismatch
     on LP64 platforms — confirm the typedef width. */
  Message ("Compressed Text %.2f Mb ( %u bytes %0.3f %%)",
	   text_length / 1024.0 / 1024.0, text_length,
	   (text_length * 100.0) / bytes_processed);
  Message ("Words portion of the dictionary %.2f Mb ( %.0f bytes %0.3f %%)",
	   Words_disk / 1024.0 / 1024, Words_disk * 1.0,
	   (Words_disk * 100.0) / bytes_processed);

  /* The aux dictionary only exists when novel words were pooled rather
     than spelled out with Huffman character codes. */
  if (cdh.dict_type != MG_COMPLETE_DICTIONARY &&
      (cdh.novel_method == MG_NOVEL_BINARY ||
       cdh.novel_method == MG_NOVEL_DELTA ||
       cdh.novel_method == MG_NOVEL_HYBRID ||
       cdh.novel_method == MG_NOVEL_HYBRID_MTF))
    {
      if (write_aux_dict (FileName) == COMPERROR)
	return COMPERROR;
      estimate_compressed_aux_dict ();
    }
  else
    {
      if (cdh.dict_type != MG_COMPLETE_DICTIONARY)
	Message ("Huffman info for chars in dictionary %.2f Mb"
		 " ( %u bytes %0.3f %%)",
		 Chars_disk / 1024.0 / 1024, Chars_disk,
		 (Chars_disk * 100.0) / bytes_processed);
      /* Remove any stale aux dictionary from a previous run. */
      unlink (make_name (FileName, TEXT_DICT_AUX_SUFFIX, NULL));
    }

  return (COMPALLOK);
}
Note: See TracBrowser for help on using the repository browser.