source: trunk/indexers/mg/src/text/ivf.pass1.c@ 3745

Last change on this file since 3745 was 3745, checked in by mdewsnip, 21 years ago

Addition of MG package for search and retrieval

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 20.4 KB
Line 
1/**************************************************************************
2 *
3 * ivf.pass1.c -- Memory efficient pass 1 inversion
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: ivf.pass1.c 3745 2003-02-20 21:20:24Z mdewsnip $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25
26#include "memlib.h"
27#include "messages.h"
28#include "bitio_m.h"
29#include "bitio_m_stdio.h"
30#include "bitio_gen.h"
31#include "local_strings.h"
32#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
33
34#include "mg_files.h"
35#include "invf.h"
36#include "mg.h"
37#include "build.h"
38#include "locallib.h"
39#include "words.h"
40#include "stemmer.h"
41#include "hash.h"
42
43#include "longlong.h"
44
45/*
46 $Log$
47 Revision 1.1 2003/02/20 21:18:23 mdewsnip
48 Addition of MG package for search and retrieval
49
50 Revision 1.2 2001/09/21 12:46:42 kjm18
51 updated mg to be in line with mg_1.3f. Now uses long long for some variables
52 to enable indexing of very large collections.
53
54 Revision 1.1 1999/08/10 21:17:54 sjboddie
55 renamed mg-1.3d directory mg
56
57 Revision 1.3 1998/12/17 09:12:50 rjmcnab
58
59 Altered mg to process utf-8 encoded Unicode. The main changes
60 are in the parsing of the input, the casefolding, and the stemming.
61
62 Revision 1.2 1998/11/25 07:55:43 rjmcnab
63
64 Modified mg to that you can specify the stemmer you want
65 to use via a command line option. You specify it to
66 mg_passes during the build process. The number of the
67 stemmer that you used is stored within the inverted
68 dictionary header and the stemmed dictionary header so
69 the correct stemmer is used in later stages of building
70 and querying.
71
72 Revision 1.1 1998/11/17 09:34:44 rjmcnab
73 *** empty log message ***
74
75 * Revision 1.6 1995/01/16 03:57:05 tes
76 * Fixed bug for index_string_bytes which was calculated
77 * by adding the suffix lengths except for every lookback (block)
78 * number of words. This was incorrect as invf.dict is fully
79 * front coded - perhaps it was partially coded in the past.
80 * This would only affect a statistic - hence the reason why the
81 * bug could exist ;-)
82 *
83 * Revision 1.5 1994/11/29 00:31:59 tes
84 * Committing the new merged files and changes.
85 *
86 * Revision 1.3 1994/10/20 03:56:48 tes
87 * I have rewritten the boolean query optimiser and abstracted out the
88 * components of the boolean query.
89 *
90 * Revision 1.2 1994/09/20 04:41:34 tes
91 * For version 1.1
92 *
93 */
94
95/*
96 * Modified:
97 * - long long bit counts for inverted file
98 * (1999-08-05 Tim Bell <[email protected]>)
99 */
100static char *RCSID = "$Id: ivf.pass1.c 3745 2003-02-20 21:20:24Z mdewsnip $";
101
102#define LOGLOOKBACK 2
103#define POOL_SIZE 1024*1024
104#define HASH_POOL_SIZE 8192
105#define INITIAL_HASH_SIZE 7927
106
107#define INIT_CHECK_FRAC 0.10
108#define CHECK_FRAC 0.75
109#define CHECK_CLOSE 0.999
110#define PARA_DIV 1.5
111#define NORM_DIV 1.0
112
113
114typedef struct hash_rec
115 {
116 unsigned long fcnt; /* fragment frequency */
117 unsigned long lfcnt; /* local fragment frequency */
118 unsigned long fnum; /* last fragment to use stem */
119 unsigned long wcnt; /* stem frequency */
120 unsigned long lwcnt; /* local stem frequency */
121 u_char *word;
122 }
123hash_rec;
124
125
126
127
128
129
130
131
132static unsigned long words_read = 0, words_diff = 0, bytes_diff = 0;
133static unsigned long outputbytes = 0;
134static unsigned long inputbytes = 0;
135static unsigned long MaxMemInUse = 0;
136static unsigned long MemInUse = 0;
137static unsigned long ChunksWritten = 0;
138
139
140static FILE *ic; /* The invf block file */
141static stdio_bitio_state sbs;
142
143static hash_rec **HashTable;
144static unsigned long HashSize;
145static unsigned long HashUsed;
146static u_char *Pool;
147static int PoolLeft;
148
149static hash_rec *hr_pool;
150static int hr_PoolLeft;
151
152static hash_rec **first_occr;
153static int max_first_occr;
154
155static mg_ullong L1_bits = 0;
156static mg_ullong L2_bits = 0;
157static mg_ullong L3_bits = 0;
158static unsigned long L1_ohead = 0;
159static unsigned long L2_ohead = 0;
160static unsigned long L3_ohead = 0;
161static unsigned long callnum = 0, lcallnum = 0, wordnum = 0, lwordnum = 0;
162static unsigned long ptrcnt = 0;
163static unsigned long checknum;
164static long max_mem = 0;
165
166
167static void
168ChangeMem (int Change)
169{
170 MemInUse += Change;
171 if (MemInUse > MaxMemInUse)
172 MaxMemInUse = MemInUse;
173}
174
175
176
177int
178init_ivf_1 (char *file_name)
179{
180 if (!(ic = create_file (file_name, INVF_CHUNK_SUFFIX, "wb", MAGIC_CHUNK,
181 MG_MESSAGE))) /* [RPAP - Feb 97: WIN32 Port] */
182 return (COMPERROR);
183 fwrite (" ", sizeof (u_long), 1, ic); /* Space for the maxmem */
184 ENCODE_START (ic)
185 ENCODE_PAUSE (sbs)
186
187 if (!(Pool = Xmalloc (POOL_SIZE)))
188 {
189 Message ("Unable to allocate memory for pool");
190 return (COMPERROR);
191 }
192 PoolLeft = POOL_SIZE;
193 ChangeMem (POOL_SIZE);
194
195 if (!(hr_pool = Xmalloc (HASH_POOL_SIZE * sizeof (hash_rec))))
196 {
197 Message ("Unable to allocate memory for pool");
198 return (COMPERROR);
199 }
200 hr_PoolLeft = HASH_POOL_SIZE;
201 ChangeMem (HASH_POOL_SIZE * sizeof (hash_rec));
202
203 HashSize = INITIAL_HASH_SIZE;
204 HashUsed = 0;
205 if (!(HashTable = Xmalloc (sizeof (hash_rec *) * HashSize)))
206 {
207 Message ("Unable to allocate memory for table");
208 return (COMPERROR);
209 }
210 ChangeMem (sizeof (hash_rec *) * HashSize);
211 bzero ((char *) HashTable, sizeof (hash_rec *) * HashSize);
212 max_first_occr = 8192;
213 if (!(first_occr = Xmalloc (sizeof (hash_rec *) * max_first_occr)))
214 {
215 Message ("Unable to allocate memory for first_occur");
216 return (COMPERROR);
217 }
218 ChangeMem (sizeof (hash_rec *) * max_first_occr);
219
220 checknum = (invf_buffer_size * INIT_CHECK_FRAC) /
221 (InvfLevel == 3 ? PARA_DIV : NORM_DIV);
222
223 return (COMPALLOK);
224}
225
226
227
228static unsigned long
229mem_reqd (void)
230{
231 register int i;
232 register unsigned long total = 0;
233/* register unsigned long N = InvfLevel == 3 ? lwordnum : lcallnum; */
234 register unsigned long N = lcallnum;
235 for (i = 0; i < HashUsed; i++)
236 {
237 register hash_rec *ent = first_occr[i];
238/* register unsigned long p = InvfLevel == 3 ? ent->lwcnt : ent->lfcnt; */
239 register unsigned long p = ent->lfcnt;
240 if (p)
241 total += BIO_Bblock_Bound (N, p);
242 if (InvfLevel >= 2)
243 total += ent->lwcnt;
244 }
245
246 total = (total + 7) >> 3;
247 return total;
248}
249
250static unsigned long
251max_mem_reqd (void)
252{
253 register int i;
254 register unsigned long total = 0;
255/* register unsigned long N = InvfLevel == 3 ? wordnum : callnum; */
256 register unsigned long N = callnum;
257 for (i = 0; i < HashUsed; i++)
258 {
259 register hash_rec *ent = first_occr[i];
260/* register unsigned long p = InvfLevel == 3 ? ent->wcnt : ent->fcnt; */
261 register unsigned long p = ent->fcnt;
262 if (p)
263 total += BIO_Bblock_Bound (N, p);
264 if (InvfLevel >= 2)
265 total += ent->wcnt;
266 }
267
268 total = (total + 7) >> 3;
269 return total;
270}
271
272
273static void
274dump_dict (unsigned long mem)
275{
276 int i;
277
278 ChunksWritten++;
279
280 ENCODE_CONTINUE (sbs)
281
282 GAMMA_ENCODE (lcallnum + 1);
283 if (mem > max_mem)
284 max_mem = mem;
285 GAMMA_ENCODE (mem + 1);
286 lwordnum = lcallnum = 0;
287 GAMMA_ENCODE (HashUsed + 1);
288 for (i = 0; i < HashUsed; i++)
289 {
290 hash_rec *ent = first_occr[i];
291 GAMMA_ENCODE (ent->lwcnt + 1);
292 if (ent->lwcnt >= 2)
293 GAMMA_ENCODE (ent->lfcnt);
294 ent->lwcnt = ent->lfcnt = 0;
295 }
296 ptrcnt = 0;
297 ENCODE_PAUSE (sbs)
298}
299
300
301
302static int
303process_doc (u_char * s_in, int l_in)
304{
305 u_char *end = s_in + l_in - 1;
306
307 callnum++;
308 lcallnum++;
309 inputbytes += l_in;
310
311 if (!inaword (s_in, end))
312 if (SkipSGML)
313 PARSE_NON_STEM_WORD_OR_SGML_TAG (s_in, end);
314 else
315 PARSE_NON_STEM_WORD (s_in, end);
316 /*
317 ** Alternately parse off words and non-words from the input
318 ** stream beginning with a non-word. Each token is then
319 ** inserted into the set if it does not exist or has it's
320 ** frequency count incremented if it does.
321 */
322 while (s_in <= end)
323 {
324 u_char Word[MAXSTEMLEN + 1];
325
326 PARSE_STEM_WORD (Word, s_in, end);
327
328 stemmer (stem_method, stemmer_num, Word);
329 if (SkipSGML)
330 PARSE_NON_STEM_WORD_OR_SGML_TAG (s_in, end);
331 else
332 PARSE_NON_STEM_WORD (s_in, end);
333
334 words_read++;
335 wordnum++;
336 lwordnum++;
337 /* Search the hash table for Word */
338 {
339 register unsigned long hashval, step;
340 register int hsize = HashSize;
341 HASH (hashval, step, Word, hsize);
342 for (;;)
343 {
344 register u_char *s1;
345 register u_char *s2;
346 register int len;
347 register hash_rec *ent;
348 ent = HashTable[hashval];
349 if (!ent)
350 {
351 int len = *Word + 1;
352 if (!hr_PoolLeft)
353 {
354 if (!(hr_pool = Xmalloc (HASH_POOL_SIZE *
355 sizeof (hash_rec))))
356 {
357 Message ("Unable to allocate memory for pool");
358 return (COMPERROR);
359 }
360 hr_PoolLeft = HASH_POOL_SIZE;
361 ChangeMem (HASH_POOL_SIZE * sizeof (hash_rec));
362 }
363 ent = hr_pool++;
364 hr_PoolLeft--;
365 if (len > PoolLeft)
366 {
367 if (!(Pool = Xmalloc (POOL_SIZE)))
368 {
369 Message ("Unable to allocate memory for pool");
370 return (COMPERROR);
371 }
372 PoolLeft = POOL_SIZE;
373 ChangeMem (POOL_SIZE);
374 }
375 ent->wcnt = 1;
376 ent->fcnt = 1;
377 ent->lwcnt = 1;
378 ent->lfcnt = 1;
379 ent->fnum = callnum;
380 ent->word = Pool;
381 memcpy (Pool, Word, len);
382 Pool += len;
383 PoolLeft -= len;
384 if (HashUsed == max_first_occr - 1)
385 {
386 ChangeMem (-sizeof (hash_rec *) * max_first_occr);
387 max_first_occr *= 2;
388 if (!(first_occr = Xrealloc (first_occr, sizeof (hash_rec *) *
389 max_first_occr)))
390 {
391 Message ("Unable to allocate memory for first_occr");
392 return (COMPERROR);
393 }
394 ChangeMem (sizeof (hash_rec *) * max_first_occr);
395 }
396 first_occr[HashUsed] = ent;
397 HashUsed++;
398 HashTable[hashval] = ent;
399 bytes_diff += Word[0];
400 break;
401 }
402
403 /* Compare the words */
404 s1 = Word;
405 s2 = ent->word;
406 len = *s1 + 1;
407 for (; len; len--)
408 if (*s1++ != *s2++)
409 break;
410
411 if (len)
412 {
413 hashval = (hashval + step);
414 if (hashval >= hsize)
415 hashval -= hsize;
416 }
417 else
418 {
419 ent->wcnt++;
420 ent->lwcnt++;
421 if (callnum > ent->fnum)
422 {
423 ptrcnt++;
424 ent->fcnt++;
425 ent->lfcnt++;
426 ent->fnum = callnum;
427 }
428 break;
429 }
430 }
431 }
432
433 if (HashUsed >= HashSize >> 1)
434 {
435 hash_rec **ht;
436 unsigned long size;
437 unsigned long i;
438 size = prime (HashSize * 2);
439 if (!(ht = Xmalloc (sizeof (hash_rec *) * size)))
440 {
441 Message ("Unable to allocate memory for table");
442 return (COMPERROR);
443 }
444 bzero ((char *) ht, sizeof (hash_rec *) * size);
445 ChangeMem (sizeof (hash_rec *) * size);
446
447 for (i = 0; i < HashSize; i++)
448 if (HashTable[i])
449 {
450 register u_char *wptr;
451 hash_rec *ent;
452 register unsigned long hashval, step;
453
454 wptr = HashTable[i]->word;
455 HASH (hashval, step, wptr, size);
456 ent = ht[hashval];
457 while (ent)
458 {
459 hashval += step;
460 if (hashval >= size)
461 hashval -= size;
462 ent = ht[hashval];
463 }
464 ht[hashval] = HashTable[i];
465 }
466 Xfree (HashTable);
467 ChangeMem (-sizeof (hash_rec *) * HashSize);
468 HashTable = ht;
469 HashSize = size;
470 }
471
472 }
473
474 if (ptrcnt >= checknum)
475 {
476 unsigned long mem;
477 /*fprintf(stderr, "Checking at %u . . . ", ptrcnt); */
478 mem = mem_reqd ();
479 if (mem >= invf_buffer_size * CHECK_CLOSE)
480 {
481 /*fprintf(stderr, "Got a match\n"); */
482 dump_dict (mem);
483 checknum = (invf_buffer_size * INIT_CHECK_FRAC) /
484 (InvfLevel == 3 ? PARA_DIV : NORM_DIV);
485 }
486 else
487 {
488 checknum = checknum * ((CHECK_FRAC * (invf_buffer_size - mem)) / mem) +
489 checknum;
490 if (checknum <= ptrcnt)
491 checknum = ptrcnt + 1;
492 /*fprintf(stderr, "Next check at %u\n", checknum); */
493 }
494 }
495 return (COMPALLOK);
496
497} /* encode */
498
499
500int
501process_ivf_1 (u_char * s_in, int l_in)
502{
503 if (InvfLevel <= 2)
504 return process_doc (s_in, l_in);
505 else
506 {
507 int pos = 0;
508 u_char *start = s_in;
509 while (pos < l_in)
510 {
511 if (s_in[pos] == TERMPARAGRAPH)
512 {
513 int len = pos + s_in + 1 - start;
514 if (process_doc (start, len) != COMPALLOK)
515 return (COMPERROR);
516 start = s_in + pos + 1;
517 }
518 pos++;
519 }
520 if (start < s_in + pos)
521 return process_doc (start, pos + s_in - start);
522 }
523 return COMPALLOK;
524}
525
526static int
527PackHashTable (void)
528{
529 int s, d;
530 for (s = d = 0; s < HashSize; s++)
531 if (HashTable[s])
532 HashTable[d++] = HashTable[s];
533 ChangeMem (-sizeof (hash_rec *) * HashSize);
534 ChangeMem (sizeof (hash_rec *) * HashUsed);
535 if (!(HashTable = Xrealloc (HashTable, sizeof (hash_rec *) * HashUsed)))
536 {
537 Message ("Out of memory");
538 return COMPERROR;
539 }
540 HashSize = HashUsed;
541 return COMPALLOK;
542}
543
544
545
546
547
548static int
549ent_comp (const void *A, const void *B)
550{
551 u_char *s1 = (*((hash_rec **) A))->word;
552 u_char *s2 = (*((hash_rec **) B))->word;
553
554 return (casecompare (s1, s2)); /* [RPAP - Jan 97: Stem Index Change] */
555} /* stem_comp */
556
557
558
559
560
561/*
562 * void count_text()
563 *
564 * The maths used in this function is described in the paper "Coding for
565 * Compression in Full-Text Retrieval Systems"
566 *
567 */
568static void
569count_text ()
570{
571 int i;
572 mg_ullong oldL12_bits = 0;
573 for (i = 0; i < HashUsed; i++)
574 {
575 hash_rec *wrd = HashTable[i];
576 /* estimate size of a level 1 inverted file */
577 L1_bits += (int) (0.99 + wrd->fcnt * (1.6 + log2 (1.0 * callnum / wrd->fcnt)));
578 L1_ohead += BIO_Gamma_Length (wrd->fcnt);
579
580 L2_bits += BIO_Unary_Length (wrd->wcnt);
581 L2_ohead += BIO_Gamma_Length (wrd->wcnt - wrd->fcnt + 1);
582
583 L3_bits += (int) (0.99 + wrd->wcnt *
584 (1.6 + log2 (1.0 * words_read / (wrd->wcnt + callnum))));
585 L3_ohead += 0;
586
587 /* check for overflow */
588 if (L1_bits + L2_bits < oldL12_bits) {
589 fprintf(stderr, "ERROR: Inverted file size will probably overflow %d byte unsigned integer\n", sizeof (mg_ullong));
590 fprintf(stderr, " counter in pass 2.\n");
591 if (sizeof (mg_ullong) < 8) {
592 fprintf(stderr, " Try compiling with GCC to enable use of 8 bytes for this counter.\n");
593 }
594 fprintf(stderr, " Build aborted.\n");
595 exit(1);
596 }
597 oldL12_bits = L1_bits + L2_bits;
598 }
599 L3_bits = (L3_bits + L2_bits + L1_bits + 7) / 8;
600 L3_ohead = (L3_ohead + L2_ohead + L1_ohead + 7) / 8;
601 L2_bits = (L2_bits + L1_bits + 7) / 8;
602 L2_ohead = (L2_ohead + L1_ohead + 7) / 8;
603 L1_bits = (L1_bits + 7) / 8;
604 L1_ohead = (L1_ohead + 7) / 8;
605} /* count_text */
606
607
608
609/*
610 * write_stem_file():
611 * writes out the stemmed dictionary file
612 * in the following format
613 * lookback value (int)
614 * totalbytes value (int)
615 * indexstringbytes (int)
616 * for each word
617 * wordlen (4 bits)
618 * prefix match (4 bits)
619 * word (wordlen bytes)
620 * word frequency (int)
621 * word count (int)
622 *
623 * Accesses outside variables:
624 *
625 * Return value...:
626 */
627
628static void
629write_stem_file (char *file_name)
630{
631 long j;
632 struct invf_dict_header idh;
633 long lookback = (1 << LOGLOOKBACK); /* ???? */
634 long totalbytes = 0; /* The sum of the length of all words, including
635 the length byte */
636 long indexstringbytes = 0; /* The amount of space required to store the
637 words in the diction, this takes into account
638 the prefixes */
639 u_char *lastword = NULL; /* A pointer to the last word processed */
640 FILE *sp;
641
642
643 /* Calculate the size of certain things */
644 for (j = 0; j < HashSize; j++)
645 {
646 u_char *word = HashTable[j]->word;
647 indexstringbytes += word[0] + 2;
648 totalbytes += word[0] + 1;
649 if (lastword)
650 indexstringbytes -= prefixlen (lastword, word);
651 lastword = word;
652 }
653
654 lastword = NULL;
655
656 if (!(sp = create_file (file_name, INVF_DICT_SUFFIX, "wb", MAGIC_STEM_BUILD,
657 MG_MESSAGE))) /* [RPAP - Feb 97: WIN32 Port] */
658 return;
659
660 /* [RPAP - Jan 97: Endian Ordering] */
661 HTONUL2(lookback, idh.lookback);
662 HTONUL2(HashSize, idh.dict_size);
663 HTONUL2(totalbytes, idh.total_bytes);
664 HTONUL2(indexstringbytes, idh.index_string_bytes);
665 HTOND2(inputbytes, idh.input_bytes); /* [RJM 07/97: 4G limit] */
666 HTONUL2(callnum, idh.num_of_docs);
667 HTONUL2(callnum, idh.static_num_of_docs);
668 HTONUL2(words_read, idh.num_of_words);
669 HTONUL2(stemmer_num, idh.stemmer_num);
670 HTONUL2(stem_method, idh.stem_method);
671
672 fwrite ((char *) &idh, sizeof (idh), 1, sp);
673 outputbytes += sizeof (idh);
674
675 for (j = 0; j < HashSize; j++)
676 {
677 int i;
678 unsigned long wcnt, fcnt; /* [RPAP - Jan 97: Endian Ordering] */
679 hash_rec *ent = HashTable[j];
680 if (lastword != NULL)
681 /* look for prefix match with prev string */
682 i = prefixlen (lastword, ent->word);
683 else
684 i = 0;
685 fputc (i, sp);
686 fputc (ent->word[0] - i, sp);
687 fwrite ((char *) ent->word + i + 1, sizeof (u_char), ent->word[0] - i, sp);
688 outputbytes += ent->word[0] - i + 1;
689
690 /* [RPAP - Jan 97: Endian Ordering] */
691 HTONUL2(ent->fcnt, fcnt);
692 fwrite ((char *) &(fcnt), sizeof (fcnt), 1, sp);
693
694 outputbytes += sizeof (ent->fcnt);
695
696 /* [RPAP - Jan 97: Endian Ordering] */
697 HTONUL2(ent->wcnt, wcnt);
698 fwrite ((char *) &(wcnt), sizeof (wcnt), 1, sp);
699
700 outputbytes += sizeof (ent->wcnt);
701 lastword = ent->word;
702 }
703
704 fclose (sp);
705} /* write_stem_file() */
706
707
708
709/*
710 * write_codes():
711 * calls functions to assign and write out codes and
712 * then prints out stats about execution results.
713 *
714 * Accesses outside variables: z
715 *
716 * Return value...: z
717 */
718
719static void
720write_codes (char *file_name)
721{
722 unsigned long dicts = 0;
723 outputbytes = 0;
724 write_stem_file (file_name);
725 dicts = outputbytes;
726
727#ifndef SILENT
728 Message ("Chunks written : %d\n",
729 ChunksWritten);
730 if (InvfLevel == 3)
731 Message ("Paragraphs : %8d\n", callnum);
732 Message ("Peak memory usage : %10.1f Mb\n",
733 (double) MaxMemInUse / 1024.0 / 1024.0);
734 Message ("Stems size : %10.1f kB %5.2f%%\n",
735 (double) dicts / 1024, (double) 100 * dicts / inputbytes);
736 Message ("Lev 1 inverted file : %10.1f kB %5.2f%%\n",
737 (double) (L1_bits + L1_ohead) / 1024,
738 (double) 100 * (L1_bits + L1_ohead) / inputbytes);
739 Message ("Lev 2 inverted file : %10.1f kB %5.2f%%\n",
740 (double) (L2_bits + L2_ohead) / 1024,
741 (double) 100 * (L2_bits + L2_ohead) / inputbytes);
742 Message ("Lev 3 inverted file : %10.1f kB %5.2f%%\n",
743 (double) (L3_bits + L3_ohead) / 1024,
744 (double) 100 * (L3_bits + L3_ohead) / inputbytes);
745 Message ("Record addresses : %10.1f kB %5.2f%%\n",
746 (double) words_diff * 4 / 1024, (double) 100 * words_diff * 4 / inputbytes);
747#endif
748} /* write_codes() */
749
750
751
752void
753write_num_file (char *file_name)
754{
755
756 int i;
757 FILE *f;
758
759 for (i = 0; i < HashSize; i++)
760 HashTable[i]->fnum = i;
761
762 if (!(f = create_file (file_name, INVF_CHUNK_TRANS_SUFFIX, "wb",
763 MAGIC_CHUNK_TRANS, MG_MESSAGE))) /* [RPAP - Feb 97: WIN32 Port] */
764 return;
765
766#if 1
767 ENCODE_START (f)
768
769 for (i = 0; i < HashSize; i++)
770 BINARY_ENCODE (first_occr[i]->fnum + 1, HashSize + 1);
771
772 ENCODE_DONE;
773#else
774 for (i = 0; i < HashSize; i++)
775 {
776 /* [RPAP - Jan 97: Endian Ordering] */
777 unsigned long fnum;
778 HTONUL2(first_occur[i]->fnum, fnum);
779 fwrite ((char *) &fnum, sizeof (unsigned long), 1, f);
780 }
781#endif
782
783 fclose (f);
784}
785
786
787
788
789int
790done_ivf_1 (char *FileName)
791{
792 char *temp_str = msg_prefix;
793 msg_prefix = "ivf.pass1";
794#ifndef SILENT
795 Message ("Mem reqd for 1 chunk: %8u bytes\n",
796 max_mem_reqd ());
797#endif
798 if (lcallnum)
799 dump_dict (mem_reqd ());
800
801 if (PackHashTable () == COMPERROR)
802 return COMPERROR;
803
804 qsort (HashTable, HashUsed, sizeof (hash_rec *), ent_comp);
805
806 count_text ();
807 write_codes (FileName);
808
809 ENCODE_CONTINUE (sbs)
810 GAMMA_ENCODE (1);
811 ENCODE_DONE
812 fseek (ic, sizeof (long), 0);
813
814 HTONSL(max_mem); /* [RPAP - Jan 97: Endian Ordering] */
815 fwrite (&max_mem, sizeof (max_mem), 1, ic);
816 NTOHSL(max_mem); /* [RPAP - Jan 97: Endian Ordering] */
817
818 fclose (ic);
819
820 write_num_file (FileName);
821
822 msg_prefix = temp_str;
823
824 return (COMPALLOK);
825} /* done_encode */
Note: See TracBrowser for help on using the repository browser.