source: main/tags/2.80/indexers/mg/src/text/ivf.pass1.c@ 24541

Last change on this file since 24541 was 7582, checked in by kjdon, 20 years ago

fixed the bug that was causing it not to be able to create a second index using jni - basically had to reset all the static variables at the start of each pass. the tricky thing to find was the static variables in occur_to_lexical in ivf.pass2

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 21.4 KB
Line 
1/**************************************************************************
2 *
3 * ivf.pass1.c -- Memory efficient pass 1 inversion
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: ivf.pass1.c 7582 2004-06-10 03:02:05Z kjdon $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25
26#include "memlib.h"
27#include "messages.h"
28#include "bitio_m.h"
29#include "bitio_m_stdio.h"
30#include "bitio_gen.h"
31#include "local_strings.h"
32#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
33
34#include "mg_files.h"
35#include "invf.h"
36#include "mg.h"
37#include "build.h"
38#include "locallib.h"
39#include "words.h"
40#include "stemmer.h"
41#include "hash.h"
42
43#include "longlong.h"
44
45/*
46 $Log$
47 Revision 1.2 2004/06/10 03:02:05 kjdon
48 fixed the bug that was causing it not to be able to create a second index using jni - basically had to reset all the static variables at the start of each pass. the tricky thing to find was the static variables in occur_to_lexical in ivf.pass2
49
50 Revision 1.1 2003/02/20 21:18:23 mdewsnip
51 Addition of MG package for search and retrieval
52
53 Revision 1.2 2001/09/21 12:46:42 kjm18
54 updated mg to be in line with mg_1.3f. Now uses long long for some variables
55 to enable indexing of very large collections.
56
57 Revision 1.1 1999/08/10 21:17:54 sjboddie
58 renamed mg-1.3d directory mg
59
60 Revision 1.3 1998/12/17 09:12:50 rjmcnab
61
62 Altered mg to process utf-8 encoded Unicode. The main changes
63 are in the parsing of the input, the casefolding, and the stemming.
64
65 Revision 1.2 1998/11/25 07:55:43 rjmcnab
66
67 Modified mg to that you can specify the stemmer you want
68 to use via a command line option. You specify it to
69 mg_passes during the build process. The number of the
70 stemmer that you used is stored within the inverted
71 dictionary header and the stemmed dictionary header so
72 the correct stemmer is used in later stages of building
73 and querying.
74
75 Revision 1.1 1998/11/17 09:34:44 rjmcnab
76 *** empty log message ***
77
78 * Revision 1.6 1995/01/16 03:57:05 tes
79 * Fixed bug for index_string_bytes which was calculated
80 * by adding the suffix lengths except for every lookback (block)
81 * number of words. This was incorrect as invf.dict is fully
82 * front coded - perhaps it was partially coded in the past.
83 * This would only affect a statistic - hence the reason why the
84 * bug could exist ;-)
85 *
86 * Revision 1.5 1994/11/29 00:31:59 tes
87 * Committing the new merged files and changes.
88 *
89 * Revision 1.3 1994/10/20 03:56:48 tes
90 * I have rewritten the boolean query optimiser and abstracted out the
91 * components of the boolean query.
92 *
93 * Revision 1.2 1994/09/20 04:41:34 tes
94 * For version 1.1
95 *
96 */
97
98/*
99 * Modified:
100 * - long long bit counts for inverted file
101 * (1999-08-05 Tim Bell <[email protected]>)
102 */
103static char *RCSID = "$Id: ivf.pass1.c 7582 2004-06-10 03:02:05Z kjdon $";
104
105#define LOGLOOKBACK 2
106#define POOL_SIZE 1024*1024
107#define HASH_POOL_SIZE 8192
108#define INITIAL_HASH_SIZE 7927
109
110#define INIT_CHECK_FRAC 0.10
111#define CHECK_FRAC 0.75
112#define CHECK_CLOSE 0.999
113#define PARA_DIV 1.5
114#define NORM_DIV 1.0
115
116
117typedef struct hash_rec
118 {
119 unsigned long fcnt; /* fragment frequency */
120 unsigned long lfcnt; /* local fragment frequency */
121 unsigned long fnum; /* last fragment to use stem */
122 unsigned long wcnt; /* stem frequency */
123 unsigned long lwcnt; /* local stem frequency */
124 u_char *word;
125 }
126hash_rec;
127
128static unsigned long words_read = 0, words_diff = 0, bytes_diff = 0;
129static unsigned long outputbytes = 0;
130static unsigned long inputbytes = 0;
131static unsigned long MaxMemInUse = 0;
132static unsigned long MemInUse = 0;
133static unsigned long ChunksWritten = 0;
134
135
136static FILE *ic; /* The invf block file */
137static stdio_bitio_state sbs;
138
139static hash_rec **HashTable;
140static unsigned long HashSize=0;
141static unsigned long HashUsed=0;
142static u_char *Pool;
143static int PoolLeft=0;
144
145static hash_rec *hr_pool;
146static int hr_PoolLeft=0;
147
148static hash_rec **first_occr;
149static int max_first_occr=0;
150
151static mg_ullong L1_bits = 0;
152static mg_ullong L2_bits = 0;
153static mg_ullong L3_bits = 0;
154static unsigned long L1_ohead = 0;
155static unsigned long L2_ohead = 0;
156static unsigned long L3_ohead = 0;
157static unsigned long callnum = 0, lcallnum = 0, wordnum = 0, lwordnum = 0;
158static unsigned long ptrcnt = 0;
159static unsigned long checknum=0;
160static long max_mem = 0;
161
162
163static void
164ChangeMem (int Change)
165{
166 MemInUse += Change;
167 if (MemInUse > MaxMemInUse)
168 MaxMemInUse = MemInUse;
169}
170
171void
172ResetStaticI1Vars ()
173{
174 words_read = 0;
175 words_diff = 0;
176 bytes_diff = 0;
177 outputbytes = 0;
178 inputbytes = 0;
179 MaxMemInUse = 0;
180 MemInUse = 0;
181 ChunksWritten = 0;
182
183 HashSize=0;
184 HashUsed=0;
185 PoolLeft=0;
186
187 hr_PoolLeft=0;
188
189 max_first_occr=0;
190
191 L1_bits = 0;
192 L2_bits = 0;
193 L3_bits = 0;
194 L1_ohead = 0;
195 L2_ohead = 0;
196 L3_ohead = 0;
197 callnum = 0;
198 lcallnum = 0;
199 wordnum = 0;
200 lwordnum = 0;
201 ptrcnt = 0;
202 checknum=0;
203 max_mem = 0;
204
205}
206int
207init_ivf_1 (char *file_name)
208{
209 ResetStaticI1Vars();
210 if (!(ic = create_file (file_name, INVF_CHUNK_SUFFIX, "wb", MAGIC_CHUNK,
211 MG_MESSAGE))) /* [RPAP - Feb 97: WIN32 Port] */
212 return (COMPERROR);
213 fwrite (" ", sizeof (u_long), 1, ic); /* Space for the maxmem */
214 ENCODE_START (ic)
215 ENCODE_PAUSE (sbs)
216
217 if (!(Pool = Xmalloc (POOL_SIZE)))
218 {
219 Message ("Unable to allocate memory for pool");
220 return (COMPERROR);
221 }
222 PoolLeft = POOL_SIZE;
223 ChangeMem (POOL_SIZE);
224
225 if (!(hr_pool = Xmalloc (HASH_POOL_SIZE * sizeof (hash_rec))))
226 {
227 Message ("Unable to allocate memory for pool");
228 return (COMPERROR);
229 }
230 hr_PoolLeft = HASH_POOL_SIZE;
231 ChangeMem (HASH_POOL_SIZE * sizeof (hash_rec));
232
233 HashSize = INITIAL_HASH_SIZE;
234 HashUsed = 0;
235 if (!(HashTable = Xmalloc (sizeof (hash_rec *) * HashSize)))
236 {
237 Message ("Unable to allocate memory for table");
238 return (COMPERROR);
239 }
240 ChangeMem (sizeof (hash_rec *) * HashSize);
241 bzero ((char *) HashTable, sizeof (hash_rec *) * HashSize);
242 max_first_occr = 8192;
243 if (!(first_occr = Xmalloc (sizeof (hash_rec *) * max_first_occr)))
244 {
245 Message ("Unable to allocate memory for first_occur");
246 return (COMPERROR);
247 }
248 ChangeMem (sizeof (hash_rec *) * max_first_occr);
249
250 checknum = (invf_buffer_size * INIT_CHECK_FRAC) /
251 (InvfLevel == 3 ? PARA_DIV : NORM_DIV);
252
253 return (COMPALLOK);
254}
255
256
257
258static unsigned long
259mem_reqd (void)
260{
261 register int i;
262 register unsigned long total = 0;
263/* register unsigned long N = InvfLevel == 3 ? lwordnum : lcallnum; */
264 register unsigned long N = lcallnum;
265 for (i = 0; i < HashUsed; i++)
266 {
267 register hash_rec *ent = first_occr[i];
268/* register unsigned long p = InvfLevel == 3 ? ent->lwcnt : ent->lfcnt; */
269 register unsigned long p = ent->lfcnt;
270 if (p)
271 total += BIO_Bblock_Bound (N, p);
272 if (InvfLevel >= 2)
273 total += ent->lwcnt;
274 }
275
276 total = (total + 7) >> 3;
277 return total;
278}
279
280static unsigned long
281max_mem_reqd (void)
282{
283 register int i;
284 register unsigned long total = 0;
285/* register unsigned long N = InvfLevel == 3 ? wordnum : callnum; */
286 register unsigned long N = callnum;
287 for (i = 0; i < HashUsed; i++)
288 {
289 register hash_rec *ent = first_occr[i];
290/* register unsigned long p = InvfLevel == 3 ? ent->wcnt : ent->fcnt; */
291 register unsigned long p = ent->fcnt;
292 if (p)
293 total += BIO_Bblock_Bound (N, p);
294 if (InvfLevel >= 2)
295 total += ent->wcnt;
296 }
297
298 total = (total + 7) >> 3;
299 return total;
300}
301
302
303static void
304dump_dict (unsigned long mem)
305{
306 int i;
307
308 ChunksWritten++;
309
310 ENCODE_CONTINUE (sbs)
311
312 GAMMA_ENCODE (lcallnum + 1);
313 if (mem > max_mem)
314 max_mem = mem;
315 GAMMA_ENCODE (mem + 1);
316 lwordnum = lcallnum = 0;
317 GAMMA_ENCODE (HashUsed + 1);
318 for (i = 0; i < HashUsed; i++)
319 {
320 hash_rec *ent = first_occr[i];
321 GAMMA_ENCODE (ent->lwcnt + 1);
322 if (ent->lwcnt >= 2)
323 GAMMA_ENCODE (ent->lfcnt);
324 ent->lwcnt = ent->lfcnt = 0;
325 }
326 ptrcnt = 0;
327 ENCODE_PAUSE (sbs)
328}
329
330
331
332static int
333process_doc (u_char * s_in, int l_in)
334{
335 u_char *end = s_in + l_in - 1;
336
337 callnum++;
338 lcallnum++;
339 inputbytes += l_in;
340
341 if (!inaword (s_in, end))
342 if (SkipSGML)
343 PARSE_NON_STEM_WORD_OR_SGML_TAG (s_in, end);
344 else
345 PARSE_NON_STEM_WORD (s_in, end);
346 /*
347 ** Alternately parse off words and non-words from the input
348 ** stream beginning with a non-word. Each token is then
349 ** inserted into the set if it does not exist or has it's
350 ** frequency count incremented if it does.
351 */
352 while (s_in <= end)
353 {
354 u_char Word[MAXSTEMLEN + 1];
355
356 PARSE_STEM_WORD (Word, s_in, end);
357
358 stemmer (stem_method, stemmer_num, Word);
359 if (SkipSGML)
360 PARSE_NON_STEM_WORD_OR_SGML_TAG (s_in, end);
361 else
362 PARSE_NON_STEM_WORD (s_in, end);
363
364 words_read++;
365 wordnum++;
366 lwordnum++;
367 /* Search the hash table for Word */
368 {
369 register unsigned long hashval, step;
370 register int hsize = HashSize;
371 HASH (hashval, step, Word, hsize);
372 for (;;)
373 {
374 register u_char *s1;
375 register u_char *s2;
376 register int len;
377 register hash_rec *ent;
378 ent = HashTable[hashval];
379 if (!ent)
380 {
381 int len = *Word + 1;
382 if (!hr_PoolLeft)
383 {
384 if (!(hr_pool = Xmalloc (HASH_POOL_SIZE *
385 sizeof (hash_rec))))
386 {
387 Message ("Unable to allocate memory for pool");
388 return (COMPERROR);
389 }
390 hr_PoolLeft = HASH_POOL_SIZE;
391 ChangeMem (HASH_POOL_SIZE * sizeof (hash_rec));
392 }
393 ent = hr_pool++;
394 hr_PoolLeft--;
395 if (len > PoolLeft)
396 {
397 if (!(Pool = Xmalloc (POOL_SIZE)))
398 {
399 Message ("Unable to allocate memory for pool");
400 return (COMPERROR);
401 }
402 PoolLeft = POOL_SIZE;
403 ChangeMem (POOL_SIZE);
404 }
405 ent->wcnt = 1;
406 ent->fcnt = 1;
407 ent->lwcnt = 1;
408 ent->lfcnt = 1;
409 ent->fnum = callnum;
410 ent->word = Pool;
411 memcpy (Pool, Word, len);
412 Pool += len;
413 PoolLeft -= len;
414 if (HashUsed == max_first_occr - 1)
415 {
416 ChangeMem (-sizeof (hash_rec *) * max_first_occr);
417 max_first_occr *= 2;
418 if (!(first_occr = Xrealloc (first_occr, sizeof (hash_rec *) *
419 max_first_occr)))
420 {
421 Message ("Unable to allocate memory for first_occr");
422 return (COMPERROR);
423 }
424 ChangeMem (sizeof (hash_rec *) * max_first_occr);
425 }
426 first_occr[HashUsed] = ent;
427 HashUsed++;
428 HashTable[hashval] = ent;
429 bytes_diff += Word[0];
430 break;
431 }
432
433 /* Compare the words */
434 s1 = Word;
435 s2 = ent->word;
436 len = *s1 + 1;
437 for (; len; len--)
438 if (*s1++ != *s2++)
439 break;
440
441 if (len)
442 {
443 hashval = (hashval + step);
444 if (hashval >= hsize)
445 hashval -= hsize;
446 }
447 else
448 {
449 ent->wcnt++;
450 ent->lwcnt++;
451 if (callnum > ent->fnum)
452 {
453 ptrcnt++;
454 ent->fcnt++;
455 ent->lfcnt++;
456 ent->fnum = callnum;
457 }
458 break;
459 }
460 }
461 }
462
463 if (HashUsed >= HashSize >> 1)
464 {
465 hash_rec **ht;
466 unsigned long size;
467 unsigned long i;
468 size = prime (HashSize * 2);
469 if (!(ht = Xmalloc (sizeof (hash_rec *) * size)))
470 {
471 Message ("Unable to allocate memory for table");
472 return (COMPERROR);
473 }
474 bzero ((char *) ht, sizeof (hash_rec *) * size);
475 ChangeMem (sizeof (hash_rec *) * size);
476
477 for (i = 0; i < HashSize; i++)
478 if (HashTable[i])
479 {
480 register u_char *wptr;
481 hash_rec *ent;
482 register unsigned long hashval, step;
483
484 wptr = HashTable[i]->word;
485 HASH (hashval, step, wptr, size);
486 ent = ht[hashval];
487 while (ent)
488 {
489 hashval += step;
490 if (hashval >= size)
491 hashval -= size;
492 ent = ht[hashval];
493 }
494 ht[hashval] = HashTable[i];
495 }
496 Xfree (HashTable);
497 ChangeMem (-sizeof (hash_rec *) * HashSize);
498 HashTable = ht;
499 HashSize = size;
500 }
501
502 }
503
504 if (ptrcnt >= checknum)
505 {
506 unsigned long mem;
507 /*fprintf(stderr, "Checking at %u . . . ", ptrcnt); */
508 mem = mem_reqd ();
509 if (mem >= invf_buffer_size * CHECK_CLOSE)
510 {
511 /*fprintf(stderr, "Got a match\n"); */
512 dump_dict (mem);
513 checknum = (invf_buffer_size * INIT_CHECK_FRAC) /
514 (InvfLevel == 3 ? PARA_DIV : NORM_DIV);
515 }
516 else
517 {
518 checknum = checknum * ((CHECK_FRAC * (invf_buffer_size - mem)) / mem) +
519 checknum;
520 if (checknum <= ptrcnt)
521 checknum = ptrcnt + 1;
522 /*fprintf(stderr, "Next check at %u\n", checknum); */
523 }
524 }
525 return (COMPALLOK);
526
527} /* encode */
528
529
530int
531process_ivf_1 (u_char * s_in, int l_in)
532{
533 if (InvfLevel <= 2)
534 return process_doc (s_in, l_in);
535 else
536 {
537 int pos = 0;
538 u_char *start = s_in;
539 while (pos < l_in)
540 {
541 if (s_in[pos] == TERMPARAGRAPH)
542 {
543 int len = pos + s_in + 1 - start;
544 if (process_doc (start, len) != COMPALLOK)
545 return (COMPERROR);
546 start = s_in + pos + 1;
547 }
548 pos++;
549 }
550 if (start < s_in + pos)
551 return process_doc (start, pos + s_in - start);
552 }
553 return COMPALLOK;
554}
555
556static int
557PackHashTable (void)
558{
559 int s, d;
560 for (s = d = 0; s < HashSize; s++)
561 if (HashTable[s])
562 HashTable[d++] = HashTable[s];
563 ChangeMem (-sizeof (hash_rec *) * HashSize);
564 ChangeMem (sizeof (hash_rec *) * HashUsed);
565 if (!(HashTable = Xrealloc (HashTable, sizeof (hash_rec *) * HashUsed)))
566 {
567 Message ("Out of memory");
568 return COMPERROR;
569 }
570 HashSize = HashUsed;
571 return COMPALLOK;
572}
573
574
575
576
577
578static int
579ent_comp (const void *A, const void *B)
580{
581 u_char *s1 = (*((hash_rec **) A))->word;
582 u_char *s2 = (*((hash_rec **) B))->word;
583
584 return (casecompare (s1, s2)); /* [RPAP - Jan 97: Stem Index Change] */
585} /* stem_comp */
586
587
588
589
590
591/*
592 * void count_text()
593 *
594 * The maths used in this function is described in the paper "Coding for
595 * Compression in Full-Text Retrieval Systems"
596 *
597 */
598static void
599count_text ()
600{
601 int i;
602 mg_ullong oldL12_bits = 0;
603 for (i = 0; i < HashUsed; i++)
604 {
605 hash_rec *wrd = HashTable[i];
606 /* estimate size of a level 1 inverted file */
607 L1_bits += (int) (0.99 + wrd->fcnt * (1.6 + log2 (1.0 * callnum / wrd->fcnt)));
608 L1_ohead += BIO_Gamma_Length (wrd->fcnt);
609
610 L2_bits += BIO_Unary_Length (wrd->wcnt);
611 L2_ohead += BIO_Gamma_Length (wrd->wcnt - wrd->fcnt + 1);
612
613 L3_bits += (int) (0.99 + wrd->wcnt *
614 (1.6 + log2 (1.0 * words_read / (wrd->wcnt + callnum))));
615 L3_ohead += 0;
616
617 /* check for overflow */
618 if (L1_bits + L2_bits < oldL12_bits) {
619 fprintf(stderr, "ERROR: Inverted file size will probably overflow %d byte unsigned integer\n", sizeof (mg_ullong));
620 fprintf(stderr, " counter in pass 2.\n");
621 if (sizeof (mg_ullong) < 8) {
622 fprintf(stderr, " Try compiling with GCC to enable use of 8 bytes for this counter.\n");
623 }
624 fprintf(stderr, " Build aborted.\n");
625 exit(1);
626 }
627 oldL12_bits = L1_bits + L2_bits;
628 }
629 L3_bits = (L3_bits + L2_bits + L1_bits + 7) / 8;
630 L3_ohead = (L3_ohead + L2_ohead + L1_ohead + 7) / 8;
631 L2_bits = (L2_bits + L1_bits + 7) / 8;
632 L2_ohead = (L2_ohead + L1_ohead + 7) / 8;
633 L1_bits = (L1_bits + 7) / 8;
634 L1_ohead = (L1_ohead + 7) / 8;
635} /* count_text */
636
637
638
639/*
640 * write_stem_file():
641 * writes out the stemmed dictionary file
642 * in the following format
643 * lookback value (int)
644 * totalbytes value (int)
645 * indexstringbytes (int)
646 * for each word
647 * wordlen (4 bits)
648 * prefix match (4 bits)
649 * word (wordlen bytes)
650 * word frequency (int)
651 * word count (int)
652 *
653 * Accesses outside variables:
654 *
655 * Return value...:
656 */
657
658static void
659write_stem_file (char *file_name)
660{
661 long j;
662 struct invf_dict_header idh;
663 long lookback = (1 << LOGLOOKBACK); /* ???? */
664 long totalbytes = 0; /* The sum of the length of all words, including
665 the length byte */
666 long indexstringbytes = 0; /* The amount of space required to store the
667 words in the diction, this takes into account
668 the prefixes */
669 u_char *lastword = NULL; /* A pointer to the last word processed */
670 FILE *sp;
671
672
673 /* Calculate the size of certain things */
674 for (j = 0; j < HashSize; j++)
675 {
676 u_char *word = HashTable[j]->word;
677 indexstringbytes += word[0] + 2;
678 totalbytes += word[0] + 1;
679 if (lastword)
680 indexstringbytes -= prefixlen (lastword, word);
681 lastword = word;
682 }
683
684 lastword = NULL;
685
686 if (!(sp = create_file (file_name, INVF_DICT_SUFFIX, "wb", MAGIC_STEM_BUILD,
687 MG_MESSAGE))) /* [RPAP - Feb 97: WIN32 Port] */
688 return;
689
690 /* [RPAP - Jan 97: Endian Ordering] */
691 HTONUL2(lookback, idh.lookback);
692 HTONUL2(HashSize, idh.dict_size);
693 HTONUL2(totalbytes, idh.total_bytes);
694 HTONUL2(indexstringbytes, idh.index_string_bytes);
695 HTOND2(inputbytes, idh.input_bytes); /* [RJM 07/97: 4G limit] */
696 HTONUL2(callnum, idh.num_of_docs);
697 HTONUL2(callnum, idh.static_num_of_docs);
698 HTONUL2(words_read, idh.num_of_words);
699 HTONUL2(stemmer_num, idh.stemmer_num);
700 HTONUL2(stem_method, idh.stem_method);
701
702 fwrite ((char *) &idh, sizeof (idh), 1, sp);
703 outputbytes += sizeof (idh);
704
705 for (j = 0; j < HashSize; j++)
706 {
707 int i;
708 unsigned long wcnt, fcnt; /* [RPAP - Jan 97: Endian Ordering] */
709 hash_rec *ent = HashTable[j];
710 if (lastword != NULL)
711 /* look for prefix match with prev string */
712 i = prefixlen (lastword, ent->word);
713 else
714 i = 0;
715 fputc (i, sp);
716 fputc (ent->word[0] - i, sp);
717 fwrite ((char *) ent->word + i + 1, sizeof (u_char), ent->word[0] - i, sp);
718 outputbytes += ent->word[0] - i + 1;
719
720 /* [RPAP - Jan 97: Endian Ordering] */
721 HTONUL2(ent->fcnt, fcnt);
722 fwrite ((char *) &(fcnt), sizeof (fcnt), 1, sp);
723
724 outputbytes += sizeof (ent->fcnt);
725
726 /* [RPAP - Jan 97: Endian Ordering] */
727 HTONUL2(ent->wcnt, wcnt);
728 fwrite ((char *) &(wcnt), sizeof (wcnt), 1, sp);
729
730 outputbytes += sizeof (ent->wcnt);
731 lastword = ent->word;
732 }
733
734 fclose (sp);
735} /* write_stem_file() */
736
737
738
739/*
740 * write_codes():
741 * calls functions to assign and write out codes and
742 * then prints out stats about execution results.
743 *
744 * Accesses outside variables: z
745 *
746 * Return value...: z
747 */
748
749static void
750write_codes (char *file_name)
751{
752 unsigned long dicts = 0;
753 outputbytes = 0;
754 write_stem_file (file_name);
755 dicts = outputbytes;
756
757#ifndef SILENT
758 Message ("Chunks written : %d\n",
759 ChunksWritten);
760 if (InvfLevel == 3)
761 Message ("Paragraphs : %8d\n", callnum);
762 Message ("Peak memory usage : %10.1f Mb\n",
763 (double) MaxMemInUse / 1024.0 / 1024.0);
764 Message ("Stems size : %10.1f kB %5.2f%%\n",
765 (double) dicts / 1024, (double) 100 * dicts / inputbytes);
766 Message ("Lev 1 inverted file : %10.1f kB %5.2f%%\n",
767 (double) (L1_bits + L1_ohead) / 1024,
768 (double) 100 * (L1_bits + L1_ohead) / inputbytes);
769 Message ("Lev 2 inverted file : %10.1f kB %5.2f%%\n",
770 (double) (L2_bits + L2_ohead) / 1024,
771 (double) 100 * (L2_bits + L2_ohead) / inputbytes);
772 Message ("Lev 3 inverted file : %10.1f kB %5.2f%%\n",
773 (double) (L3_bits + L3_ohead) / 1024,
774 (double) 100 * (L3_bits + L3_ohead) / inputbytes);
775 Message ("Record addresses : %10.1f kB %5.2f%%\n",
776 (double) words_diff * 4 / 1024, (double) 100 * words_diff * 4 / inputbytes);
777#endif
778} /* write_codes() */
779
780
781
782void
783write_num_file (char *file_name)
784{
785
786 int i;
787 FILE *f;
788
789 for (i = 0; i < HashSize; i++)
790 HashTable[i]->fnum = i;
791
792 if (!(f = create_file (file_name, INVF_CHUNK_TRANS_SUFFIX, "wb",
793 MAGIC_CHUNK_TRANS, MG_MESSAGE))) /* [RPAP - Feb 97: WIN32 Port] */
794 return;
795
796#if 1
797 ENCODE_START (f)
798
799 for (i = 0; i < HashSize; i++)
800 BINARY_ENCODE (first_occr[i]->fnum + 1, HashSize + 1);
801
802 ENCODE_DONE;
803#else
804 for (i = 0; i < HashSize; i++)
805 {
806 /* [RPAP - Jan 97: Endian Ordering] */
807 unsigned long fnum;
808 HTONUL2(first_occur[i]->fnum, fnum);
809 fwrite ((char *) &fnum, sizeof (unsigned long), 1, f);
810 }
811#endif
812
813 fclose (f);
814}
815
816int
817done_ivf_1 (char *FileName)
818{
819 char *temp_str = msg_prefix;
820 msg_prefix = "ivf.pass1";
821#ifndef SILENT
822 Message ("Mem reqd for 1 chunk: %8u bytes\n",
823 max_mem_reqd ());
824#endif
825 if (lcallnum)
826 dump_dict (mem_reqd ());
827
828 if (PackHashTable () == COMPERROR)
829 return COMPERROR;
830
831 qsort (HashTable, HashUsed, sizeof (hash_rec *), ent_comp);
832
833 count_text ();
834 write_codes (FileName);
835
836 ENCODE_CONTINUE (sbs)
837 GAMMA_ENCODE (1);
838 ENCODE_DONE
839 fseek (ic, sizeof (long), 0);
840
841 HTONSL(max_mem); /* [RPAP - Jan 97: Endian Ordering] */
842 fwrite (&max_mem, sizeof (max_mem), 1, ic);
843 NTOHSL(max_mem); /* [RPAP - Jan 97: Endian Ordering] */
844
845 fclose (ic);
846
847 write_num_file (FileName);
848
849 msg_prefix = temp_str;
850
851 /* free the memory */
852 Xfree(HashTable);
853 if (Pool) {
854 Pool -= (POOL_SIZE-PoolLeft);
855 Xfree(Pool);
856 }
857 if (hr_pool) {
858 hr_pool -= (HASH_POOL_SIZE-hr_PoolLeft);
859 Xfree(hr_pool);
860 }
861 if (first_occr) {
862 Xfree(first_occr);
863 }
864
865 return (COMPALLOK);
866} /* done_encode */
Note: See TracBrowser for help on using the repository browser.