1 | /**************************************************************************
|
---|
2 | *
|
---|
3 | * ivf.pass1.c -- Memory efficient pass 1 inversion
|
---|
4 | * Copyright (C) 1994 Neil Sharman
|
---|
5 | *
|
---|
6 | * This program is free software; you can redistribute it and/or modify
|
---|
7 | * it under the terms of the GNU General Public License as published by
|
---|
8 | * the Free Software Foundation; either version 2 of the License, or
|
---|
9 | * (at your option) any later version.
|
---|
10 | *
|
---|
11 | * This program is distributed in the hope that it will be useful,
|
---|
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
14 | * GNU General Public License for more details.
|
---|
15 | *
|
---|
16 | * You should have received a copy of the GNU General Public License
|
---|
17 | * along with this program; if not, write to the Free Software
|
---|
18 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
19 | *
|
---|
20 | * $Id: ivf.pass1.c 16583 2008-07-29 10:20:36Z davidb $
|
---|
21 | *
|
---|
22 | **************************************************************************/
|
---|
23 |
|
---|
24 | #include "sysfuncs.h"
|
---|
25 |
|
---|
26 | #include "memlib.h"
|
---|
27 | #include "messages.h"
|
---|
28 | #include "bitio_m.h"
|
---|
29 | #include "bitio_m_stdio.h"
|
---|
30 | #include "bitio_gen.h"
|
---|
31 | #include "local_strings.h"
|
---|
32 | #include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
|
---|
33 |
|
---|
34 | #include "mg_files.h"
|
---|
35 | #include "invf.h"
|
---|
36 | #include "mg.h"
|
---|
37 | #include "build.h"
|
---|
38 | #include "locallib.h"
|
---|
39 | #include "words.h"
|
---|
40 | #include "stemmer.h"
|
---|
41 | #include "hash.h"
|
---|
42 |
|
---|
43 | #include "longlong.h"
|
---|
44 |
|
---|
45 | /*
|
---|
46 | $Log$
|
---|
47 | Revision 1.2 2004/06/10 03:02:05 kjdon
|
---|
48 | fixed the bug that was causing it not to be able to create a second index using jni - basically had to reset all the static variables at the start of each pass. the tricky thing to find was the static variables in occur_to_lexical in ivf.pass2
|
---|
49 |
|
---|
50 | Revision 1.1 2003/02/20 21:18:23 mdewsnip
|
---|
51 | Addition of MG package for search and retrieval
|
---|
52 |
|
---|
53 | Revision 1.2 2001/09/21 12:46:42 kjm18
|
---|
54 | updated mg to be in line with mg_1.3f. Now uses long long for some variables
|
---|
55 | to enable indexing of very large collections.
|
---|
56 |
|
---|
57 | Revision 1.1 1999/08/10 21:17:54 sjboddie
|
---|
58 | renamed mg-1.3d directory mg
|
---|
59 |
|
---|
60 | Revision 1.3 1998/12/17 09:12:50 rjmcnab
|
---|
61 |
|
---|
62 | Altered mg to process utf-8 encoded Unicode. The main changes
|
---|
63 | are in the parsing of the input, the casefolding, and the stemming.
|
---|
64 |
|
---|
65 | Revision 1.2 1998/11/25 07:55:43 rjmcnab
|
---|
66 |
|
---|
67 | Modified mg to that you can specify the stemmer you want
|
---|
68 | to use via a command line option. You specify it to
|
---|
69 | mg_passes during the build process. The number of the
|
---|
70 | stemmer that you used is stored within the inverted
|
---|
71 | dictionary header and the stemmed dictionary header so
|
---|
72 | the correct stemmer is used in later stages of building
|
---|
73 | and querying.
|
---|
74 |
|
---|
75 | Revision 1.1 1998/11/17 09:34:44 rjmcnab
|
---|
76 | *** empty log message ***
|
---|
77 |
|
---|
78 | * Revision 1.6 1995/01/16 03:57:05 tes
|
---|
79 | * Fixed bug for index_string_bytes which was calculated
|
---|
80 | * by adding the suffix lengths except for every lookback (block)
|
---|
81 | * number of words. This was incorrect as invf.dict is fully
|
---|
82 | * front coded - perhaps it was partially coded in the past.
|
---|
83 | * This would only affect a statistic - hence the reason why the
|
---|
84 | * bug could exist ;-)
|
---|
85 | *
|
---|
86 | * Revision 1.5 1994/11/29 00:31:59 tes
|
---|
87 | * Committing the new merged files and changes.
|
---|
88 | *
|
---|
89 | * Revision 1.3 1994/10/20 03:56:48 tes
|
---|
90 | * I have rewritten the boolean query optimiser and abstracted out the
|
---|
91 | * components of the boolean query.
|
---|
92 | *
|
---|
93 | * Revision 1.2 1994/09/20 04:41:34 tes
|
---|
94 | * For version 1.1
|
---|
95 | *
|
---|
96 | */
|
---|
97 |
|
---|
98 | /*
|
---|
99 | * Modified:
|
---|
100 | * - long long bit counts for inverted file
|
---|
101 | * (1999-08-05 Tim Bell <[email protected]>)
|
---|
102 | */
|
---|
103 | static char *RCSID = "$Id: ivf.pass1.c 16583 2008-07-29 10:20:36Z davidb $";
|
---|
104 |
|
---|
105 | #define LOGLOOKBACK 2
|
---|
106 | #define POOL_SIZE 1024*1024
|
---|
107 | #define HASH_POOL_SIZE 8192
|
---|
108 | #define INITIAL_HASH_SIZE 7927
|
---|
109 |
|
---|
110 | #define INIT_CHECK_FRAC 0.10
|
---|
111 | #define CHECK_FRAC 0.75
|
---|
112 | #define CHECK_CLOSE 0.999
|
---|
113 | #define PARA_DIV 1.5
|
---|
114 | #define NORM_DIV 1.0
|
---|
115 |
|
---|
116 |
|
---|
117 | typedef struct hash_rec
|
---|
118 | {
|
---|
119 | unsigned long fcnt; /* fragment frequency */
|
---|
120 | unsigned long lfcnt; /* local fragment frequency */
|
---|
121 | unsigned long fnum; /* last fragment to use stem */
|
---|
122 | unsigned long wcnt; /* stem frequency */
|
---|
123 | unsigned long lwcnt; /* local stem frequency */
|
---|
124 | u_char *word;
|
---|
125 | }
|
---|
126 | hash_rec;
|
---|
127 |
|
---|
128 | static unsigned long words_read = 0, words_diff = 0, bytes_diff = 0;
|
---|
129 | static unsigned long outputbytes = 0;
|
---|
130 | static unsigned long inputbytes = 0;
|
---|
131 | static unsigned long MaxMemInUse = 0;
|
---|
132 | static unsigned long MemInUse = 0;
|
---|
133 | static unsigned long ChunksWritten = 0;
|
---|
134 |
|
---|
135 |
|
---|
136 | static FILE *ic; /* The invf block file */
|
---|
137 | static stdio_bitio_state sbs;
|
---|
138 |
|
---|
139 | static hash_rec **HashTable;
|
---|
140 | static unsigned long HashSize=0;
|
---|
141 | static unsigned long HashUsed=0;
|
---|
142 | static u_char *Pool;
|
---|
143 | static int PoolLeft=0;
|
---|
144 |
|
---|
145 | static hash_rec *hr_pool;
|
---|
146 | static int hr_PoolLeft=0;
|
---|
147 |
|
---|
148 | static hash_rec **first_occr;
|
---|
149 | static int max_first_occr=0;
|
---|
150 |
|
---|
151 | static mg_ullong L1_bits = 0;
|
---|
152 | static mg_ullong L2_bits = 0;
|
---|
153 | static mg_ullong L3_bits = 0;
|
---|
154 | static unsigned long L1_ohead = 0;
|
---|
155 | static unsigned long L2_ohead = 0;
|
---|
156 | static unsigned long L3_ohead = 0;
|
---|
157 | static unsigned long callnum = 0, lcallnum = 0, wordnum = 0, lwordnum = 0;
|
---|
158 | static unsigned long ptrcnt = 0;
|
---|
159 | static unsigned long checknum=0;
|
---|
160 | static long max_mem = 0;
|
---|
161 |
|
---|
162 |
|
---|
163 | static void
|
---|
164 | ChangeMem (int Change)
|
---|
165 | {
|
---|
166 | MemInUse += Change;
|
---|
167 | if (MemInUse > MaxMemInUse)
|
---|
168 | MaxMemInUse = MemInUse;
|
---|
169 | }
|
---|
170 |
|
---|
171 | void
|
---|
172 | ResetStaticI1Vars ()
|
---|
173 | {
|
---|
174 | words_read = 0;
|
---|
175 | words_diff = 0;
|
---|
176 | bytes_diff = 0;
|
---|
177 | outputbytes = 0;
|
---|
178 | inputbytes = 0;
|
---|
179 | MaxMemInUse = 0;
|
---|
180 | MemInUse = 0;
|
---|
181 | ChunksWritten = 0;
|
---|
182 |
|
---|
183 | HashSize=0;
|
---|
184 | HashUsed=0;
|
---|
185 | PoolLeft=0;
|
---|
186 |
|
---|
187 | hr_PoolLeft=0;
|
---|
188 |
|
---|
189 | max_first_occr=0;
|
---|
190 |
|
---|
191 | L1_bits = 0;
|
---|
192 | L2_bits = 0;
|
---|
193 | L3_bits = 0;
|
---|
194 | L1_ohead = 0;
|
---|
195 | L2_ohead = 0;
|
---|
196 | L3_ohead = 0;
|
---|
197 | callnum = 0;
|
---|
198 | lcallnum = 0;
|
---|
199 | wordnum = 0;
|
---|
200 | lwordnum = 0;
|
---|
201 | ptrcnt = 0;
|
---|
202 | checknum=0;
|
---|
203 | max_mem = 0;
|
---|
204 |
|
---|
205 | }
|
---|
206 | int
|
---|
207 | init_ivf_1 (char *file_name)
|
---|
208 | {
|
---|
209 | ResetStaticI1Vars();
|
---|
210 | if (!(ic = create_file (file_name, INVF_CHUNK_SUFFIX, "wb", MAGIC_CHUNK,
|
---|
211 | MG_MESSAGE))) /* [RPAP - Feb 97: WIN32 Port] */
|
---|
212 | return (COMPERROR);
|
---|
213 | fwrite (" ", sizeof (u_long), 1, ic); /* Space for the maxmem */
|
---|
214 | ENCODE_START (ic)
|
---|
215 | ENCODE_PAUSE (sbs)
|
---|
216 |
|
---|
217 | if (!(Pool = Xmalloc (POOL_SIZE)))
|
---|
218 | {
|
---|
219 | Message ("Unable to allocate memory for pool");
|
---|
220 | return (COMPERROR);
|
---|
221 | }
|
---|
222 | PoolLeft = POOL_SIZE;
|
---|
223 | ChangeMem (POOL_SIZE);
|
---|
224 |
|
---|
225 | if (!(hr_pool = Xmalloc (HASH_POOL_SIZE * sizeof (hash_rec))))
|
---|
226 | {
|
---|
227 | Message ("Unable to allocate memory for pool");
|
---|
228 | return (COMPERROR);
|
---|
229 | }
|
---|
230 | hr_PoolLeft = HASH_POOL_SIZE;
|
---|
231 | ChangeMem (HASH_POOL_SIZE * sizeof (hash_rec));
|
---|
232 |
|
---|
233 | HashSize = INITIAL_HASH_SIZE;
|
---|
234 | HashUsed = 0;
|
---|
235 | if (!(HashTable = Xmalloc (sizeof (hash_rec *) * HashSize)))
|
---|
236 | {
|
---|
237 | Message ("Unable to allocate memory for table");
|
---|
238 | return (COMPERROR);
|
---|
239 | }
|
---|
240 | ChangeMem (sizeof (hash_rec *) * HashSize);
|
---|
241 | bzero ((char *) HashTable, sizeof (hash_rec *) * HashSize);
|
---|
242 | max_first_occr = 8192;
|
---|
243 | if (!(first_occr = Xmalloc (sizeof (hash_rec *) * max_first_occr)))
|
---|
244 | {
|
---|
245 | Message ("Unable to allocate memory for first_occur");
|
---|
246 | return (COMPERROR);
|
---|
247 | }
|
---|
248 | ChangeMem (sizeof (hash_rec *) * max_first_occr);
|
---|
249 |
|
---|
250 | checknum = (invf_buffer_size * INIT_CHECK_FRAC) /
|
---|
251 | (InvfLevel == 3 ? PARA_DIV : NORM_DIV);
|
---|
252 |
|
---|
253 | return (COMPALLOK);
|
---|
254 | }
|
---|
255 |
|
---|
256 |
|
---|
257 |
|
---|
258 | static unsigned long
|
---|
259 | mem_reqd (void)
|
---|
260 | {
|
---|
261 | register int i;
|
---|
262 | register unsigned long total = 0;
|
---|
263 | /* register unsigned long N = InvfLevel == 3 ? lwordnum : lcallnum; */
|
---|
264 | register unsigned long N = lcallnum;
|
---|
265 | for (i = 0; i < HashUsed; i++)
|
---|
266 | {
|
---|
267 | register hash_rec *ent = first_occr[i];
|
---|
268 | /* register unsigned long p = InvfLevel == 3 ? ent->lwcnt : ent->lfcnt; */
|
---|
269 | register unsigned long p = ent->lfcnt;
|
---|
270 | if (p)
|
---|
271 | total += BIO_Bblock_Bound (N, p);
|
---|
272 | if (InvfLevel >= 2)
|
---|
273 | total += ent->lwcnt;
|
---|
274 | }
|
---|
275 |
|
---|
276 | total = (total + 7) >> 3;
|
---|
277 | return total;
|
---|
278 | }
|
---|
279 |
|
---|
280 | static unsigned long
|
---|
281 | max_mem_reqd (void)
|
---|
282 | {
|
---|
283 | register int i;
|
---|
284 | register unsigned long total = 0;
|
---|
285 | /* register unsigned long N = InvfLevel == 3 ? wordnum : callnum; */
|
---|
286 | register unsigned long N = callnum;
|
---|
287 | for (i = 0; i < HashUsed; i++)
|
---|
288 | {
|
---|
289 | register hash_rec *ent = first_occr[i];
|
---|
290 | /* register unsigned long p = InvfLevel == 3 ? ent->wcnt : ent->fcnt; */
|
---|
291 | register unsigned long p = ent->fcnt;
|
---|
292 | if (p)
|
---|
293 | total += BIO_Bblock_Bound (N, p);
|
---|
294 | if (InvfLevel >= 2)
|
---|
295 | total += ent->wcnt;
|
---|
296 | }
|
---|
297 |
|
---|
298 | total = (total + 7) >> 3;
|
---|
299 | return total;
|
---|
300 | }
|
---|
301 |
|
---|
302 |
|
---|
303 | static void
|
---|
304 | dump_dict (unsigned long mem)
|
---|
305 | {
|
---|
306 | int i;
|
---|
307 |
|
---|
308 | ChunksWritten++;
|
---|
309 |
|
---|
310 | ENCODE_CONTINUE (sbs)
|
---|
311 |
|
---|
312 | GAMMA_ENCODE (lcallnum + 1);
|
---|
313 | if (mem > max_mem)
|
---|
314 | max_mem = mem;
|
---|
315 | GAMMA_ENCODE (mem + 1);
|
---|
316 | lwordnum = lcallnum = 0;
|
---|
317 | GAMMA_ENCODE (HashUsed + 1);
|
---|
318 | for (i = 0; i < HashUsed; i++)
|
---|
319 | {
|
---|
320 | hash_rec *ent = first_occr[i];
|
---|
321 | GAMMA_ENCODE (ent->lwcnt + 1);
|
---|
322 | if (ent->lwcnt >= 2)
|
---|
323 | GAMMA_ENCODE (ent->lfcnt);
|
---|
324 | ent->lwcnt = ent->lfcnt = 0;
|
---|
325 | }
|
---|
326 | ptrcnt = 0;
|
---|
327 | ENCODE_PAUSE (sbs)
|
---|
328 | }
|
---|
329 |
|
---|
330 |
|
---|
331 |
|
---|
332 | static int
|
---|
333 | process_doc (u_char * s_in, int l_in)
|
---|
334 | {
|
---|
335 | u_char *end = s_in + l_in - 1;
|
---|
336 |
|
---|
337 | callnum++;
|
---|
338 | lcallnum++;
|
---|
339 | inputbytes += l_in;
|
---|
340 |
|
---|
341 | if (!inaword (s_in, end))
|
---|
342 | if (SkipSGML)
|
---|
343 | PARSE_NON_STEM_WORD_OR_SGML_TAG (s_in, end);
|
---|
344 | else
|
---|
345 | PARSE_NON_STEM_WORD (s_in, end);
|
---|
346 | /*
|
---|
347 | ** Alternately parse off words and non-words from the input
|
---|
348 | ** stream beginning with a non-word. Each token is then
|
---|
349 | ** inserted into the set if it does not exist or has it's
|
---|
350 | ** frequency count incremented if it does.
|
---|
351 | */
|
---|
352 | while (s_in <= end)
|
---|
353 | {
|
---|
354 | u_char Word[MAXSTEMLEN + 1];
|
---|
355 |
|
---|
356 | PARSE_STEM_WORD (Word, s_in, end);
|
---|
357 |
|
---|
358 | stemmer (stem_method, stemmer_num, Word);
|
---|
359 | if (SkipSGML)
|
---|
360 | PARSE_NON_STEM_WORD_OR_SGML_TAG (s_in, end);
|
---|
361 | else
|
---|
362 | PARSE_NON_STEM_WORD (s_in, end);
|
---|
363 |
|
---|
364 | words_read++;
|
---|
365 | wordnum++;
|
---|
366 | lwordnum++;
|
---|
367 | /* Search the hash table for Word */
|
---|
368 | {
|
---|
369 | register unsigned long hashval, step;
|
---|
370 | register int hsize = HashSize;
|
---|
371 | HASH (hashval, step, Word, hsize);
|
---|
372 | for (;;)
|
---|
373 | {
|
---|
374 | register u_char *s1;
|
---|
375 | register u_char *s2;
|
---|
376 | register int len;
|
---|
377 | register hash_rec *ent;
|
---|
378 | ent = HashTable[hashval];
|
---|
379 | if (!ent)
|
---|
380 | {
|
---|
381 | int len = *Word + 1;
|
---|
382 | if (!hr_PoolLeft)
|
---|
383 | {
|
---|
384 | if (!(hr_pool = Xmalloc (HASH_POOL_SIZE *
|
---|
385 | sizeof (hash_rec))))
|
---|
386 | {
|
---|
387 | Message ("Unable to allocate memory for pool");
|
---|
388 | return (COMPERROR);
|
---|
389 | }
|
---|
390 | hr_PoolLeft = HASH_POOL_SIZE;
|
---|
391 | ChangeMem (HASH_POOL_SIZE * sizeof (hash_rec));
|
---|
392 | }
|
---|
393 | ent = hr_pool++;
|
---|
394 | hr_PoolLeft--;
|
---|
395 | if (len > PoolLeft)
|
---|
396 | {
|
---|
397 | if (!(Pool = Xmalloc (POOL_SIZE)))
|
---|
398 | {
|
---|
399 | Message ("Unable to allocate memory for pool");
|
---|
400 | return (COMPERROR);
|
---|
401 | }
|
---|
402 | PoolLeft = POOL_SIZE;
|
---|
403 | ChangeMem (POOL_SIZE);
|
---|
404 | }
|
---|
405 | ent->wcnt = 1;
|
---|
406 | ent->fcnt = 1;
|
---|
407 | ent->lwcnt = 1;
|
---|
408 | ent->lfcnt = 1;
|
---|
409 | ent->fnum = callnum;
|
---|
410 | ent->word = Pool;
|
---|
411 | memcpy (Pool, Word, len);
|
---|
412 | Pool += len;
|
---|
413 | PoolLeft -= len;
|
---|
414 | if (HashUsed == max_first_occr - 1)
|
---|
415 | {
|
---|
416 | ChangeMem (-sizeof (hash_rec *) * max_first_occr);
|
---|
417 | max_first_occr *= 2;
|
---|
418 | if (!(first_occr = Xrealloc (first_occr, sizeof (hash_rec *) *
|
---|
419 | max_first_occr)))
|
---|
420 | {
|
---|
421 | Message ("Unable to allocate memory for first_occr");
|
---|
422 | return (COMPERROR);
|
---|
423 | }
|
---|
424 | ChangeMem (sizeof (hash_rec *) * max_first_occr);
|
---|
425 | }
|
---|
426 | first_occr[HashUsed] = ent;
|
---|
427 | HashUsed++;
|
---|
428 | HashTable[hashval] = ent;
|
---|
429 | bytes_diff += Word[0];
|
---|
430 | break;
|
---|
431 | }
|
---|
432 |
|
---|
433 | /* Compare the words */
|
---|
434 | s1 = Word;
|
---|
435 | s2 = ent->word;
|
---|
436 | len = *s1 + 1;
|
---|
437 | for (; len; len--)
|
---|
438 | if (*s1++ != *s2++)
|
---|
439 | break;
|
---|
440 |
|
---|
441 | if (len)
|
---|
442 | {
|
---|
443 | hashval = (hashval + step);
|
---|
444 | if (hashval >= hsize)
|
---|
445 | hashval -= hsize;
|
---|
446 | }
|
---|
447 | else
|
---|
448 | {
|
---|
449 | ent->wcnt++;
|
---|
450 | ent->lwcnt++;
|
---|
451 | if (callnum > ent->fnum)
|
---|
452 | {
|
---|
453 | ptrcnt++;
|
---|
454 | ent->fcnt++;
|
---|
455 | ent->lfcnt++;
|
---|
456 | ent->fnum = callnum;
|
---|
457 | }
|
---|
458 | break;
|
---|
459 | }
|
---|
460 | }
|
---|
461 | }
|
---|
462 |
|
---|
463 | if (HashUsed >= HashSize >> 1)
|
---|
464 | {
|
---|
465 | hash_rec **ht;
|
---|
466 | unsigned long size;
|
---|
467 | unsigned long i;
|
---|
468 | size = prime (HashSize * 2);
|
---|
469 | if (!(ht = Xmalloc (sizeof (hash_rec *) * size)))
|
---|
470 | {
|
---|
471 | Message ("Unable to allocate memory for table");
|
---|
472 | return (COMPERROR);
|
---|
473 | }
|
---|
474 | bzero ((char *) ht, sizeof (hash_rec *) * size);
|
---|
475 | ChangeMem (sizeof (hash_rec *) * size);
|
---|
476 |
|
---|
477 | for (i = 0; i < HashSize; i++)
|
---|
478 | if (HashTable[i])
|
---|
479 | {
|
---|
480 | register u_char *wptr;
|
---|
481 | hash_rec *ent;
|
---|
482 | register unsigned long hashval, step;
|
---|
483 |
|
---|
484 | wptr = HashTable[i]->word;
|
---|
485 | HASH (hashval, step, wptr, size);
|
---|
486 | ent = ht[hashval];
|
---|
487 | while (ent)
|
---|
488 | {
|
---|
489 | hashval += step;
|
---|
490 | if (hashval >= size)
|
---|
491 | hashval -= size;
|
---|
492 | ent = ht[hashval];
|
---|
493 | }
|
---|
494 | ht[hashval] = HashTable[i];
|
---|
495 | }
|
---|
496 | Xfree (HashTable);
|
---|
497 | ChangeMem (-sizeof (hash_rec *) * HashSize);
|
---|
498 | HashTable = ht;
|
---|
499 | HashSize = size;
|
---|
500 | }
|
---|
501 |
|
---|
502 | }
|
---|
503 |
|
---|
504 | if (ptrcnt >= checknum)
|
---|
505 | {
|
---|
506 | unsigned long mem;
|
---|
507 | /*fprintf(stderr, "Checking at %u . . . ", ptrcnt); */
|
---|
508 | mem = mem_reqd ();
|
---|
509 | if (mem >= invf_buffer_size * CHECK_CLOSE)
|
---|
510 | {
|
---|
511 | /*fprintf(stderr, "Got a match\n"); */
|
---|
512 | dump_dict (mem);
|
---|
513 | checknum = (invf_buffer_size * INIT_CHECK_FRAC) /
|
---|
514 | (InvfLevel == 3 ? PARA_DIV : NORM_DIV);
|
---|
515 | }
|
---|
516 | else
|
---|
517 | {
|
---|
518 | checknum = checknum * ((CHECK_FRAC * (invf_buffer_size - mem)) / mem) +
|
---|
519 | checknum;
|
---|
520 | if (checknum <= ptrcnt)
|
---|
521 | checknum = ptrcnt + 1;
|
---|
522 | /*fprintf(stderr, "Next check at %u\n", checknum); */
|
---|
523 | }
|
---|
524 | }
|
---|
525 | return (COMPALLOK);
|
---|
526 |
|
---|
527 | } /* encode */
|
---|
528 |
|
---|
529 |
|
---|
530 | int
|
---|
531 | process_ivf_1 (u_char * s_in, int l_in)
|
---|
532 | {
|
---|
533 | if (InvfLevel <= 2)
|
---|
534 | return process_doc (s_in, l_in);
|
---|
535 | else
|
---|
536 | {
|
---|
537 | int pos = 0;
|
---|
538 | u_char *start = s_in;
|
---|
539 | while (pos < l_in)
|
---|
540 | {
|
---|
541 | if (s_in[pos] == TERMPARAGRAPH)
|
---|
542 | {
|
---|
543 | int len = pos + s_in + 1 - start;
|
---|
544 | if (process_doc (start, len) != COMPALLOK)
|
---|
545 | return (COMPERROR);
|
---|
546 | start = s_in + pos + 1;
|
---|
547 | }
|
---|
548 | pos++;
|
---|
549 | }
|
---|
550 | if (start < s_in + pos)
|
---|
551 | return process_doc (start, pos + s_in - start);
|
---|
552 | }
|
---|
553 | return COMPALLOK;
|
---|
554 | }
|
---|
555 |
|
---|
556 | static int
|
---|
557 | PackHashTable (void)
|
---|
558 | {
|
---|
559 | int s, d;
|
---|
560 | for (s = d = 0; s < HashSize; s++)
|
---|
561 | if (HashTable[s])
|
---|
562 | HashTable[d++] = HashTable[s];
|
---|
563 | ChangeMem (-sizeof (hash_rec *) * HashSize);
|
---|
564 | ChangeMem (sizeof (hash_rec *) * HashUsed);
|
---|
565 | if (!(HashTable = Xrealloc (HashTable, sizeof (hash_rec *) * HashUsed)))
|
---|
566 | {
|
---|
567 | Message ("Out of memory");
|
---|
568 | return COMPERROR;
|
---|
569 | }
|
---|
570 | HashSize = HashUsed;
|
---|
571 | return COMPALLOK;
|
---|
572 | }
|
---|
573 |
|
---|
574 |
|
---|
575 |
|
---|
576 |
|
---|
577 |
|
---|
578 | static int
|
---|
579 | ent_comp (const void *A, const void *B)
|
---|
580 | {
|
---|
581 | u_char *s1 = (*((hash_rec **) A))->word;
|
---|
582 | u_char *s2 = (*((hash_rec **) B))->word;
|
---|
583 |
|
---|
584 | return (casecompare (s1, s2)); /* [RPAP - Jan 97: Stem Index Change] */
|
---|
585 | } /* stem_comp */
|
---|
586 |
|
---|
587 |
|
---|
588 |
|
---|
589 |
|
---|
590 |
|
---|
591 | /*
|
---|
592 | * void count_text()
|
---|
593 | *
|
---|
594 | * The maths used in this function is described in the paper "Coding for
|
---|
595 | * Compression in Full-Text Retrieval Systems"
|
---|
596 | *
|
---|
597 | */
|
---|
598 | static void
|
---|
599 | count_text ()
|
---|
600 | {
|
---|
601 | int i;
|
---|
602 | mg_ullong oldL12_bits = 0;
|
---|
603 | for (i = 0; i < HashUsed; i++)
|
---|
604 | {
|
---|
605 | hash_rec *wrd = HashTable[i];
|
---|
606 | /* estimate size of a level 1 inverted file */
|
---|
607 | L1_bits += (int) (0.99 + wrd->fcnt * (1.6 + log2 (1.0 * callnum / wrd->fcnt)));
|
---|
608 | L1_ohead += BIO_Gamma_Length (wrd->fcnt);
|
---|
609 |
|
---|
610 | L2_bits += BIO_Unary_Length (wrd->wcnt);
|
---|
611 | L2_ohead += BIO_Gamma_Length (wrd->wcnt - wrd->fcnt + 1);
|
---|
612 |
|
---|
613 | L3_bits += (int) (0.99 + wrd->wcnt *
|
---|
614 | (1.6 + log2 (1.0 * words_read / (wrd->wcnt + callnum))));
|
---|
615 | L3_ohead += 0;
|
---|
616 |
|
---|
617 | /* check for overflow */
|
---|
618 | if (L1_bits + L2_bits < oldL12_bits) {
|
---|
619 | fprintf(stderr, "ERROR: Inverted file size will probably overflow %d byte unsigned integer\n", sizeof (mg_ullong));
|
---|
620 | fprintf(stderr, " counter in pass 2.\n");
|
---|
621 | if (sizeof (mg_ullong) < 8) {
|
---|
622 | fprintf(stderr, " Try compiling with GCC to enable use of 8 bytes for this counter.\n");
|
---|
623 | }
|
---|
624 | fprintf(stderr, " Build aborted.\n");
|
---|
625 | exit(1);
|
---|
626 | }
|
---|
627 | oldL12_bits = L1_bits + L2_bits;
|
---|
628 | }
|
---|
629 | L3_bits = (L3_bits + L2_bits + L1_bits + 7) / 8;
|
---|
630 | L3_ohead = (L3_ohead + L2_ohead + L1_ohead + 7) / 8;
|
---|
631 | L2_bits = (L2_bits + L1_bits + 7) / 8;
|
---|
632 | L2_ohead = (L2_ohead + L1_ohead + 7) / 8;
|
---|
633 | L1_bits = (L1_bits + 7) / 8;
|
---|
634 | L1_ohead = (L1_ohead + 7) / 8;
|
---|
635 | } /* count_text */
|
---|
636 |
|
---|
637 |
|
---|
638 |
|
---|
639 | /*
|
---|
640 | * write_stem_file():
|
---|
641 | * writes out the stemmed dictionary file
|
---|
642 | * in the following format
|
---|
643 | * lookback value (int)
|
---|
644 | * totalbytes value (int)
|
---|
645 | * indexstringbytes (int)
|
---|
646 | * for each word
|
---|
647 | * wordlen (4 bits)
|
---|
648 | * prefix match (4 bits)
|
---|
649 | * word (wordlen bytes)
|
---|
650 | * word frequency (int)
|
---|
651 | * word count (int)
|
---|
652 | *
|
---|
653 | * Accesses outside variables:
|
---|
654 | *
|
---|
655 | * Return value...:
|
---|
656 | */
|
---|
657 |
|
---|
658 | static void
|
---|
659 | write_stem_file (char *file_name)
|
---|
660 | {
|
---|
661 | long j;
|
---|
662 | struct invf_dict_header idh;
|
---|
663 | long lookback = (1 << LOGLOOKBACK); /* ???? */
|
---|
664 | long totalbytes = 0; /* The sum of the length of all words, including
|
---|
665 | the length byte */
|
---|
666 | long indexstringbytes = 0; /* The amount of space required to store the
|
---|
667 | words in the diction, this takes into account
|
---|
668 | the prefixes */
|
---|
669 | u_char *lastword = NULL; /* A pointer to the last word processed */
|
---|
670 | FILE *sp;
|
---|
671 |
|
---|
672 |
|
---|
673 | /* Calculate the size of certain things */
|
---|
674 | for (j = 0; j < HashSize; j++)
|
---|
675 | {
|
---|
676 | u_char *word = HashTable[j]->word;
|
---|
677 | indexstringbytes += word[0] + 2;
|
---|
678 | totalbytes += word[0] + 1;
|
---|
679 | if (lastword)
|
---|
680 | indexstringbytes -= prefixlen (lastword, word);
|
---|
681 | lastword = word;
|
---|
682 | }
|
---|
683 |
|
---|
684 | lastword = NULL;
|
---|
685 |
|
---|
686 | if (!(sp = create_file (file_name, INVF_DICT_SUFFIX, "wb", MAGIC_STEM_BUILD,
|
---|
687 | MG_MESSAGE))) /* [RPAP - Feb 97: WIN32 Port] */
|
---|
688 | return;
|
---|
689 |
|
---|
690 | /* [RPAP - Jan 97: Endian Ordering] */
|
---|
691 | HTONUL2(lookback, idh.lookback);
|
---|
692 | HTONUL2(HashSize, idh.dict_size);
|
---|
693 | HTONUL2(totalbytes, idh.total_bytes);
|
---|
694 | HTONUL2(indexstringbytes, idh.index_string_bytes);
|
---|
695 | HTOND2(inputbytes, idh.input_bytes); /* [RJM 07/97: 4G limit] */
|
---|
696 | HTONUL2(callnum, idh.num_of_docs);
|
---|
697 | HTONUL2(callnum, idh.static_num_of_docs);
|
---|
698 | HTONUL2(words_read, idh.num_of_words);
|
---|
699 | HTONUL2(stemmer_num, idh.stemmer_num);
|
---|
700 | HTONUL2(stem_method, idh.stem_method);
|
---|
701 |
|
---|
702 | fwrite ((char *) &idh, sizeof (idh), 1, sp);
|
---|
703 | outputbytes += sizeof (idh);
|
---|
704 |
|
---|
705 | for (j = 0; j < HashSize; j++)
|
---|
706 | {
|
---|
707 | int i;
|
---|
708 | unsigned long wcnt, fcnt; /* [RPAP - Jan 97: Endian Ordering] */
|
---|
709 | hash_rec *ent = HashTable[j];
|
---|
710 | if (lastword != NULL)
|
---|
711 | /* look for prefix match with prev string */
|
---|
712 | i = prefixlen (lastword, ent->word);
|
---|
713 | else
|
---|
714 | i = 0;
|
---|
715 | fputc (i, sp);
|
---|
716 | fputc (ent->word[0] - i, sp);
|
---|
717 | fwrite ((char *) ent->word + i + 1, sizeof (u_char), ent->word[0] - i, sp);
|
---|
718 | outputbytes += ent->word[0] - i + 1;
|
---|
719 |
|
---|
720 | /* [RPAP - Jan 97: Endian Ordering] */
|
---|
721 | HTONUL2(ent->fcnt, fcnt);
|
---|
722 | fwrite ((char *) &(fcnt), sizeof (fcnt), 1, sp);
|
---|
723 |
|
---|
724 | outputbytes += sizeof (ent->fcnt);
|
---|
725 |
|
---|
726 | /* [RPAP - Jan 97: Endian Ordering] */
|
---|
727 | HTONUL2(ent->wcnt, wcnt);
|
---|
728 | fwrite ((char *) &(wcnt), sizeof (wcnt), 1, sp);
|
---|
729 |
|
---|
730 | outputbytes += sizeof (ent->wcnt);
|
---|
731 | lastword = ent->word;
|
---|
732 | }
|
---|
733 |
|
---|
734 | fclose (sp);
|
---|
735 | } /* write_stem_file() */
|
---|
736 |
|
---|
737 |
|
---|
738 |
|
---|
739 | /*
|
---|
740 | * write_codes():
|
---|
741 | * calls functions to assign and write out codes and
|
---|
742 | * then prints out stats about execution results.
|
---|
743 | *
|
---|
744 | * Accesses outside variables: z
|
---|
745 | *
|
---|
746 | * Return value...: z
|
---|
747 | */
|
---|
748 |
|
---|
749 | static void
|
---|
750 | write_codes (char *file_name)
|
---|
751 | {
|
---|
752 | unsigned long dicts = 0;
|
---|
753 | outputbytes = 0;
|
---|
754 | write_stem_file (file_name);
|
---|
755 | dicts = outputbytes;
|
---|
756 |
|
---|
757 | #ifndef SILENT
|
---|
758 | Message ("Chunks written : %d\n",
|
---|
759 | ChunksWritten);
|
---|
760 | if (InvfLevel == 3)
|
---|
761 | Message ("Paragraphs : %8d\n", callnum);
|
---|
762 | Message ("Peak memory usage : %10.1f Mb\n",
|
---|
763 | (double) MaxMemInUse / 1024.0 / 1024.0);
|
---|
764 | Message ("Stems size : %10.1f kB %5.2f%%\n",
|
---|
765 | (double) dicts / 1024, (double) 100 * dicts / inputbytes);
|
---|
766 | Message ("Lev 1 inverted file : %10.1f kB %5.2f%%\n",
|
---|
767 | (double) (L1_bits + L1_ohead) / 1024,
|
---|
768 | (double) 100 * (L1_bits + L1_ohead) / inputbytes);
|
---|
769 | Message ("Lev 2 inverted file : %10.1f kB %5.2f%%\n",
|
---|
770 | (double) (L2_bits + L2_ohead) / 1024,
|
---|
771 | (double) 100 * (L2_bits + L2_ohead) / inputbytes);
|
---|
772 | Message ("Lev 3 inverted file : %10.1f kB %5.2f%%\n",
|
---|
773 | (double) (L3_bits + L3_ohead) / 1024,
|
---|
774 | (double) 100 * (L3_bits + L3_ohead) / inputbytes);
|
---|
775 | Message ("Record addresses : %10.1f kB %5.2f%%\n",
|
---|
776 | (double) words_diff * 4 / 1024, (double) 100 * words_diff * 4 / inputbytes);
|
---|
777 | #endif
|
---|
778 | } /* write_codes() */
|
---|
779 |
|
---|
780 |
|
---|
781 |
|
---|
782 | void
|
---|
783 | write_num_file (char *file_name)
|
---|
784 | {
|
---|
785 |
|
---|
786 | int i;
|
---|
787 | FILE *f;
|
---|
788 |
|
---|
789 | for (i = 0; i < HashSize; i++)
|
---|
790 | HashTable[i]->fnum = i;
|
---|
791 |
|
---|
792 | if (!(f = create_file (file_name, INVF_CHUNK_TRANS_SUFFIX, "wb",
|
---|
793 | MAGIC_CHUNK_TRANS, MG_MESSAGE))) /* [RPAP - Feb 97: WIN32 Port] */
|
---|
794 | return;
|
---|
795 |
|
---|
796 | #if 1
|
---|
797 | ENCODE_START (f)
|
---|
798 |
|
---|
799 | for (i = 0; i < HashSize; i++)
|
---|
800 | BINARY_ENCODE (first_occr[i]->fnum + 1, HashSize + 1);
|
---|
801 |
|
---|
802 | ENCODE_DONE;
|
---|
803 | #else
|
---|
804 | for (i = 0; i < HashSize; i++)
|
---|
805 | {
|
---|
806 | /* [RPAP - Jan 97: Endian Ordering] */
|
---|
807 | unsigned long fnum;
|
---|
808 | HTONUL2(first_occur[i]->fnum, fnum);
|
---|
809 | fwrite ((char *) &fnum, sizeof (unsigned long), 1, f);
|
---|
810 | }
|
---|
811 | #endif
|
---|
812 |
|
---|
813 | fclose (f);
|
---|
814 | }
|
---|
815 |
|
---|
816 | int
|
---|
817 | done_ivf_1 (char *FileName)
|
---|
818 | {
|
---|
819 | char *temp_str = msg_prefix;
|
---|
820 | msg_prefix = "ivf.pass1";
|
---|
821 | #ifndef SILENT
|
---|
822 | Message ("Mem reqd for 1 chunk: %8u bytes\n",
|
---|
823 | max_mem_reqd ());
|
---|
824 | #endif
|
---|
825 | if (lcallnum)
|
---|
826 | dump_dict (mem_reqd ());
|
---|
827 |
|
---|
828 | if (PackHashTable () == COMPERROR)
|
---|
829 | return COMPERROR;
|
---|
830 |
|
---|
831 | qsort (HashTable, HashUsed, sizeof (hash_rec *), ent_comp);
|
---|
832 |
|
---|
833 | count_text ();
|
---|
834 | write_codes (FileName);
|
---|
835 |
|
---|
836 | ENCODE_CONTINUE (sbs)
|
---|
837 | GAMMA_ENCODE (1);
|
---|
838 | ENCODE_DONE
|
---|
839 | fseek (ic, sizeof (long), 0);
|
---|
840 |
|
---|
841 | HTONSL(max_mem); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
842 | fwrite (&max_mem, sizeof (max_mem), 1, ic);
|
---|
843 | NTOHSL(max_mem); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
844 |
|
---|
845 | fclose (ic);
|
---|
846 |
|
---|
847 | write_num_file (FileName);
|
---|
848 |
|
---|
849 | msg_prefix = temp_str;
|
---|
850 |
|
---|
851 | /* free the memory */
|
---|
852 | Xfree(HashTable);
|
---|
853 | if (Pool) {
|
---|
854 | Pool -= (POOL_SIZE-PoolLeft);
|
---|
855 | Xfree(Pool);
|
---|
856 | }
|
---|
857 | if (hr_pool) {
|
---|
858 | hr_pool -= (HASH_POOL_SIZE-hr_PoolLeft);
|
---|
859 | Xfree(hr_pool);
|
---|
860 | }
|
---|
861 | if (first_occr) {
|
---|
862 | Xfree(first_occr);
|
---|
863 | }
|
---|
864 |
|
---|
865 | return (COMPALLOK);
|
---|
866 | } /* done_encode */
|
---|