1 | /**************************************************************************
|
---|
2 | *
|
---|
3 | * ivf.pass1.c -- Memory efficient pass 1 inversion
|
---|
4 | * Copyright (C) 1994 Neil Sharman
|
---|
5 | *
|
---|
6 | * This program is free software; you can redistribute it and/or modify
|
---|
7 | * it under the terms of the GNU General Public License as published by
|
---|
8 | * the Free Software Foundation; either version 2 of the License, or
|
---|
9 | * (at your option) any later version.
|
---|
10 | *
|
---|
11 | * This program is distributed in the hope that it will be useful,
|
---|
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
14 | * GNU General Public License for more details.
|
---|
15 | *
|
---|
16 | * You should have received a copy of the GNU General Public License
|
---|
17 | * along with this program; if not, write to the Free Software
|
---|
18 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
19 | *
|
---|
20 | * $Id: ivf.pass1.c 3745 2003-02-20 21:20:24Z mdewsnip $
|
---|
21 | *
|
---|
22 | **************************************************************************/
|
---|
23 |
|
---|
24 | #include "sysfuncs.h"
|
---|
25 |
|
---|
26 | #include "memlib.h"
|
---|
27 | #include "messages.h"
|
---|
28 | #include "bitio_m.h"
|
---|
29 | #include "bitio_m_stdio.h"
|
---|
30 | #include "bitio_gen.h"
|
---|
31 | #include "local_strings.h"
|
---|
32 | #include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
|
---|
33 |
|
---|
34 | #include "mg_files.h"
|
---|
35 | #include "invf.h"
|
---|
36 | #include "mg.h"
|
---|
37 | #include "build.h"
|
---|
38 | #include "locallib.h"
|
---|
39 | #include "words.h"
|
---|
40 | #include "stemmer.h"
|
---|
41 | #include "hash.h"
|
---|
42 |
|
---|
43 | #include "longlong.h"
|
---|
44 |
|
---|
45 | /*
|
---|
46 | $Log$
|
---|
47 | Revision 1.1 2003/02/20 21:18:23 mdewsnip
|
---|
48 | Addition of MG package for search and retrieval
|
---|
49 |
|
---|
50 | Revision 1.2 2001/09/21 12:46:42 kjm18
|
---|
51 | updated mg to be in line with mg_1.3f. Now uses long long for some variables
|
---|
52 | to enable indexing of very large collections.
|
---|
53 |
|
---|
54 | Revision 1.1 1999/08/10 21:17:54 sjboddie
|
---|
55 | renamed mg-1.3d directory mg
|
---|
56 |
|
---|
57 | Revision 1.3 1998/12/17 09:12:50 rjmcnab
|
---|
58 |
|
---|
59 | Altered mg to process utf-8 encoded Unicode. The main changes
|
---|
60 | are in the parsing of the input, the casefolding, and the stemming.
|
---|
61 |
|
---|
62 | Revision 1.2 1998/11/25 07:55:43 rjmcnab
|
---|
63 |
|
---|
64 | Modified mg to that you can specify the stemmer you want
|
---|
65 | to use via a command line option. You specify it to
|
---|
66 | mg_passes during the build process. The number of the
|
---|
67 | stemmer that you used is stored within the inverted
|
---|
68 | dictionary header and the stemmed dictionary header so
|
---|
69 | the correct stemmer is used in later stages of building
|
---|
70 | and querying.
|
---|
71 |
|
---|
72 | Revision 1.1 1998/11/17 09:34:44 rjmcnab
|
---|
73 | *** empty log message ***
|
---|
74 |
|
---|
75 | * Revision 1.6 1995/01/16 03:57:05 tes
|
---|
76 | * Fixed bug for index_string_bytes which was calculated
|
---|
77 | * by adding the suffix lengths except for every lookback (block)
|
---|
78 | * number of words. This was incorrect as invf.dict is fully
|
---|
79 | * front coded - perhaps it was partially coded in the past.
|
---|
80 | * This would only affect a statistic - hence the reason why the
|
---|
81 | * bug could exist ;-)
|
---|
82 | *
|
---|
83 | * Revision 1.5 1994/11/29 00:31:59 tes
|
---|
84 | * Committing the new merged files and changes.
|
---|
85 | *
|
---|
86 | * Revision 1.3 1994/10/20 03:56:48 tes
|
---|
87 | * I have rewritten the boolean query optimiser and abstracted out the
|
---|
88 | * components of the boolean query.
|
---|
89 | *
|
---|
90 | * Revision 1.2 1994/09/20 04:41:34 tes
|
---|
91 | * For version 1.1
|
---|
92 | *
|
---|
93 | */
|
---|
94 |
|
---|
95 | /*
|
---|
96 | * Modified:
|
---|
97 | * - long long bit counts for inverted file
|
---|
98 | * (1999-08-05 Tim Bell <[email protected]>)
|
---|
99 | */
|
---|
100 | static char *RCSID = "$Id: ivf.pass1.c 3745 2003-02-20 21:20:24Z mdewsnip $";
|
---|
101 |
|
---|
102 | #define LOGLOOKBACK 2
|
---|
103 | #define POOL_SIZE 1024*1024
|
---|
104 | #define HASH_POOL_SIZE 8192
|
---|
105 | #define INITIAL_HASH_SIZE 7927
|
---|
106 |
|
---|
107 | #define INIT_CHECK_FRAC 0.10
|
---|
108 | #define CHECK_FRAC 0.75
|
---|
109 | #define CHECK_CLOSE 0.999
|
---|
110 | #define PARA_DIV 1.5
|
---|
111 | #define NORM_DIV 1.0
|
---|
112 |
|
---|
113 |
|
---|
114 | typedef struct hash_rec
|
---|
115 | {
|
---|
116 | unsigned long fcnt; /* fragment frequency */
|
---|
117 | unsigned long lfcnt; /* local fragment frequency */
|
---|
118 | unsigned long fnum; /* last fragment to use stem */
|
---|
119 | unsigned long wcnt; /* stem frequency */
|
---|
120 | unsigned long lwcnt; /* local stem frequency */
|
---|
121 | u_char *word;
|
---|
122 | }
|
---|
123 | hash_rec;
|
---|
124 |
|
---|
125 |
|
---|
126 |
|
---|
127 |
|
---|
128 |
|
---|
129 |
|
---|
130 |
|
---|
131 |
|
---|
132 | static unsigned long words_read = 0, words_diff = 0, bytes_diff = 0;
|
---|
133 | static unsigned long outputbytes = 0;
|
---|
134 | static unsigned long inputbytes = 0;
|
---|
135 | static unsigned long MaxMemInUse = 0;
|
---|
136 | static unsigned long MemInUse = 0;
|
---|
137 | static unsigned long ChunksWritten = 0;
|
---|
138 |
|
---|
139 |
|
---|
140 | static FILE *ic; /* The invf block file */
|
---|
141 | static stdio_bitio_state sbs;
|
---|
142 |
|
---|
143 | static hash_rec **HashTable;
|
---|
144 | static unsigned long HashSize;
|
---|
145 | static unsigned long HashUsed;
|
---|
146 | static u_char *Pool;
|
---|
147 | static int PoolLeft;
|
---|
148 |
|
---|
149 | static hash_rec *hr_pool;
|
---|
150 | static int hr_PoolLeft;
|
---|
151 |
|
---|
152 | static hash_rec **first_occr;
|
---|
153 | static int max_first_occr;
|
---|
154 |
|
---|
155 | static mg_ullong L1_bits = 0;
|
---|
156 | static mg_ullong L2_bits = 0;
|
---|
157 | static mg_ullong L3_bits = 0;
|
---|
158 | static unsigned long L1_ohead = 0;
|
---|
159 | static unsigned long L2_ohead = 0;
|
---|
160 | static unsigned long L3_ohead = 0;
|
---|
161 | static unsigned long callnum = 0, lcallnum = 0, wordnum = 0, lwordnum = 0;
|
---|
162 | static unsigned long ptrcnt = 0;
|
---|
163 | static unsigned long checknum;
|
---|
164 | static long max_mem = 0;
|
---|
165 |
|
---|
166 |
|
---|
167 | static void
|
---|
168 | ChangeMem (int Change)
|
---|
169 | {
|
---|
170 | MemInUse += Change;
|
---|
171 | if (MemInUse > MaxMemInUse)
|
---|
172 | MaxMemInUse = MemInUse;
|
---|
173 | }
|
---|
174 |
|
---|
175 |
|
---|
176 |
|
---|
177 | int
|
---|
178 | init_ivf_1 (char *file_name)
|
---|
179 | {
|
---|
180 | if (!(ic = create_file (file_name, INVF_CHUNK_SUFFIX, "wb", MAGIC_CHUNK,
|
---|
181 | MG_MESSAGE))) /* [RPAP - Feb 97: WIN32 Port] */
|
---|
182 | return (COMPERROR);
|
---|
183 | fwrite (" ", sizeof (u_long), 1, ic); /* Space for the maxmem */
|
---|
184 | ENCODE_START (ic)
|
---|
185 | ENCODE_PAUSE (sbs)
|
---|
186 |
|
---|
187 | if (!(Pool = Xmalloc (POOL_SIZE)))
|
---|
188 | {
|
---|
189 | Message ("Unable to allocate memory for pool");
|
---|
190 | return (COMPERROR);
|
---|
191 | }
|
---|
192 | PoolLeft = POOL_SIZE;
|
---|
193 | ChangeMem (POOL_SIZE);
|
---|
194 |
|
---|
195 | if (!(hr_pool = Xmalloc (HASH_POOL_SIZE * sizeof (hash_rec))))
|
---|
196 | {
|
---|
197 | Message ("Unable to allocate memory for pool");
|
---|
198 | return (COMPERROR);
|
---|
199 | }
|
---|
200 | hr_PoolLeft = HASH_POOL_SIZE;
|
---|
201 | ChangeMem (HASH_POOL_SIZE * sizeof (hash_rec));
|
---|
202 |
|
---|
203 | HashSize = INITIAL_HASH_SIZE;
|
---|
204 | HashUsed = 0;
|
---|
205 | if (!(HashTable = Xmalloc (sizeof (hash_rec *) * HashSize)))
|
---|
206 | {
|
---|
207 | Message ("Unable to allocate memory for table");
|
---|
208 | return (COMPERROR);
|
---|
209 | }
|
---|
210 | ChangeMem (sizeof (hash_rec *) * HashSize);
|
---|
211 | bzero ((char *) HashTable, sizeof (hash_rec *) * HashSize);
|
---|
212 | max_first_occr = 8192;
|
---|
213 | if (!(first_occr = Xmalloc (sizeof (hash_rec *) * max_first_occr)))
|
---|
214 | {
|
---|
215 | Message ("Unable to allocate memory for first_occur");
|
---|
216 | return (COMPERROR);
|
---|
217 | }
|
---|
218 | ChangeMem (sizeof (hash_rec *) * max_first_occr);
|
---|
219 |
|
---|
220 | checknum = (invf_buffer_size * INIT_CHECK_FRAC) /
|
---|
221 | (InvfLevel == 3 ? PARA_DIV : NORM_DIV);
|
---|
222 |
|
---|
223 | return (COMPALLOK);
|
---|
224 | }
|
---|
225 |
|
---|
226 |
|
---|
227 |
|
---|
228 | static unsigned long
|
---|
229 | mem_reqd (void)
|
---|
230 | {
|
---|
231 | register int i;
|
---|
232 | register unsigned long total = 0;
|
---|
233 | /* register unsigned long N = InvfLevel == 3 ? lwordnum : lcallnum; */
|
---|
234 | register unsigned long N = lcallnum;
|
---|
235 | for (i = 0; i < HashUsed; i++)
|
---|
236 | {
|
---|
237 | register hash_rec *ent = first_occr[i];
|
---|
238 | /* register unsigned long p = InvfLevel == 3 ? ent->lwcnt : ent->lfcnt; */
|
---|
239 | register unsigned long p = ent->lfcnt;
|
---|
240 | if (p)
|
---|
241 | total += BIO_Bblock_Bound (N, p);
|
---|
242 | if (InvfLevel >= 2)
|
---|
243 | total += ent->lwcnt;
|
---|
244 | }
|
---|
245 |
|
---|
246 | total = (total + 7) >> 3;
|
---|
247 | return total;
|
---|
248 | }
|
---|
249 |
|
---|
250 | static unsigned long
|
---|
251 | max_mem_reqd (void)
|
---|
252 | {
|
---|
253 | register int i;
|
---|
254 | register unsigned long total = 0;
|
---|
255 | /* register unsigned long N = InvfLevel == 3 ? wordnum : callnum; */
|
---|
256 | register unsigned long N = callnum;
|
---|
257 | for (i = 0; i < HashUsed; i++)
|
---|
258 | {
|
---|
259 | register hash_rec *ent = first_occr[i];
|
---|
260 | /* register unsigned long p = InvfLevel == 3 ? ent->wcnt : ent->fcnt; */
|
---|
261 | register unsigned long p = ent->fcnt;
|
---|
262 | if (p)
|
---|
263 | total += BIO_Bblock_Bound (N, p);
|
---|
264 | if (InvfLevel >= 2)
|
---|
265 | total += ent->wcnt;
|
---|
266 | }
|
---|
267 |
|
---|
268 | total = (total + 7) >> 3;
|
---|
269 | return total;
|
---|
270 | }
|
---|
271 |
|
---|
272 |
|
---|
273 | static void
|
---|
274 | dump_dict (unsigned long mem)
|
---|
275 | {
|
---|
276 | int i;
|
---|
277 |
|
---|
278 | ChunksWritten++;
|
---|
279 |
|
---|
280 | ENCODE_CONTINUE (sbs)
|
---|
281 |
|
---|
282 | GAMMA_ENCODE (lcallnum + 1);
|
---|
283 | if (mem > max_mem)
|
---|
284 | max_mem = mem;
|
---|
285 | GAMMA_ENCODE (mem + 1);
|
---|
286 | lwordnum = lcallnum = 0;
|
---|
287 | GAMMA_ENCODE (HashUsed + 1);
|
---|
288 | for (i = 0; i < HashUsed; i++)
|
---|
289 | {
|
---|
290 | hash_rec *ent = first_occr[i];
|
---|
291 | GAMMA_ENCODE (ent->lwcnt + 1);
|
---|
292 | if (ent->lwcnt >= 2)
|
---|
293 | GAMMA_ENCODE (ent->lfcnt);
|
---|
294 | ent->lwcnt = ent->lfcnt = 0;
|
---|
295 | }
|
---|
296 | ptrcnt = 0;
|
---|
297 | ENCODE_PAUSE (sbs)
|
---|
298 | }
|
---|
299 |
|
---|
300 |
|
---|
301 |
|
---|
302 | static int
|
---|
303 | process_doc (u_char * s_in, int l_in)
|
---|
304 | {
|
---|
305 | u_char *end = s_in + l_in - 1;
|
---|
306 |
|
---|
307 | callnum++;
|
---|
308 | lcallnum++;
|
---|
309 | inputbytes += l_in;
|
---|
310 |
|
---|
311 | if (!inaword (s_in, end))
|
---|
312 | if (SkipSGML)
|
---|
313 | PARSE_NON_STEM_WORD_OR_SGML_TAG (s_in, end);
|
---|
314 | else
|
---|
315 | PARSE_NON_STEM_WORD (s_in, end);
|
---|
316 | /*
|
---|
317 | ** Alternately parse off words and non-words from the input
|
---|
318 | ** stream beginning with a non-word. Each token is then
|
---|
319 | ** inserted into the set if it does not exist or has it's
|
---|
320 | ** frequency count incremented if it does.
|
---|
321 | */
|
---|
322 | while (s_in <= end)
|
---|
323 | {
|
---|
324 | u_char Word[MAXSTEMLEN + 1];
|
---|
325 |
|
---|
326 | PARSE_STEM_WORD (Word, s_in, end);
|
---|
327 |
|
---|
328 | stemmer (stem_method, stemmer_num, Word);
|
---|
329 | if (SkipSGML)
|
---|
330 | PARSE_NON_STEM_WORD_OR_SGML_TAG (s_in, end);
|
---|
331 | else
|
---|
332 | PARSE_NON_STEM_WORD (s_in, end);
|
---|
333 |
|
---|
334 | words_read++;
|
---|
335 | wordnum++;
|
---|
336 | lwordnum++;
|
---|
337 | /* Search the hash table for Word */
|
---|
338 | {
|
---|
339 | register unsigned long hashval, step;
|
---|
340 | register int hsize = HashSize;
|
---|
341 | HASH (hashval, step, Word, hsize);
|
---|
342 | for (;;)
|
---|
343 | {
|
---|
344 | register u_char *s1;
|
---|
345 | register u_char *s2;
|
---|
346 | register int len;
|
---|
347 | register hash_rec *ent;
|
---|
348 | ent = HashTable[hashval];
|
---|
349 | if (!ent)
|
---|
350 | {
|
---|
351 | int len = *Word + 1;
|
---|
352 | if (!hr_PoolLeft)
|
---|
353 | {
|
---|
354 | if (!(hr_pool = Xmalloc (HASH_POOL_SIZE *
|
---|
355 | sizeof (hash_rec))))
|
---|
356 | {
|
---|
357 | Message ("Unable to allocate memory for pool");
|
---|
358 | return (COMPERROR);
|
---|
359 | }
|
---|
360 | hr_PoolLeft = HASH_POOL_SIZE;
|
---|
361 | ChangeMem (HASH_POOL_SIZE * sizeof (hash_rec));
|
---|
362 | }
|
---|
363 | ent = hr_pool++;
|
---|
364 | hr_PoolLeft--;
|
---|
365 | if (len > PoolLeft)
|
---|
366 | {
|
---|
367 | if (!(Pool = Xmalloc (POOL_SIZE)))
|
---|
368 | {
|
---|
369 | Message ("Unable to allocate memory for pool");
|
---|
370 | return (COMPERROR);
|
---|
371 | }
|
---|
372 | PoolLeft = POOL_SIZE;
|
---|
373 | ChangeMem (POOL_SIZE);
|
---|
374 | }
|
---|
375 | ent->wcnt = 1;
|
---|
376 | ent->fcnt = 1;
|
---|
377 | ent->lwcnt = 1;
|
---|
378 | ent->lfcnt = 1;
|
---|
379 | ent->fnum = callnum;
|
---|
380 | ent->word = Pool;
|
---|
381 | memcpy (Pool, Word, len);
|
---|
382 | Pool += len;
|
---|
383 | PoolLeft -= len;
|
---|
384 | if (HashUsed == max_first_occr - 1)
|
---|
385 | {
|
---|
386 | ChangeMem (-sizeof (hash_rec *) * max_first_occr);
|
---|
387 | max_first_occr *= 2;
|
---|
388 | if (!(first_occr = Xrealloc (first_occr, sizeof (hash_rec *) *
|
---|
389 | max_first_occr)))
|
---|
390 | {
|
---|
391 | Message ("Unable to allocate memory for first_occr");
|
---|
392 | return (COMPERROR);
|
---|
393 | }
|
---|
394 | ChangeMem (sizeof (hash_rec *) * max_first_occr);
|
---|
395 | }
|
---|
396 | first_occr[HashUsed] = ent;
|
---|
397 | HashUsed++;
|
---|
398 | HashTable[hashval] = ent;
|
---|
399 | bytes_diff += Word[0];
|
---|
400 | break;
|
---|
401 | }
|
---|
402 |
|
---|
403 | /* Compare the words */
|
---|
404 | s1 = Word;
|
---|
405 | s2 = ent->word;
|
---|
406 | len = *s1 + 1;
|
---|
407 | for (; len; len--)
|
---|
408 | if (*s1++ != *s2++)
|
---|
409 | break;
|
---|
410 |
|
---|
411 | if (len)
|
---|
412 | {
|
---|
413 | hashval = (hashval + step);
|
---|
414 | if (hashval >= hsize)
|
---|
415 | hashval -= hsize;
|
---|
416 | }
|
---|
417 | else
|
---|
418 | {
|
---|
419 | ent->wcnt++;
|
---|
420 | ent->lwcnt++;
|
---|
421 | if (callnum > ent->fnum)
|
---|
422 | {
|
---|
423 | ptrcnt++;
|
---|
424 | ent->fcnt++;
|
---|
425 | ent->lfcnt++;
|
---|
426 | ent->fnum = callnum;
|
---|
427 | }
|
---|
428 | break;
|
---|
429 | }
|
---|
430 | }
|
---|
431 | }
|
---|
432 |
|
---|
433 | if (HashUsed >= HashSize >> 1)
|
---|
434 | {
|
---|
435 | hash_rec **ht;
|
---|
436 | unsigned long size;
|
---|
437 | unsigned long i;
|
---|
438 | size = prime (HashSize * 2);
|
---|
439 | if (!(ht = Xmalloc (sizeof (hash_rec *) * size)))
|
---|
440 | {
|
---|
441 | Message ("Unable to allocate memory for table");
|
---|
442 | return (COMPERROR);
|
---|
443 | }
|
---|
444 | bzero ((char *) ht, sizeof (hash_rec *) * size);
|
---|
445 | ChangeMem (sizeof (hash_rec *) * size);
|
---|
446 |
|
---|
447 | for (i = 0; i < HashSize; i++)
|
---|
448 | if (HashTable[i])
|
---|
449 | {
|
---|
450 | register u_char *wptr;
|
---|
451 | hash_rec *ent;
|
---|
452 | register unsigned long hashval, step;
|
---|
453 |
|
---|
454 | wptr = HashTable[i]->word;
|
---|
455 | HASH (hashval, step, wptr, size);
|
---|
456 | ent = ht[hashval];
|
---|
457 | while (ent)
|
---|
458 | {
|
---|
459 | hashval += step;
|
---|
460 | if (hashval >= size)
|
---|
461 | hashval -= size;
|
---|
462 | ent = ht[hashval];
|
---|
463 | }
|
---|
464 | ht[hashval] = HashTable[i];
|
---|
465 | }
|
---|
466 | Xfree (HashTable);
|
---|
467 | ChangeMem (-sizeof (hash_rec *) * HashSize);
|
---|
468 | HashTable = ht;
|
---|
469 | HashSize = size;
|
---|
470 | }
|
---|
471 |
|
---|
472 | }
|
---|
473 |
|
---|
474 | if (ptrcnt >= checknum)
|
---|
475 | {
|
---|
476 | unsigned long mem;
|
---|
477 | /*fprintf(stderr, "Checking at %u . . . ", ptrcnt); */
|
---|
478 | mem = mem_reqd ();
|
---|
479 | if (mem >= invf_buffer_size * CHECK_CLOSE)
|
---|
480 | {
|
---|
481 | /*fprintf(stderr, "Got a match\n"); */
|
---|
482 | dump_dict (mem);
|
---|
483 | checknum = (invf_buffer_size * INIT_CHECK_FRAC) /
|
---|
484 | (InvfLevel == 3 ? PARA_DIV : NORM_DIV);
|
---|
485 | }
|
---|
486 | else
|
---|
487 | {
|
---|
488 | checknum = checknum * ((CHECK_FRAC * (invf_buffer_size - mem)) / mem) +
|
---|
489 | checknum;
|
---|
490 | if (checknum <= ptrcnt)
|
---|
491 | checknum = ptrcnt + 1;
|
---|
492 | /*fprintf(stderr, "Next check at %u\n", checknum); */
|
---|
493 | }
|
---|
494 | }
|
---|
495 | return (COMPALLOK);
|
---|
496 |
|
---|
497 | } /* encode */
|
---|
498 |
|
---|
499 |
|
---|
500 | int
|
---|
501 | process_ivf_1 (u_char * s_in, int l_in)
|
---|
502 | {
|
---|
503 | if (InvfLevel <= 2)
|
---|
504 | return process_doc (s_in, l_in);
|
---|
505 | else
|
---|
506 | {
|
---|
507 | int pos = 0;
|
---|
508 | u_char *start = s_in;
|
---|
509 | while (pos < l_in)
|
---|
510 | {
|
---|
511 | if (s_in[pos] == TERMPARAGRAPH)
|
---|
512 | {
|
---|
513 | int len = pos + s_in + 1 - start;
|
---|
514 | if (process_doc (start, len) != COMPALLOK)
|
---|
515 | return (COMPERROR);
|
---|
516 | start = s_in + pos + 1;
|
---|
517 | }
|
---|
518 | pos++;
|
---|
519 | }
|
---|
520 | if (start < s_in + pos)
|
---|
521 | return process_doc (start, pos + s_in - start);
|
---|
522 | }
|
---|
523 | return COMPALLOK;
|
---|
524 | }
|
---|
525 |
|
---|
526 | static int
|
---|
527 | PackHashTable (void)
|
---|
528 | {
|
---|
529 | int s, d;
|
---|
530 | for (s = d = 0; s < HashSize; s++)
|
---|
531 | if (HashTable[s])
|
---|
532 | HashTable[d++] = HashTable[s];
|
---|
533 | ChangeMem (-sizeof (hash_rec *) * HashSize);
|
---|
534 | ChangeMem (sizeof (hash_rec *) * HashUsed);
|
---|
535 | if (!(HashTable = Xrealloc (HashTable, sizeof (hash_rec *) * HashUsed)))
|
---|
536 | {
|
---|
537 | Message ("Out of memory");
|
---|
538 | return COMPERROR;
|
---|
539 | }
|
---|
540 | HashSize = HashUsed;
|
---|
541 | return COMPALLOK;
|
---|
542 | }
|
---|
543 |
|
---|
544 |
|
---|
545 |
|
---|
546 |
|
---|
547 |
|
---|
548 | static int
|
---|
549 | ent_comp (const void *A, const void *B)
|
---|
550 | {
|
---|
551 | u_char *s1 = (*((hash_rec **) A))->word;
|
---|
552 | u_char *s2 = (*((hash_rec **) B))->word;
|
---|
553 |
|
---|
554 | return (casecompare (s1, s2)); /* [RPAP - Jan 97: Stem Index Change] */
|
---|
555 | } /* stem_comp */
|
---|
556 |
|
---|
557 |
|
---|
558 |
|
---|
559 |
|
---|
560 |
|
---|
561 | /*
|
---|
562 | * void count_text()
|
---|
563 | *
|
---|
564 | * The maths used in this function is described in the paper "Coding for
|
---|
565 | * Compression in Full-Text Retrieval Systems"
|
---|
566 | *
|
---|
567 | */
|
---|
568 | static void
|
---|
569 | count_text ()
|
---|
570 | {
|
---|
571 | int i;
|
---|
572 | mg_ullong oldL12_bits = 0;
|
---|
573 | for (i = 0; i < HashUsed; i++)
|
---|
574 | {
|
---|
575 | hash_rec *wrd = HashTable[i];
|
---|
576 | /* estimate size of a level 1 inverted file */
|
---|
577 | L1_bits += (int) (0.99 + wrd->fcnt * (1.6 + log2 (1.0 * callnum / wrd->fcnt)));
|
---|
578 | L1_ohead += BIO_Gamma_Length (wrd->fcnt);
|
---|
579 |
|
---|
580 | L2_bits += BIO_Unary_Length (wrd->wcnt);
|
---|
581 | L2_ohead += BIO_Gamma_Length (wrd->wcnt - wrd->fcnt + 1);
|
---|
582 |
|
---|
583 | L3_bits += (int) (0.99 + wrd->wcnt *
|
---|
584 | (1.6 + log2 (1.0 * words_read / (wrd->wcnt + callnum))));
|
---|
585 | L3_ohead += 0;
|
---|
586 |
|
---|
587 | /* check for overflow */
|
---|
588 | if (L1_bits + L2_bits < oldL12_bits) {
|
---|
589 | fprintf(stderr, "ERROR: Inverted file size will probably overflow %d byte unsigned integer\n", sizeof (mg_ullong));
|
---|
590 | fprintf(stderr, " counter in pass 2.\n");
|
---|
591 | if (sizeof (mg_ullong) < 8) {
|
---|
592 | fprintf(stderr, " Try compiling with GCC to enable use of 8 bytes for this counter.\n");
|
---|
593 | }
|
---|
594 | fprintf(stderr, " Build aborted.\n");
|
---|
595 | exit(1);
|
---|
596 | }
|
---|
597 | oldL12_bits = L1_bits + L2_bits;
|
---|
598 | }
|
---|
599 | L3_bits = (L3_bits + L2_bits + L1_bits + 7) / 8;
|
---|
600 | L3_ohead = (L3_ohead + L2_ohead + L1_ohead + 7) / 8;
|
---|
601 | L2_bits = (L2_bits + L1_bits + 7) / 8;
|
---|
602 | L2_ohead = (L2_ohead + L1_ohead + 7) / 8;
|
---|
603 | L1_bits = (L1_bits + 7) / 8;
|
---|
604 | L1_ohead = (L1_ohead + 7) / 8;
|
---|
605 | } /* count_text */
|
---|
606 |
|
---|
607 |
|
---|
608 |
|
---|
609 | /*
|
---|
610 | * write_stem_file():
|
---|
611 | * writes out the stemmed dictionary file
|
---|
612 | * in the following format
|
---|
613 | * lookback value (int)
|
---|
614 | * totalbytes value (int)
|
---|
615 | * indexstringbytes (int)
|
---|
616 | * for each word
|
---|
617 | * wordlen (4 bits)
|
---|
618 | * prefix match (4 bits)
|
---|
619 | * word (wordlen bytes)
|
---|
620 | * word frequency (int)
|
---|
621 | * word count (int)
|
---|
622 | *
|
---|
623 | * Accesses outside variables:
|
---|
624 | *
|
---|
625 | * Return value...:
|
---|
626 | */
|
---|
627 |
|
---|
628 | static void
|
---|
629 | write_stem_file (char *file_name)
|
---|
630 | {
|
---|
631 | long j;
|
---|
632 | struct invf_dict_header idh;
|
---|
633 | long lookback = (1 << LOGLOOKBACK); /* ???? */
|
---|
634 | long totalbytes = 0; /* The sum of the length of all words, including
|
---|
635 | the length byte */
|
---|
636 | long indexstringbytes = 0; /* The amount of space required to store the
|
---|
637 | words in the diction, this takes into account
|
---|
638 | the prefixes */
|
---|
639 | u_char *lastword = NULL; /* A pointer to the last word processed */
|
---|
640 | FILE *sp;
|
---|
641 |
|
---|
642 |
|
---|
643 | /* Calculate the size of certain things */
|
---|
644 | for (j = 0; j < HashSize; j++)
|
---|
645 | {
|
---|
646 | u_char *word = HashTable[j]->word;
|
---|
647 | indexstringbytes += word[0] + 2;
|
---|
648 | totalbytes += word[0] + 1;
|
---|
649 | if (lastword)
|
---|
650 | indexstringbytes -= prefixlen (lastword, word);
|
---|
651 | lastword = word;
|
---|
652 | }
|
---|
653 |
|
---|
654 | lastword = NULL;
|
---|
655 |
|
---|
656 | if (!(sp = create_file (file_name, INVF_DICT_SUFFIX, "wb", MAGIC_STEM_BUILD,
|
---|
657 | MG_MESSAGE))) /* [RPAP - Feb 97: WIN32 Port] */
|
---|
658 | return;
|
---|
659 |
|
---|
660 | /* [RPAP - Jan 97: Endian Ordering] */
|
---|
661 | HTONUL2(lookback, idh.lookback);
|
---|
662 | HTONUL2(HashSize, idh.dict_size);
|
---|
663 | HTONUL2(totalbytes, idh.total_bytes);
|
---|
664 | HTONUL2(indexstringbytes, idh.index_string_bytes);
|
---|
665 | HTOND2(inputbytes, idh.input_bytes); /* [RJM 07/97: 4G limit] */
|
---|
666 | HTONUL2(callnum, idh.num_of_docs);
|
---|
667 | HTONUL2(callnum, idh.static_num_of_docs);
|
---|
668 | HTONUL2(words_read, idh.num_of_words);
|
---|
669 | HTONUL2(stemmer_num, idh.stemmer_num);
|
---|
670 | HTONUL2(stem_method, idh.stem_method);
|
---|
671 |
|
---|
672 | fwrite ((char *) &idh, sizeof (idh), 1, sp);
|
---|
673 | outputbytes += sizeof (idh);
|
---|
674 |
|
---|
675 | for (j = 0; j < HashSize; j++)
|
---|
676 | {
|
---|
677 | int i;
|
---|
678 | unsigned long wcnt, fcnt; /* [RPAP - Jan 97: Endian Ordering] */
|
---|
679 | hash_rec *ent = HashTable[j];
|
---|
680 | if (lastword != NULL)
|
---|
681 | /* look for prefix match with prev string */
|
---|
682 | i = prefixlen (lastword, ent->word);
|
---|
683 | else
|
---|
684 | i = 0;
|
---|
685 | fputc (i, sp);
|
---|
686 | fputc (ent->word[0] - i, sp);
|
---|
687 | fwrite ((char *) ent->word + i + 1, sizeof (u_char), ent->word[0] - i, sp);
|
---|
688 | outputbytes += ent->word[0] - i + 1;
|
---|
689 |
|
---|
690 | /* [RPAP - Jan 97: Endian Ordering] */
|
---|
691 | HTONUL2(ent->fcnt, fcnt);
|
---|
692 | fwrite ((char *) &(fcnt), sizeof (fcnt), 1, sp);
|
---|
693 |
|
---|
694 | outputbytes += sizeof (ent->fcnt);
|
---|
695 |
|
---|
696 | /* [RPAP - Jan 97: Endian Ordering] */
|
---|
697 | HTONUL2(ent->wcnt, wcnt);
|
---|
698 | fwrite ((char *) &(wcnt), sizeof (wcnt), 1, sp);
|
---|
699 |
|
---|
700 | outputbytes += sizeof (ent->wcnt);
|
---|
701 | lastword = ent->word;
|
---|
702 | }
|
---|
703 |
|
---|
704 | fclose (sp);
|
---|
705 | } /* write_stem_file() */
|
---|
706 |
|
---|
707 |
|
---|
708 |
|
---|
709 | /*
|
---|
710 | * write_codes():
|
---|
711 | * calls functions to assign and write out codes and
|
---|
712 | * then prints out stats about execution results.
|
---|
713 | *
|
---|
714 | * Accesses outside variables: z
|
---|
715 | *
|
---|
716 | * Return value...: z
|
---|
717 | */
|
---|
718 |
|
---|
719 | static void
|
---|
720 | write_codes (char *file_name)
|
---|
721 | {
|
---|
722 | unsigned long dicts = 0;
|
---|
723 | outputbytes = 0;
|
---|
724 | write_stem_file (file_name);
|
---|
725 | dicts = outputbytes;
|
---|
726 |
|
---|
727 | #ifndef SILENT
|
---|
728 | Message ("Chunks written : %d\n",
|
---|
729 | ChunksWritten);
|
---|
730 | if (InvfLevel == 3)
|
---|
731 | Message ("Paragraphs : %8d\n", callnum);
|
---|
732 | Message ("Peak memory usage : %10.1f Mb\n",
|
---|
733 | (double) MaxMemInUse / 1024.0 / 1024.0);
|
---|
734 | Message ("Stems size : %10.1f kB %5.2f%%\n",
|
---|
735 | (double) dicts / 1024, (double) 100 * dicts / inputbytes);
|
---|
736 | Message ("Lev 1 inverted file : %10.1f kB %5.2f%%\n",
|
---|
737 | (double) (L1_bits + L1_ohead) / 1024,
|
---|
738 | (double) 100 * (L1_bits + L1_ohead) / inputbytes);
|
---|
739 | Message ("Lev 2 inverted file : %10.1f kB %5.2f%%\n",
|
---|
740 | (double) (L2_bits + L2_ohead) / 1024,
|
---|
741 | (double) 100 * (L2_bits + L2_ohead) / inputbytes);
|
---|
742 | Message ("Lev 3 inverted file : %10.1f kB %5.2f%%\n",
|
---|
743 | (double) (L3_bits + L3_ohead) / 1024,
|
---|
744 | (double) 100 * (L3_bits + L3_ohead) / inputbytes);
|
---|
745 | Message ("Record addresses : %10.1f kB %5.2f%%\n",
|
---|
746 | (double) words_diff * 4 / 1024, (double) 100 * words_diff * 4 / inputbytes);
|
---|
747 | #endif
|
---|
748 | } /* write_codes() */
|
---|
749 |
|
---|
750 |
|
---|
751 |
|
---|
752 | void
|
---|
753 | write_num_file (char *file_name)
|
---|
754 | {
|
---|
755 |
|
---|
756 | int i;
|
---|
757 | FILE *f;
|
---|
758 |
|
---|
759 | for (i = 0; i < HashSize; i++)
|
---|
760 | HashTable[i]->fnum = i;
|
---|
761 |
|
---|
762 | if (!(f = create_file (file_name, INVF_CHUNK_TRANS_SUFFIX, "wb",
|
---|
763 | MAGIC_CHUNK_TRANS, MG_MESSAGE))) /* [RPAP - Feb 97: WIN32 Port] */
|
---|
764 | return;
|
---|
765 |
|
---|
766 | #if 1
|
---|
767 | ENCODE_START (f)
|
---|
768 |
|
---|
769 | for (i = 0; i < HashSize; i++)
|
---|
770 | BINARY_ENCODE (first_occr[i]->fnum + 1, HashSize + 1);
|
---|
771 |
|
---|
772 | ENCODE_DONE;
|
---|
773 | #else
|
---|
774 | for (i = 0; i < HashSize; i++)
|
---|
775 | {
|
---|
776 | /* [RPAP - Jan 97: Endian Ordering] */
|
---|
777 | unsigned long fnum;
|
---|
778 | HTONUL2(first_occur[i]->fnum, fnum);
|
---|
779 | fwrite ((char *) &fnum, sizeof (unsigned long), 1, f);
|
---|
780 | }
|
---|
781 | #endif
|
---|
782 |
|
---|
783 | fclose (f);
|
---|
784 | }
|
---|
785 |
|
---|
786 |
|
---|
787 |
|
---|
788 |
|
---|
789 | int
|
---|
790 | done_ivf_1 (char *FileName)
|
---|
791 | {
|
---|
792 | char *temp_str = msg_prefix;
|
---|
793 | msg_prefix = "ivf.pass1";
|
---|
794 | #ifndef SILENT
|
---|
795 | Message ("Mem reqd for 1 chunk: %8u bytes\n",
|
---|
796 | max_mem_reqd ());
|
---|
797 | #endif
|
---|
798 | if (lcallnum)
|
---|
799 | dump_dict (mem_reqd ());
|
---|
800 |
|
---|
801 | if (PackHashTable () == COMPERROR)
|
---|
802 | return COMPERROR;
|
---|
803 |
|
---|
804 | qsort (HashTable, HashUsed, sizeof (hash_rec *), ent_comp);
|
---|
805 |
|
---|
806 | count_text ();
|
---|
807 | write_codes (FileName);
|
---|
808 |
|
---|
809 | ENCODE_CONTINUE (sbs)
|
---|
810 | GAMMA_ENCODE (1);
|
---|
811 | ENCODE_DONE
|
---|
812 | fseek (ic, sizeof (long), 0);
|
---|
813 |
|
---|
814 | HTONSL(max_mem); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
815 | fwrite (&max_mem, sizeof (max_mem), 1, ic);
|
---|
816 | NTOHSL(max_mem); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
817 |
|
---|
818 | fclose (ic);
|
---|
819 |
|
---|
820 | write_num_file (FileName);
|
---|
821 |
|
---|
822 | msg_prefix = temp_str;
|
---|
823 |
|
---|
824 | return (COMPALLOK);
|
---|
825 | } /* done_encode */
|
---|