1 | /**************************************************************************
|
---|
2 | *
|
---|
3 | * ivf.pass1.c -- Memory efficient pass 1 inversion
|
---|
4 | * Copyright (C) 1994 Neil Sharman
|
---|
5 | *
|
---|
6 | * This program is free software; you can redistribute it and/or modify
|
---|
7 | * it under the terms of the GNU General Public License as published by
|
---|
8 | * the Free Software Foundation; either version 2 of the License, or
|
---|
9 | * (at your option) any later version.
|
---|
10 | *
|
---|
11 | * This program is distributed in the hope that it will be useful,
|
---|
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
14 | * GNU General Public License for more details.
|
---|
15 | *
|
---|
16 | * You should have received a copy of the GNU General Public License
|
---|
17 | * along with this program; if not, write to the Free Software
|
---|
18 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
19 | *
|
---|
20 | * $Id: ivf.pass1.c 439 1999-08-10 21:23:37Z sjboddie $
|
---|
21 | *
|
---|
22 | **************************************************************************/
|
---|
23 |
|
---|
24 | #include "sysfuncs.h"
|
---|
25 |
|
---|
26 | #include "memlib.h"
|
---|
27 | #include "messages.h"
|
---|
28 | #include "bitio_m.h"
|
---|
29 | #include "bitio_m_stdio.h"
|
---|
30 | #include "bitio_gen.h"
|
---|
31 | #include "local_strings.h"
|
---|
32 | #include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
|
---|
33 |
|
---|
34 | #include "mg_files.h"
|
---|
35 | #include "invf.h"
|
---|
36 | #include "mg.h"
|
---|
37 | #include "build.h"
|
---|
38 | #include "locallib.h"
|
---|
39 | #include "words.h"
|
---|
40 | #include "stemmer.h"
|
---|
41 | #include "hash.h"
|
---|
42 |
|
---|
43 |
|
---|
44 | /*
|
---|
45 | $Log$
|
---|
46 | Revision 1.1 1999/08/10 21:17:54 sjboddie
|
---|
47 | renamed mg-1.3d directory mg
|
---|
48 |
|
---|
49 | Revision 1.3 1998/12/17 09:12:50 rjmcnab
|
---|
50 |
|
---|
51 | Altered mg to process utf-8 encoded Unicode. The main changes
|
---|
52 | are in the parsing of the input, the casefolding, and the stemming.
|
---|
53 |
|
---|
54 | Revision 1.2 1998/11/25 07:55:43 rjmcnab
|
---|
55 |
|
---|
56 | Modified mg to that you can specify the stemmer you want
|
---|
57 | to use via a command line option. You specify it to
|
---|
58 | mg_passes during the build process. The number of the
|
---|
59 | stemmer that you used is stored within the inverted
|
---|
60 | dictionary header and the stemmed dictionary header so
|
---|
61 | the correct stemmer is used in later stages of building
|
---|
62 | and querying.
|
---|
63 |
|
---|
64 | Revision 1.1 1998/11/17 09:34:44 rjmcnab
|
---|
65 | *** empty log message ***
|
---|
66 |
|
---|
67 | * Revision 1.6 1995/01/16 03:57:05 tes
|
---|
68 | * Fixed bug for index_string_bytes which was calculated
|
---|
69 | * by adding the suffix lengths except for every lookback (block)
|
---|
70 | * number of words. This was incorrect as invf.dict is fully
|
---|
71 | * front coded - perhaps it was partially coded in the past.
|
---|
72 | * This would only affect a statistic - hence the reason why the
|
---|
73 | * bug could exist ;-)
|
---|
74 | *
|
---|
75 | * Revision 1.5 1994/11/29 00:31:59 tes
|
---|
76 | * Committing the new merged files and changes.
|
---|
77 | *
|
---|
78 | * Revision 1.3 1994/10/20 03:56:48 tes
|
---|
79 | * I have rewritten the boolean query optimiser and abstracted out the
|
---|
80 | * components of the boolean query.
|
---|
81 | *
|
---|
82 | * Revision 1.2 1994/09/20 04:41:34 tes
|
---|
83 | * For version 1.1
|
---|
84 | *
|
---|
85 | */
|
---|
86 |
|
---|
87 | static char *RCSID = "$Id: ivf.pass1.c 439 1999-08-10 21:23:37Z sjboddie $";
|
---|
88 |
|
---|
89 | #define LOGLOOKBACK 2
|
---|
90 | #define POOL_SIZE 1024*1024
|
---|
91 | #define HASH_POOL_SIZE 8192
|
---|
92 | #define INITIAL_HASH_SIZE 7927
|
---|
93 |
|
---|
94 | #define INIT_CHECK_FRAC 0.10
|
---|
95 | #define CHECK_FRAC 0.75
|
---|
96 | #define CHECK_CLOSE 0.999
|
---|
97 | #define PARA_DIV 1.5
|
---|
98 | #define NORM_DIV 1.0
|
---|
99 |
|
---|
100 |
|
---|
101 | typedef struct hash_rec
|
---|
102 | {
|
---|
103 | unsigned long fcnt; /* fragment frequency */
|
---|
104 | unsigned long lfcnt; /* local fragment frequency */
|
---|
105 | unsigned long fnum; /* last fragment to use stem */
|
---|
106 | unsigned long wcnt; /* stem frequency */
|
---|
107 | unsigned long lwcnt; /* local stem frequency */
|
---|
108 | u_char *word;
|
---|
109 | }
|
---|
110 | hash_rec;
|
---|
111 |
|
---|
112 |
|
---|
113 |
|
---|
114 |
|
---|
115 |
|
---|
116 |
|
---|
117 |
|
---|
118 |
|
---|
119 | static unsigned long words_read = 0, words_diff = 0, bytes_diff = 0;
|
---|
120 | static unsigned long outputbytes = 0;
|
---|
121 | static double inputbytes = 0; /* [RJM 07/97: 4G limit] */
|
---|
122 | static unsigned long MaxMemInUse = 0;
|
---|
123 | static unsigned long MemInUse = 0;
|
---|
124 | static unsigned long ChunksWritten = 0;
|
---|
125 |
|
---|
126 |
|
---|
127 | static FILE *ic; /* The invf block file */
|
---|
128 | static stdio_bitio_state sbs;
|
---|
129 |
|
---|
130 | static hash_rec **HashTable;
|
---|
131 | static unsigned long HashSize;
|
---|
132 | static unsigned long HashUsed;
|
---|
133 | static u_char *Pool;
|
---|
134 | static int PoolLeft;
|
---|
135 |
|
---|
136 | static hash_rec *hr_pool;
|
---|
137 | static int hr_PoolLeft;
|
---|
138 |
|
---|
139 | static hash_rec **first_occr;
|
---|
140 | static int max_first_occr;
|
---|
141 |
|
---|
142 | static unsigned long L1_bits = 0, L1_ohead = 0;
|
---|
143 | static unsigned long L2_bits = 0, L2_ohead = 0;
|
---|
144 | static unsigned long L3_bits = 0, L3_ohead = 0;
|
---|
145 | static unsigned long callnum = 0, lcallnum = 0, wordnum = 0, lwordnum = 0;
|
---|
146 | static unsigned long ptrcnt = 0;
|
---|
147 | static unsigned long checknum;
|
---|
148 | static long max_mem = 0;
|
---|
149 |
|
---|
150 |
|
---|
151 | static void
|
---|
152 | ChangeMem (int Change)
|
---|
153 | {
|
---|
154 | MemInUse += Change;
|
---|
155 | if (MemInUse > MaxMemInUse)
|
---|
156 | MaxMemInUse = MemInUse;
|
---|
157 | }
|
---|
158 |
|
---|
159 |
|
---|
160 |
|
---|
161 | int
|
---|
162 | init_ivf_1 (char *file_name)
|
---|
163 | {
|
---|
164 | if (!(ic = create_file (file_name, INVF_CHUNK_SUFFIX, "wb", MAGIC_CHUNK,
|
---|
165 | MG_MESSAGE))) /* [RPAP - Feb 97: WIN32 Port] */
|
---|
166 | return (COMPERROR);
|
---|
167 | fwrite (" ", sizeof (u_long), 1, ic); /* Space for the maxmem */
|
---|
168 | ENCODE_START (ic)
|
---|
169 | ENCODE_PAUSE (sbs)
|
---|
170 |
|
---|
171 | if (!(Pool = Xmalloc (POOL_SIZE)))
|
---|
172 | {
|
---|
173 | Message ("Unable to allocate memory for pool");
|
---|
174 | return (COMPERROR);
|
---|
175 | }
|
---|
176 | PoolLeft = POOL_SIZE;
|
---|
177 | ChangeMem (POOL_SIZE);
|
---|
178 |
|
---|
179 | if (!(hr_pool = Xmalloc (HASH_POOL_SIZE * sizeof (hash_rec))))
|
---|
180 | {
|
---|
181 | Message ("Unable to allocate memory for pool");
|
---|
182 | return (COMPERROR);
|
---|
183 | }
|
---|
184 | hr_PoolLeft = HASH_POOL_SIZE;
|
---|
185 | ChangeMem (HASH_POOL_SIZE * sizeof (hash_rec));
|
---|
186 |
|
---|
187 | HashSize = INITIAL_HASH_SIZE;
|
---|
188 | HashUsed = 0;
|
---|
189 | if (!(HashTable = Xmalloc (sizeof (hash_rec *) * HashSize)))
|
---|
190 | {
|
---|
191 | Message ("Unable to allocate memory for table");
|
---|
192 | return (COMPERROR);
|
---|
193 | }
|
---|
194 | ChangeMem (sizeof (hash_rec *) * HashSize);
|
---|
195 | bzero ((char *) HashTable, sizeof (hash_rec *) * HashSize);
|
---|
196 | max_first_occr = 8192;
|
---|
197 | if (!(first_occr = Xmalloc (sizeof (hash_rec *) * max_first_occr)))
|
---|
198 | {
|
---|
199 | Message ("Unable to allocate memory for first_occur");
|
---|
200 | return (COMPERROR);
|
---|
201 | }
|
---|
202 | ChangeMem (sizeof (hash_rec *) * max_first_occr);
|
---|
203 |
|
---|
204 | checknum = (invf_buffer_size * INIT_CHECK_FRAC) /
|
---|
205 | (InvfLevel == 3 ? PARA_DIV : NORM_DIV);
|
---|
206 |
|
---|
207 | return (COMPALLOK);
|
---|
208 | }
|
---|
209 |
|
---|
210 |
|
---|
211 |
|
---|
212 | static unsigned long
|
---|
213 | mem_reqd (void)
|
---|
214 | {
|
---|
215 | register int i;
|
---|
216 | register unsigned long total = 0;
|
---|
217 | /* register unsigned long N = InvfLevel == 3 ? lwordnum : lcallnum; */
|
---|
218 | register unsigned long N = lcallnum;
|
---|
219 | for (i = 0; i < HashUsed; i++)
|
---|
220 | {
|
---|
221 | register hash_rec *ent = first_occr[i];
|
---|
222 | /* register unsigned long p = InvfLevel == 3 ? ent->lwcnt : ent->lfcnt; */
|
---|
223 | register unsigned long p = ent->lfcnt;
|
---|
224 | if (p)
|
---|
225 | total += BIO_Bblock_Bound (N, p);
|
---|
226 | if (InvfLevel >= 2)
|
---|
227 | total += ent->lwcnt;
|
---|
228 | }
|
---|
229 |
|
---|
230 | total = (total + 7) >> 3;
|
---|
231 | return total;
|
---|
232 | }
|
---|
233 |
|
---|
234 | static unsigned long
|
---|
235 | max_mem_reqd (void)
|
---|
236 | {
|
---|
237 | register int i;
|
---|
238 | register unsigned long total = 0;
|
---|
239 | /* register unsigned long N = InvfLevel == 3 ? wordnum : callnum; */
|
---|
240 | register unsigned long N = callnum;
|
---|
241 | for (i = 0; i < HashUsed; i++)
|
---|
242 | {
|
---|
243 | register hash_rec *ent = first_occr[i];
|
---|
244 | /* register unsigned long p = InvfLevel == 3 ? ent->wcnt : ent->fcnt; */
|
---|
245 | register unsigned long p = ent->fcnt;
|
---|
246 | if (p)
|
---|
247 | total += BIO_Bblock_Bound (N, p);
|
---|
248 | if (InvfLevel >= 2)
|
---|
249 | total += ent->wcnt;
|
---|
250 | }
|
---|
251 |
|
---|
252 | total = (total + 7) >> 3;
|
---|
253 | return total;
|
---|
254 | }
|
---|
255 |
|
---|
256 |
|
---|
257 | static void
|
---|
258 | dump_dict (unsigned long mem)
|
---|
259 | {
|
---|
260 | int i;
|
---|
261 |
|
---|
262 | ChunksWritten++;
|
---|
263 |
|
---|
264 | ENCODE_CONTINUE (sbs)
|
---|
265 |
|
---|
266 | GAMMA_ENCODE (lcallnum + 1);
|
---|
267 | if (mem > max_mem)
|
---|
268 | max_mem = mem;
|
---|
269 | GAMMA_ENCODE (mem + 1);
|
---|
270 | lwordnum = lcallnum = 0;
|
---|
271 | GAMMA_ENCODE (HashUsed + 1);
|
---|
272 | for (i = 0; i < HashUsed; i++)
|
---|
273 | {
|
---|
274 | hash_rec *ent = first_occr[i];
|
---|
275 | GAMMA_ENCODE (ent->lwcnt + 1);
|
---|
276 | if (ent->lwcnt >= 2)
|
---|
277 | GAMMA_ENCODE (ent->lfcnt);
|
---|
278 | ent->lwcnt = ent->lfcnt = 0;
|
---|
279 | }
|
---|
280 | ptrcnt = 0;
|
---|
281 | ENCODE_PAUSE (sbs)
|
---|
282 | }
|
---|
283 |
|
---|
284 |
|
---|
285 |
|
---|
286 | static int
|
---|
287 | process_doc (u_char * s_in, int l_in)
|
---|
288 | {
|
---|
289 | u_char *end = s_in + l_in - 1;
|
---|
290 |
|
---|
291 | callnum++;
|
---|
292 | lcallnum++;
|
---|
293 | inputbytes += l_in;
|
---|
294 |
|
---|
295 | if (!inaword (s_in, end))
|
---|
296 | if (SkipSGML)
|
---|
297 | PARSE_NON_STEM_WORD_OR_SGML_TAG (s_in, end);
|
---|
298 | else
|
---|
299 | PARSE_NON_STEM_WORD (s_in, end);
|
---|
300 | /*
|
---|
301 | ** Alternately parse off words and non-words from the input
|
---|
302 | ** stream beginning with a non-word. Each token is then
|
---|
303 | ** inserted into the set if it does not exist or has it's
|
---|
304 | ** frequency count incremented if it does.
|
---|
305 | */
|
---|
306 | while (s_in <= end)
|
---|
307 | {
|
---|
308 | u_char Word[MAXSTEMLEN + 1];
|
---|
309 |
|
---|
310 | PARSE_STEM_WORD (Word, s_in, end);
|
---|
311 |
|
---|
312 | stemmer (stem_method, stemmer_num, Word);
|
---|
313 | if (SkipSGML)
|
---|
314 | PARSE_NON_STEM_WORD_OR_SGML_TAG (s_in, end);
|
---|
315 | else
|
---|
316 | PARSE_NON_STEM_WORD (s_in, end);
|
---|
317 |
|
---|
318 | words_read++;
|
---|
319 | wordnum++;
|
---|
320 | lwordnum++;
|
---|
321 | /* Search the hash table for Word */
|
---|
322 | {
|
---|
323 | register unsigned long hashval, step;
|
---|
324 | register int hsize = HashSize;
|
---|
325 | HASH (hashval, step, Word, hsize);
|
---|
326 | for (;;)
|
---|
327 | {
|
---|
328 | register u_char *s1;
|
---|
329 | register u_char *s2;
|
---|
330 | register int len;
|
---|
331 | register hash_rec *ent;
|
---|
332 | ent = HashTable[hashval];
|
---|
333 | if (!ent)
|
---|
334 | {
|
---|
335 | int len = *Word + 1;
|
---|
336 | if (!hr_PoolLeft)
|
---|
337 | {
|
---|
338 | if (!(hr_pool = Xmalloc (HASH_POOL_SIZE *
|
---|
339 | sizeof (hash_rec))))
|
---|
340 | {
|
---|
341 | Message ("Unable to allocate memory for pool");
|
---|
342 | return (COMPERROR);
|
---|
343 | }
|
---|
344 | hr_PoolLeft = HASH_POOL_SIZE;
|
---|
345 | ChangeMem (HASH_POOL_SIZE * sizeof (hash_rec));
|
---|
346 | }
|
---|
347 | ent = hr_pool++;
|
---|
348 | hr_PoolLeft--;
|
---|
349 | if (len > PoolLeft)
|
---|
350 | {
|
---|
351 | if (!(Pool = Xmalloc (POOL_SIZE)))
|
---|
352 | {
|
---|
353 | Message ("Unable to allocate memory for pool");
|
---|
354 | return (COMPERROR);
|
---|
355 | }
|
---|
356 | PoolLeft = POOL_SIZE;
|
---|
357 | ChangeMem (POOL_SIZE);
|
---|
358 | }
|
---|
359 | ent->wcnt = 1;
|
---|
360 | ent->fcnt = 1;
|
---|
361 | ent->lwcnt = 1;
|
---|
362 | ent->lfcnt = 1;
|
---|
363 | ent->fnum = callnum;
|
---|
364 | ent->word = Pool;
|
---|
365 | memcpy (Pool, Word, len);
|
---|
366 | Pool += len;
|
---|
367 | PoolLeft -= len;
|
---|
368 | if (HashUsed == max_first_occr - 1)
|
---|
369 | {
|
---|
370 | ChangeMem (-sizeof (hash_rec *) * max_first_occr);
|
---|
371 | max_first_occr *= 2;
|
---|
372 | if (!(first_occr = Xrealloc (first_occr, sizeof (hash_rec *) *
|
---|
373 | max_first_occr)))
|
---|
374 | {
|
---|
375 | Message ("Unable to allocate memory for first_occr");
|
---|
376 | return (COMPERROR);
|
---|
377 | }
|
---|
378 | ChangeMem (sizeof (hash_rec *) * max_first_occr);
|
---|
379 | }
|
---|
380 | first_occr[HashUsed] = ent;
|
---|
381 | HashUsed++;
|
---|
382 | HashTable[hashval] = ent;
|
---|
383 | bytes_diff += Word[0];
|
---|
384 | break;
|
---|
385 | }
|
---|
386 |
|
---|
387 | /* Compare the words */
|
---|
388 | s1 = Word;
|
---|
389 | s2 = ent->word;
|
---|
390 | len = *s1 + 1;
|
---|
391 | for (; len; len--)
|
---|
392 | if (*s1++ != *s2++)
|
---|
393 | break;
|
---|
394 |
|
---|
395 | if (len)
|
---|
396 | {
|
---|
397 | hashval = (hashval + step);
|
---|
398 | if (hashval >= hsize)
|
---|
399 | hashval -= hsize;
|
---|
400 | }
|
---|
401 | else
|
---|
402 | {
|
---|
403 | ent->wcnt++;
|
---|
404 | ent->lwcnt++;
|
---|
405 | if (callnum > ent->fnum)
|
---|
406 | {
|
---|
407 | ptrcnt++;
|
---|
408 | ent->fcnt++;
|
---|
409 | ent->lfcnt++;
|
---|
410 | ent->fnum = callnum;
|
---|
411 | }
|
---|
412 | break;
|
---|
413 | }
|
---|
414 | }
|
---|
415 | }
|
---|
416 |
|
---|
417 | if (HashUsed >= HashSize >> 1)
|
---|
418 | {
|
---|
419 | hash_rec **ht;
|
---|
420 | unsigned long size;
|
---|
421 | unsigned long i;
|
---|
422 | size = prime (HashSize * 2);
|
---|
423 | if (!(ht = Xmalloc (sizeof (hash_rec *) * size)))
|
---|
424 | {
|
---|
425 | Message ("Unable to allocate memory for table");
|
---|
426 | return (COMPERROR);
|
---|
427 | }
|
---|
428 | bzero ((char *) ht, sizeof (hash_rec *) * size);
|
---|
429 | ChangeMem (sizeof (hash_rec *) * size);
|
---|
430 |
|
---|
431 | for (i = 0; i < HashSize; i++)
|
---|
432 | if (HashTable[i])
|
---|
433 | {
|
---|
434 | register u_char *wptr;
|
---|
435 | hash_rec *ent;
|
---|
436 | register unsigned long hashval, step;
|
---|
437 |
|
---|
438 | wptr = HashTable[i]->word;
|
---|
439 | HASH (hashval, step, wptr, size);
|
---|
440 | ent = ht[hashval];
|
---|
441 | while (ent)
|
---|
442 | {
|
---|
443 | hashval += step;
|
---|
444 | if (hashval >= size)
|
---|
445 | hashval -= size;
|
---|
446 | ent = ht[hashval];
|
---|
447 | }
|
---|
448 | ht[hashval] = HashTable[i];
|
---|
449 | }
|
---|
450 | Xfree (HashTable);
|
---|
451 | ChangeMem (-sizeof (hash_rec *) * HashSize);
|
---|
452 | HashTable = ht;
|
---|
453 | HashSize = size;
|
---|
454 | }
|
---|
455 |
|
---|
456 | }
|
---|
457 |
|
---|
458 | if (ptrcnt >= checknum)
|
---|
459 | {
|
---|
460 | unsigned long mem;
|
---|
461 | /*fprintf(stderr, "Checking at %u . . . ", ptrcnt); */
|
---|
462 | mem = mem_reqd ();
|
---|
463 | if (mem >= invf_buffer_size * CHECK_CLOSE)
|
---|
464 | {
|
---|
465 | /*fprintf(stderr, "Got a match\n"); */
|
---|
466 | dump_dict (mem);
|
---|
467 | checknum = (invf_buffer_size * INIT_CHECK_FRAC) /
|
---|
468 | (InvfLevel == 3 ? PARA_DIV : NORM_DIV);
|
---|
469 | }
|
---|
470 | else
|
---|
471 | {
|
---|
472 | checknum = checknum * ((CHECK_FRAC * (invf_buffer_size - mem)) / mem) +
|
---|
473 | checknum;
|
---|
474 | if (checknum <= ptrcnt)
|
---|
475 | checknum = ptrcnt + 1;
|
---|
476 | /*fprintf(stderr, "Next check at %u\n", checknum); */
|
---|
477 | }
|
---|
478 | }
|
---|
479 | return (COMPALLOK);
|
---|
480 |
|
---|
481 | } /* encode */
|
---|
482 |
|
---|
483 |
|
---|
484 | int
|
---|
485 | process_ivf_1 (u_char * s_in, int l_in)
|
---|
486 | {
|
---|
487 | if (InvfLevel <= 2)
|
---|
488 | return process_doc (s_in, l_in);
|
---|
489 | else
|
---|
490 | {
|
---|
491 | int pos = 0;
|
---|
492 | u_char *start = s_in;
|
---|
493 | while (pos < l_in)
|
---|
494 | {
|
---|
495 | if (s_in[pos] == TERMPARAGRAPH)
|
---|
496 | {
|
---|
497 | int len = pos + s_in + 1 - start;
|
---|
498 | if (process_doc (start, len) != COMPALLOK)
|
---|
499 | return (COMPERROR);
|
---|
500 | start = s_in + pos + 1;
|
---|
501 | }
|
---|
502 | pos++;
|
---|
503 | }
|
---|
504 | if (start < s_in + pos)
|
---|
505 | return process_doc (start, pos + s_in - start);
|
---|
506 | }
|
---|
507 | return COMPALLOK;
|
---|
508 | }
|
---|
509 |
|
---|
510 | static int
|
---|
511 | PackHashTable (void)
|
---|
512 | {
|
---|
513 | int s, d;
|
---|
514 | for (s = d = 0; s < HashSize; s++)
|
---|
515 | if (HashTable[s])
|
---|
516 | HashTable[d++] = HashTable[s];
|
---|
517 | ChangeMem (-sizeof (hash_rec *) * HashSize);
|
---|
518 | ChangeMem (sizeof (hash_rec *) * HashUsed);
|
---|
519 | if (!(HashTable = Xrealloc (HashTable, sizeof (hash_rec *) * HashUsed)))
|
---|
520 | {
|
---|
521 | Message ("Out of memory");
|
---|
522 | return COMPERROR;
|
---|
523 | }
|
---|
524 | HashSize = HashUsed;
|
---|
525 | return COMPALLOK;
|
---|
526 | }
|
---|
527 |
|
---|
528 |
|
---|
529 |
|
---|
530 |
|
---|
531 |
|
---|
532 | static int
|
---|
533 | ent_comp (const void *A, const void *B)
|
---|
534 | {
|
---|
535 | u_char *s1 = (*((hash_rec **) A))->word;
|
---|
536 | u_char *s2 = (*((hash_rec **) B))->word;
|
---|
537 |
|
---|
538 | return (casecompare (s1, s2)); /* [RPAP - Jan 97: Stem Index Change] */
|
---|
539 | } /* stem_comp */
|
---|
540 |
|
---|
541 |
|
---|
542 |
|
---|
543 |
|
---|
544 |
|
---|
545 | /*
|
---|
546 | * void count_text()
|
---|
547 | *
|
---|
548 | * The maths used in this function is described in the paper "Coding for
|
---|
549 | * Compression in Full-Text Retrieval Systems"
|
---|
550 | *
|
---|
551 | */
|
---|
552 | static void
|
---|
553 | count_text ()
|
---|
554 | {
|
---|
555 | int i;
|
---|
556 | for (i = 0; i < HashUsed; i++)
|
---|
557 | {
|
---|
558 | hash_rec *wrd = HashTable[i];
|
---|
559 | /* estimate size of a level 1 inverted file */
|
---|
560 | L1_bits += (int) (0.99 + wrd->fcnt * (1.6 + log2 (1.0 * callnum / wrd->fcnt)));
|
---|
561 | L1_ohead += BIO_Gamma_Length (wrd->fcnt);
|
---|
562 |
|
---|
563 | L2_bits += BIO_Unary_Length (wrd->wcnt);
|
---|
564 | L2_ohead += BIO_Gamma_Length (wrd->wcnt - wrd->fcnt + 1);
|
---|
565 |
|
---|
566 | L3_bits += (int) (0.99 + wrd->wcnt *
|
---|
567 | (1.6 + log2 (1.0 * words_read / (wrd->wcnt + callnum))));
|
---|
568 | L3_ohead += 0;
|
---|
569 | }
|
---|
570 | L3_bits = (L3_bits + L2_bits + L1_bits + 7) / 8;
|
---|
571 | L3_ohead = (L3_ohead + L2_ohead + L1_ohead + 7) / 8;
|
---|
572 | L2_bits = (L2_bits + L1_bits + 7) / 8;
|
---|
573 | L2_ohead = (L2_ohead + L1_ohead + 7) / 8;
|
---|
574 | L1_bits = (L1_bits + 7) / 8;
|
---|
575 | L1_ohead = (L1_ohead + 7) / 8;
|
---|
576 | } /* count_text */
|
---|
577 |
|
---|
578 |
|
---|
579 |
|
---|
580 | /*
|
---|
581 | * write_stem_file():
|
---|
582 | * writes out the stemmed dictionary file
|
---|
583 | * in the following format
|
---|
584 | * lookback value (int)
|
---|
585 | * totalbytes value (int)
|
---|
586 | * indexstringbytes (int)
|
---|
587 | * for each word
|
---|
588 | * wordlen (4 bits)
|
---|
589 | * prefix match (4 bits)
|
---|
590 | * word (wordlen bytes)
|
---|
591 | * word frequency (int)
|
---|
592 | * word count (int)
|
---|
593 | *
|
---|
594 | * Accesses outside variables:
|
---|
595 | *
|
---|
596 | * Return value...:
|
---|
597 | */
|
---|
598 |
|
---|
599 | static void
|
---|
600 | write_stem_file (char *file_name)
|
---|
601 | {
|
---|
602 | long j;
|
---|
603 | struct invf_dict_header idh;
|
---|
604 | long lookback = (1 << LOGLOOKBACK); /* ???? */
|
---|
605 | long totalbytes = 0; /* The sum of the length of all words, including
|
---|
606 | the length byte */
|
---|
607 | long indexstringbytes = 0; /* The amount of space required to store the
|
---|
608 | words in the diction, this takes into account
|
---|
609 | the prefixes */
|
---|
610 | u_char *lastword = NULL; /* A pointer to the last word processed */
|
---|
611 | FILE *sp;
|
---|
612 |
|
---|
613 |
|
---|
614 | /* Calculate the size of certain things */
|
---|
615 | for (j = 0; j < HashSize; j++)
|
---|
616 | {
|
---|
617 | u_char *word = HashTable[j]->word;
|
---|
618 | indexstringbytes += word[0] + 2;
|
---|
619 | totalbytes += word[0] + 1;
|
---|
620 | if (lastword)
|
---|
621 | indexstringbytes -= prefixlen (lastword, word);
|
---|
622 | lastword = word;
|
---|
623 | }
|
---|
624 |
|
---|
625 | lastword = NULL;
|
---|
626 |
|
---|
627 | if (!(sp = create_file (file_name, INVF_DICT_SUFFIX, "wb", MAGIC_STEM_BUILD,
|
---|
628 | MG_MESSAGE))) /* [RPAP - Feb 97: WIN32 Port] */
|
---|
629 | return;
|
---|
630 |
|
---|
631 | /* [RPAP - Jan 97: Endian Ordering] */
|
---|
632 | HTONUL2(lookback, idh.lookback);
|
---|
633 | HTONUL2(HashSize, idh.dict_size);
|
---|
634 | HTONUL2(totalbytes, idh.total_bytes);
|
---|
635 | HTONUL2(indexstringbytes, idh.index_string_bytes);
|
---|
636 | HTOND2(inputbytes, idh.input_bytes); /* [RJM 07/97: 4G limit] */
|
---|
637 | HTONUL2(callnum, idh.num_of_docs);
|
---|
638 | HTONUL2(callnum, idh.static_num_of_docs);
|
---|
639 | HTONUL2(words_read, idh.num_of_words);
|
---|
640 | HTONUL2(stemmer_num, idh.stemmer_num);
|
---|
641 | HTONUL2(stem_method, idh.stem_method);
|
---|
642 |
|
---|
643 | fwrite ((char *) &idh, sizeof (idh), 1, sp);
|
---|
644 | outputbytes += sizeof (idh);
|
---|
645 |
|
---|
646 | for (j = 0; j < HashSize; j++)
|
---|
647 | {
|
---|
648 | int i;
|
---|
649 | unsigned long wcnt, fcnt; /* [RPAP - Jan 97: Endian Ordering] */
|
---|
650 | hash_rec *ent = HashTable[j];
|
---|
651 | if (lastword != NULL)
|
---|
652 | /* look for prefix match with prev string */
|
---|
653 | i = prefixlen (lastword, ent->word);
|
---|
654 | else
|
---|
655 | i = 0;
|
---|
656 | fputc (i, sp);
|
---|
657 | fputc (ent->word[0] - i, sp);
|
---|
658 | fwrite ((char *) ent->word + i + 1, sizeof (u_char), ent->word[0] - i, sp);
|
---|
659 | outputbytes += ent->word[0] - i + 1;
|
---|
660 |
|
---|
661 | /* [RPAP - Jan 97: Endian Ordering] */
|
---|
662 | HTONUL2(ent->fcnt, fcnt);
|
---|
663 | fwrite ((char *) &(fcnt), sizeof (fcnt), 1, sp);
|
---|
664 |
|
---|
665 | outputbytes += sizeof (ent->fcnt);
|
---|
666 |
|
---|
667 | /* [RPAP - Jan 97: Endian Ordering] */
|
---|
668 | HTONUL2(ent->wcnt, wcnt);
|
---|
669 | fwrite ((char *) &(wcnt), sizeof (wcnt), 1, sp);
|
---|
670 |
|
---|
671 | outputbytes += sizeof (ent->wcnt);
|
---|
672 | lastword = ent->word;
|
---|
673 | }
|
---|
674 |
|
---|
675 | fclose (sp);
|
---|
676 | } /* write_stem_file() */
|
---|
677 |
|
---|
678 |
|
---|
679 |
|
---|
680 | /*
|
---|
681 | * write_codes():
|
---|
682 | * calls functions to assign and write out codes and
|
---|
683 | * then prints out stats about execution results.
|
---|
684 | *
|
---|
685 | * Accesses outside variables: z
|
---|
686 | *
|
---|
687 | * Return value...: z
|
---|
688 | */
|
---|
689 |
|
---|
690 | static void
|
---|
691 | write_codes (char *file_name)
|
---|
692 | {
|
---|
693 | unsigned long dicts = 0;
|
---|
694 | outputbytes = 0;
|
---|
695 | write_stem_file (file_name);
|
---|
696 | dicts = outputbytes;
|
---|
697 |
|
---|
698 | #ifndef SILENT
|
---|
699 | Message ("Chunks written : %d\n",
|
---|
700 | ChunksWritten);
|
---|
701 | if (InvfLevel == 3)
|
---|
702 | Message ("Paragraphs : %8d\n", callnum);
|
---|
703 | Message ("Peak memory usage : %10.1f Mb\n",
|
---|
704 | (double) MaxMemInUse / 1024.0 / 1024.0);
|
---|
705 | Message ("Stems size : %10.1f kB %5.2f%%\n",
|
---|
706 | (double) dicts / 1024, (double) 100 * dicts / inputbytes);
|
---|
707 | Message ("Lev 1 inverted file : %10.1f kB %5.2f%%\n",
|
---|
708 | (double) (L1_bits + L1_ohead) / 1024,
|
---|
709 | (double) 100 * (L1_bits + L1_ohead) / inputbytes);
|
---|
710 | Message ("Lev 2 inverted file : %10.1f kB %5.2f%%\n",
|
---|
711 | (double) (L2_bits + L2_ohead) / 1024,
|
---|
712 | (double) 100 * (L2_bits + L2_ohead) / inputbytes);
|
---|
713 | Message ("Lev 3 inverted file : %10.1f kB %5.2f%%\n",
|
---|
714 | (double) (L3_bits + L3_ohead) / 1024,
|
---|
715 | (double) 100 * (L3_bits + L3_ohead) / inputbytes);
|
---|
716 | Message ("Record addresses : %10.1f kB %5.2f%%\n",
|
---|
717 | (double) words_diff * 4 / 1024, (double) 100 * words_diff * 4 / inputbytes);
|
---|
718 | #endif
|
---|
719 | } /* write_codes() */
|
---|
720 |
|
---|
721 |
|
---|
722 |
|
---|
723 | void
|
---|
724 | write_num_file (char *file_name)
|
---|
725 | {
|
---|
726 |
|
---|
727 | int i;
|
---|
728 | FILE *f;
|
---|
729 |
|
---|
730 | for (i = 0; i < HashSize; i++)
|
---|
731 | HashTable[i]->fnum = i;
|
---|
732 |
|
---|
733 | if (!(f = create_file (file_name, INVF_CHUNK_TRANS_SUFFIX, "wb",
|
---|
734 | MAGIC_CHUNK_TRANS, MG_MESSAGE))) /* [RPAP - Feb 97: WIN32 Port] */
|
---|
735 | return;
|
---|
736 |
|
---|
737 | #if 1
|
---|
738 | ENCODE_START (f)
|
---|
739 |
|
---|
740 | for (i = 0; i < HashSize; i++)
|
---|
741 | BINARY_ENCODE (first_occr[i]->fnum + 1, HashSize + 1);
|
---|
742 |
|
---|
743 | ENCODE_DONE;
|
---|
744 | #else
|
---|
745 | for (i = 0; i < HashSize; i++)
|
---|
746 | {
|
---|
747 | /* [RPAP - Jan 97: Endian Ordering] */
|
---|
748 | unsigned long fnum;
|
---|
749 | HTONUL2(first_occur[i]->fnum, fnum);
|
---|
750 | fwrite ((char *) &fnum, sizeof (unsigned long), 1, f);
|
---|
751 | }
|
---|
752 | #endif
|
---|
753 |
|
---|
754 | fclose (f);
|
---|
755 | }
|
---|
756 |
|
---|
757 |
|
---|
758 |
|
---|
759 |
|
---|
760 | int
|
---|
761 | done_ivf_1 (char *FileName)
|
---|
762 | {
|
---|
763 | char *temp_str = msg_prefix;
|
---|
764 | msg_prefix = "ivf.pass1";
|
---|
765 | #ifndef SILENT
|
---|
766 | Message ("Mem reqd for 1 chunk: %8u bytes\n",
|
---|
767 | max_mem_reqd ());
|
---|
768 | #endif
|
---|
769 | if (lcallnum)
|
---|
770 | dump_dict (mem_reqd ());
|
---|
771 |
|
---|
772 | if (PackHashTable () == COMPERROR)
|
---|
773 | return COMPERROR;
|
---|
774 |
|
---|
775 | qsort (HashTable, HashUsed, sizeof (hash_rec *), ent_comp);
|
---|
776 |
|
---|
777 | count_text ();
|
---|
778 | write_codes (FileName);
|
---|
779 |
|
---|
780 | ENCODE_CONTINUE (sbs)
|
---|
781 | GAMMA_ENCODE (1);
|
---|
782 | ENCODE_DONE
|
---|
783 | fseek (ic, sizeof (long), 0);
|
---|
784 |
|
---|
785 | HTONSL(max_mem); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
786 | fwrite (&max_mem, sizeof (max_mem), 1, ic);
|
---|
787 | NTOHSL(max_mem); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
788 |
|
---|
789 | fclose (ic);
|
---|
790 |
|
---|
791 | write_num_file (FileName);
|
---|
792 |
|
---|
793 | msg_prefix = temp_str;
|
---|
794 |
|
---|
795 | return (COMPALLOK);
|
---|
796 | } /* done_encode */
|
---|