1 | /**************************************************************************
|
---|
2 | *
|
---|
3 | * mg_stem_idx.c -- Memory efficient stem index builder
|
---|
4 | * Copyright (C) 1997 Ross Peeters
|
---|
5 | *
|
---|
6 | * This program is free software; you can redistribute it and/or modify
|
---|
7 | * it under the terms of the GNU General Public License as published by
|
---|
8 | * the Free Software Foundation; either version 2 of the License, or
|
---|
9 | * (at your option) any later version.
|
---|
10 | *
|
---|
11 | * This program is distributed in the hope that it will be useful,
|
---|
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
14 | * GNU General Public License for more details.
|
---|
15 | *
|
---|
16 | * You should have received a copy of the GNU General Public License
|
---|
17 | * along with this program; if not, write to the Free Software
|
---|
18 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
19 | *
|
---|
20 | **************************************************************************/
|
---|
21 |
|
---|
22 | #include "sysfuncs.h"
|
---|
23 |
|
---|
24 | #include "memlib.h"
|
---|
25 | #include "messages.h"
|
---|
26 | #include "filestats.h"
|
---|
27 | #include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
|
---|
28 |
|
---|
29 | #include "mg_files.h"
|
---|
30 | #include "invf.h"
|
---|
31 | #include "mg.h"
|
---|
32 | #include "locallib.h"
|
---|
33 | #include "backend.h" /* For struct stemmed_dict */
|
---|
34 | #include "words.h"
|
---|
35 | #include "stemmer.h"
|
---|
36 | #include "hash.h"
|
---|
37 | #include "local_strings.h"
|
---|
38 |
|
---|
39 | typedef struct PosEntry
|
---|
40 | {
|
---|
41 | unsigned int num_cases;
|
---|
42 | unsigned int blk;
|
---|
43 | unsigned short blk_index;
|
---|
44 | unsigned short offset;
|
---|
45 | }
|
---|
46 | PosEntry;
|
---|
47 |
|
---|
48 |
|
---|
49 | typedef struct PosList
|
---|
50 | {
|
---|
51 | unsigned int list_size;
|
---|
52 | unsigned int num_entries;
|
---|
53 | PosEntry PE[1];
|
---|
54 | }
|
---|
55 | PosList;
|
---|
56 |
|
---|
57 |
|
---|
58 | typedef struct idx_hash_rec
|
---|
59 | {
|
---|
60 | u_char *word;
|
---|
61 | PosList *PL;
|
---|
62 | }
|
---|
63 | idx_hash_rec;
|
---|
64 |
|
---|
65 |
|
---|
66 | #define POOL_SIZE 1024*1024
|
---|
67 | #define HASH_POOL_SIZE 8192
|
---|
68 | #define INITIAL_HASH_SIZE 7927
|
---|
69 |
|
---|
70 |
|
---|
71 | static unsigned long MaxMemInUse = 0;
|
---|
72 | static unsigned long MemInUse = 0;
|
---|
73 |
|
---|
74 | static idx_hash_rec **IdxHashTable;
|
---|
75 | static unsigned long IdxHashSize;
|
---|
76 | static unsigned long IdxHashUsed;
|
---|
77 | static u_char *IdxPool;
|
---|
78 | static int IdxPoolLeft;
|
---|
79 |
|
---|
80 | static idx_hash_rec *idx_hr_pool;
|
---|
81 | static int idx_hr_PoolLeft;
|
---|
82 |
|
---|
83 | static idx_hash_rec **idx_first_occr;
|
---|
84 | static int idx_max_first_occr;
|
---|
85 |
|
---|
86 | int block_size = 1024 * 4;
|
---|
87 |
|
---|
88 | int force = 0;
|
---|
89 |
|
---|
90 | static long lookback = 4;
|
---|
91 |
|
---|
92 | static void
|
---|
93 | ChangeMem (int Change)
|
---|
94 | {
|
---|
95 | MemInUse += Change;
|
---|
96 | if (MemInUse > MaxMemInUse)
|
---|
97 | MaxMemInUse = MemInUse;
|
---|
98 | }
|
---|
99 |
|
---|
100 |
|
---|
101 | /* =========================================================================
|
---|
102 | * Function: MakePosList
|
---|
103 | * Description:
|
---|
104 | * Input:
|
---|
105 | * Output:
|
---|
106 | * ========================================================================= */
|
---|
107 | PosList *
|
---|
108 | MakePosList (int n)
|
---|
109 | {
|
---|
110 | PosList *pl;
|
---|
111 | int list_size = (n == 0 ? 1 : n); /* always allocate at least one node */
|
---|
112 |
|
---|
113 | pl = Xmalloc (sizeof (PosList) + (list_size - 1) * sizeof (PosEntry));
|
---|
114 | if (!pl)
|
---|
115 | FatalError (1, "Unable to allocate term list");
|
---|
116 | ChangeMem (sizeof (PosList) + (list_size - 1) * sizeof (PosEntry));
|
---|
117 |
|
---|
118 | pl->num_entries = n;
|
---|
119 | pl->list_size = list_size;
|
---|
120 |
|
---|
121 | return pl;
|
---|
122 | }
|
---|
123 |
|
---|
124 | /* =========================================================================
|
---|
125 | * Function: ResizePosList
|
---|
126 | * Description:
|
---|
127 | * Input:
|
---|
128 | * Output:
|
---|
129 | * ========================================================================= */
|
---|
130 |
|
---|
131 | #define GROWTH_FACTOR 2
|
---|
132 | #define MIN_SIZE 2
|
---|
133 |
|
---|
134 | static void
|
---|
135 | ResizePosList (PosList ** pos_list)
|
---|
136 | {
|
---|
137 | PosList *pl = *pos_list;
|
---|
138 |
|
---|
139 | ChangeMem (-(sizeof (PosList) + (pl->list_size - 1) * sizeof (PosEntry)));
|
---|
140 | if (pl->num_entries > pl->list_size)
|
---|
141 | {
|
---|
142 | if (pl->list_size)
|
---|
143 | pl->list_size *= GROWTH_FACTOR;
|
---|
144 | else
|
---|
145 | pl->list_size = MIN_SIZE;
|
---|
146 | }
|
---|
147 | pl = Xrealloc (pl, sizeof (PosList) + (pl->list_size - 1) * sizeof (PosEntry));
|
---|
148 |
|
---|
149 | if (!pl)
|
---|
150 | FatalError (1, "Unable to resize pos list");
|
---|
151 | ChangeMem (sizeof (PosList) + (pl->list_size - 1) * sizeof (PosEntry));
|
---|
152 |
|
---|
153 | *pos_list = pl;
|
---|
154 | }
|
---|
155 |
|
---|
156 |
|
---|
157 | /* =========================================================================
|
---|
158 | * Function: FreePosList
|
---|
159 | * Description:
|
---|
160 | * Input:
|
---|
161 | * Output:
|
---|
162 | * ========================================================================= */
|
---|
163 |
|
---|
164 | void
|
---|
165 | FreePosList (PosList ** the_pl)
|
---|
166 | {
|
---|
167 | PosList *pl = *the_pl;
|
---|
168 |
|
---|
169 | ChangeMem (-(sizeof(PosList) + sizeof (PosEntry) * (pl->list_size - 1)));
|
---|
170 | Xfree (pl);
|
---|
171 |
|
---|
172 | *the_pl = NULL;
|
---|
173 | }
|
---|
174 |
|
---|
175 |
|
---|
176 | /* =========================================================================
|
---|
177 | * Function: ResetPosList
|
---|
178 | * Description:
|
---|
179 | * Input:
|
---|
180 | * Output:
|
---|
181 | * ========================================================================= */
|
---|
182 |
|
---|
183 | void
|
---|
184 | ResetPosList (PosList ** pl)
|
---|
185 | {
|
---|
186 | if (*pl)
|
---|
187 | FreePosList (pl);
|
---|
188 | *pl = MakePosList (0);
|
---|
189 | }
|
---|
190 |
|
---|
191 | /* =========================================================================
|
---|
192 | * Function: AddPosEntry
|
---|
193 | * Description:
|
---|
194 | * Input:
|
---|
195 | * Output:
|
---|
196 | * ========================================================================= */
|
---|
197 |
|
---|
198 | int
|
---|
199 | AddPosEntry (PosList ** pos_list, PosEntry * pe)
|
---|
200 | {
|
---|
201 | PosList *pl = *pos_list;
|
---|
202 |
|
---|
203 | pl->num_entries++;
|
---|
204 | ResizePosList (pos_list);
|
---|
205 | pl = *pos_list;
|
---|
206 |
|
---|
207 | /* copy the structure contents */
|
---|
208 | bcopy ((char *) pe, (char *) &(pl->PE[pl->num_entries - 1]), sizeof (PosEntry));
|
---|
209 |
|
---|
210 | return pl->num_entries - 1;
|
---|
211 | }
|
---|
212 |
|
---|
213 |
|
---|
214 | /* Modified from stem_search.c */
|
---|
215 | stemmed_dict *
|
---|
216 | ReadStemDictBlk (File * stem_file)
|
---|
217 | {
|
---|
218 | unsigned long i;
|
---|
219 | stemmed_dict *sd;
|
---|
220 | u_char *buffer;
|
---|
221 |
|
---|
222 | if (!(sd = Xmalloc (sizeof (stemmed_dict))))
|
---|
223 | FatalError (1, "Could not allocate memory for stemmed dict");
|
---|
224 |
|
---|
225 |
|
---|
226 | sd->stem_file = stem_file;
|
---|
227 | sd->MemForStemDict = 0;
|
---|
228 |
|
---|
229 | Fread (&sd->sdh, sizeof (sd->sdh), 1, stem_file);
|
---|
230 |
|
---|
231 | /* [RPAP - Jan 97: Endian Ordering] */
|
---|
232 | NTOHUL(sd->sdh.lookback);
|
---|
233 | NTOHUL(sd->sdh.block_size);
|
---|
234 | NTOHUL(sd->sdh.num_blocks);
|
---|
235 | NTOHUL(sd->sdh.blocks_start);
|
---|
236 | NTOHUL(sd->sdh.index_chars);
|
---|
237 | NTOHUL(sd->sdh.num_of_docs);
|
---|
238 | NTOHUL(sd->sdh.static_num_of_docs);
|
---|
239 | NTOHUL(sd->sdh.num_of_words);
|
---|
240 | NTOHUL(sd->sdh.stemmer_num);
|
---|
241 | NTOHUL(sd->sdh.stem_method);
|
---|
242 | NTOHUL(sd->sdh.indexed);
|
---|
243 |
|
---|
244 | if (!(buffer = Xmalloc (sd->sdh.index_chars)))
|
---|
245 | {
|
---|
246 | Xfree (sd);
|
---|
247 | FatalError (1, "Could not allocate memory for stemmed dict");
|
---|
248 | return (NULL);
|
---|
249 | };
|
---|
250 | sd->MemForStemDict += sd->sdh.index_chars;
|
---|
251 |
|
---|
252 | if (!(sd->index = Xmalloc (sd->sdh.num_blocks * sizeof (*sd->index))))
|
---|
253 | {
|
---|
254 | Xfree (sd);
|
---|
255 | Xfree (buffer);
|
---|
256 | FatalError (1, "Could not allocate memory for stemmed dict");
|
---|
257 | return (NULL);
|
---|
258 | };
|
---|
259 | sd->MemForStemDict += sd->sdh.num_blocks * sizeof (*sd->index);
|
---|
260 |
|
---|
261 | if (!(sd->pos = Xmalloc (sd->sdh.num_blocks * sizeof (*sd->pos))))
|
---|
262 | {
|
---|
263 | Xfree (sd);
|
---|
264 | Xfree (buffer);
|
---|
265 | Xfree (sd->index);
|
---|
266 | FatalError (1, "Could not allocate memory for stemmed dict");
|
---|
267 | return (NULL);
|
---|
268 | };
|
---|
269 | sd->MemForStemDict += sd->sdh.num_blocks * sizeof (*sd->pos);
|
---|
270 |
|
---|
271 | if (!(sd->buffer = Xmalloc (sd->sdh.block_size * sizeof (*sd->buffer))))
|
---|
272 | {
|
---|
273 | Xfree (sd);
|
---|
274 | Xfree (buffer);
|
---|
275 | Xfree (sd->index);
|
---|
276 | Xfree (sd->buffer);
|
---|
277 | FatalError (1, "Could not allocate memory for stemmed dict");
|
---|
278 | return (NULL);
|
---|
279 | };
|
---|
280 | sd->MemForStemDict += sd->sdh.block_size * sizeof (*sd->buffer);
|
---|
281 |
|
---|
282 | sd->active = -1;
|
---|
283 |
|
---|
284 | for (i = 0; i < sd->sdh.num_blocks; i++)
|
---|
285 | {
|
---|
286 | register u_char len;
|
---|
287 | sd->index[i] = buffer;
|
---|
288 | len = Getc (stem_file);
|
---|
289 | *buffer++ = len;
|
---|
290 | Fread (buffer, sizeof (u_char), len, stem_file);
|
---|
291 | buffer += len;
|
---|
292 | Fread (&sd->pos[i], sizeof (*sd->pos), 1, stem_file);
|
---|
293 | NTOHUL(sd->pos[i]); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
294 | }
|
---|
295 |
|
---|
296 | return sd;
|
---|
297 | }
|
---|
298 |
|
---|
299 |
|
---|
300 | void
|
---|
301 | mg_init_process ()
|
---|
302 | {
|
---|
303 | /* Allocate memory for idx hash table */
|
---|
304 |
|
---|
305 | if (!(IdxPool = Xmalloc (POOL_SIZE)))
|
---|
306 | FatalError (1, "Unable to allocate memory for idx pool");
|
---|
307 | IdxPoolLeft = POOL_SIZE;
|
---|
308 | ChangeMem (POOL_SIZE);
|
---|
309 |
|
---|
310 | if (!(idx_hr_pool = Xmalloc (HASH_POOL_SIZE * sizeof (idx_hash_rec))))
|
---|
311 | FatalError (1, "Unable to allocate memory for idx pool");
|
---|
312 | idx_hr_PoolLeft = HASH_POOL_SIZE;
|
---|
313 | ChangeMem (HASH_POOL_SIZE * sizeof (idx_hash_rec));
|
---|
314 |
|
---|
315 | IdxHashSize = INITIAL_HASH_SIZE;
|
---|
316 | IdxHashUsed = 0;
|
---|
317 | if (!(IdxHashTable = Xmalloc (sizeof (idx_hash_rec *) * IdxHashSize)))
|
---|
318 | FatalError (1, "Unable to allocate memory for idx table");
|
---|
319 | ChangeMem (sizeof (idx_hash_rec *) * IdxHashSize);
|
---|
320 | bzero ((char *) IdxHashTable, sizeof (idx_hash_rec *) * IdxHashSize);
|
---|
321 | idx_max_first_occr = 8192;
|
---|
322 | if (!(idx_first_occr = Xmalloc (sizeof (idx_hash_rec *) * idx_max_first_occr)))
|
---|
323 | FatalError (1, "Unable to allocate memory for idx_first_occur");
|
---|
324 | ChangeMem (sizeof (idx_hash_rec *) * idx_max_first_occr);
|
---|
325 | }
|
---|
326 |
|
---|
327 |
|
---|
328 | void
|
---|
329 | PackIdxHashTable (void)
|
---|
330 | {
|
---|
331 | int s, d;
|
---|
332 | for (s = d = 0; s < IdxHashSize; s++)
|
---|
333 | if (IdxHashTable[s])
|
---|
334 | IdxHashTable[d++] = IdxHashTable[s];
|
---|
335 | ChangeMem (-sizeof (idx_hash_rec *) * IdxHashSize);
|
---|
336 | ChangeMem (sizeof (idx_hash_rec *) * IdxHashUsed);
|
---|
337 | if (!(IdxHashTable = Xrealloc (IdxHashTable, sizeof (idx_hash_rec *) * IdxHashUsed)))
|
---|
338 | FatalError (1, "Out of memory");
|
---|
339 | IdxHashSize = IdxHashUsed;
|
---|
340 | }
|
---|
341 |
|
---|
342 |
|
---|
343 | void
|
---|
344 | process_stem_dict (stemmed_dict * sd, int stem_method)
|
---|
345 | {
|
---|
346 | int block;
|
---|
347 | short blk_index = -1;
|
---|
348 | int wordnum = -1;
|
---|
349 | PosEntry *prevPE = NULL;
|
---|
350 | idx_hash_rec *prevIdx = NULL;
|
---|
351 | u_char word[MAXSTEMLEN + 1];
|
---|
352 | u_char prev[MAXSTEMLEN + 1];
|
---|
353 |
|
---|
354 | /* For each block in stem dict... */
|
---|
355 | for (block = 0; block < sd->sdh.num_blocks; block++)
|
---|
356 | {
|
---|
357 | register unsigned int res;
|
---|
358 | int num_indexes;
|
---|
359 | unsigned long *first_word, *last_invf_len;
|
---|
360 | unsigned short *num_words;
|
---|
361 | u_char *base;
|
---|
362 | unsigned short *index;
|
---|
363 |
|
---|
364 | /* Read block into buffer */
|
---|
365 | Fseek (sd->stem_file, sd->pos[block] + sd->sdh.blocks_start, 0);
|
---|
366 | Fread (sd->buffer, sd->sdh.block_size, sizeof (u_char), sd->stem_file);
|
---|
367 | sd->active = sd->pos[block];
|
---|
368 |
|
---|
369 | /* Move through block header */
|
---|
370 | first_word = (unsigned long *) (sd->buffer);
|
---|
371 | NTOHUL(*first_word); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
372 | last_invf_len = (unsigned long *) (first_word + 1);
|
---|
373 | NTOHUL(*last_invf_len); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
374 | num_words = (unsigned short *) (last_invf_len + 1);
|
---|
375 | NTOHUS(*num_words); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
376 | index = num_words + 1;
|
---|
377 | num_indexes = ((*num_words - 1) / sd->sdh.lookback) + 1;
|
---|
378 |
|
---|
379 | {
|
---|
380 | /* [RPAP - Jan 97: Endian Ordering] */
|
---|
381 | int i;
|
---|
382 | for (i = 0; i < num_indexes; i++)
|
---|
383 | NTOHUS(index[i]);
|
---|
384 | }
|
---|
385 |
|
---|
386 | base = (u_char *) (index + num_indexes);
|
---|
387 | blk_index = -1;
|
---|
388 |
|
---|
389 | /* For each word in block... */
|
---|
390 | for (res = 0; res < *num_words; res++)
|
---|
391 | {
|
---|
392 | unsigned copy, suff;
|
---|
393 | register idx_hash_rec *idx_ent = 0;
|
---|
394 |
|
---|
395 | /* Update blk_index */
|
---|
396 | if (!(res % sd->sdh.lookback))
|
---|
397 | blk_index++;
|
---|
398 |
|
---|
399 | copy = *base++;
|
---|
400 | suff = *base++;
|
---|
401 | bcopy ((char *) base, (char *) (prev + copy + 1), suff);
|
---|
402 | base += suff;
|
---|
403 | *prev = copy + suff;
|
---|
404 |
|
---|
405 | /* Skip irrelevant word info */
|
---|
406 | base += sizeof (u_long);
|
---|
407 | base += sizeof (u_long);
|
---|
408 | base += sizeof (u_long);
|
---|
409 |
|
---|
410 | /* Stem word */
|
---|
411 | bcopy ((char *) prev, (char *) word, *prev + 1);
|
---|
412 | stemmer (stem_method, sd->sdh.stemmer_num, word);
|
---|
413 |
|
---|
414 | /* Check if word follows straight on from previous word */
|
---|
415 | if (prevIdx && !compare (word, prevIdx->word))
|
---|
416 | prevPE->num_cases++;
|
---|
417 |
|
---|
418 | else
|
---|
419 | {
|
---|
420 | /* Search the idx hash table for word */
|
---|
421 | register unsigned long hashval, step;
|
---|
422 | register int hsize = IdxHashSize;
|
---|
423 | HASH (hashval, step, word, hsize);
|
---|
424 | for (;;)
|
---|
425 | {
|
---|
426 | register u_char *s1;
|
---|
427 | register u_char *s2;
|
---|
428 | register int len;
|
---|
429 | idx_ent = IdxHashTable[hashval];
|
---|
430 | if (!idx_ent)
|
---|
431 | {
|
---|
432 | /* Create a next entry in the hash table */
|
---|
433 | int len = *word + 1;
|
---|
434 | if (!idx_hr_PoolLeft)
|
---|
435 | {
|
---|
436 | if (!(idx_hr_pool = Xmalloc (HASH_POOL_SIZE *
|
---|
437 | sizeof (idx_hash_rec))))
|
---|
438 | FatalError (1, "Unable to allocate memory for pool");
|
---|
439 | idx_hr_PoolLeft = HASH_POOL_SIZE;
|
---|
440 | ChangeMem (HASH_POOL_SIZE * sizeof (idx_hash_rec));
|
---|
441 | }
|
---|
442 | idx_ent = idx_hr_pool++;
|
---|
443 | idx_hr_PoolLeft--;
|
---|
444 | if (len > IdxPoolLeft)
|
---|
445 | {
|
---|
446 | if (!(IdxPool = Xmalloc (POOL_SIZE)))
|
---|
447 | FatalError (1, "Unable to allocate memory for pool");
|
---|
448 | IdxPoolLeft = POOL_SIZE;
|
---|
449 | ChangeMem (POOL_SIZE);
|
---|
450 | }
|
---|
451 | wordnum++;
|
---|
452 |
|
---|
453 | idx_ent->word = IdxPool;
|
---|
454 | idx_ent->PL = MakePosList (0);
|
---|
455 | {
|
---|
456 | PosEntry PE;
|
---|
457 | PE.num_cases = 1;
|
---|
458 | PE.blk = block;
|
---|
459 | PE.blk_index = (unsigned short) blk_index;
|
---|
460 | PE.offset = res % sd->sdh.lookback;
|
---|
461 | AddPosEntry (&(idx_ent->PL), &PE);
|
---|
462 | }
|
---|
463 | prevIdx = idx_ent;
|
---|
464 | prevPE = &(idx_ent->PL->PE[idx_ent->PL->num_entries - 1]);
|
---|
465 |
|
---|
466 | bcopy ((char *) word, (char *) IdxPool, len);
|
---|
467 | IdxPool += len;
|
---|
468 | IdxPoolLeft -= len;
|
---|
469 | if (IdxHashUsed == idx_max_first_occr - 1)
|
---|
470 | {
|
---|
471 | ChangeMem (-sizeof (idx_hash_rec *) * idx_max_first_occr);
|
---|
472 | idx_max_first_occr *= 2;
|
---|
473 | if (!(idx_first_occr = Xrealloc (idx_first_occr, sizeof (idx_hash_rec *) *
|
---|
474 | idx_max_first_occr)))
|
---|
475 | FatalError (1, "Unable to allocate memory for idx_first_occr");
|
---|
476 | ChangeMem (sizeof (idx_hash_rec *) * idx_max_first_occr);
|
---|
477 | }
|
---|
478 | idx_first_occr[IdxHashUsed] = idx_ent;
|
---|
479 | IdxHashUsed++;
|
---|
480 | IdxHashTable[hashval] = idx_ent;
|
---|
481 | break;
|
---|
482 | }
|
---|
483 |
|
---|
484 | /* Compare the words */
|
---|
485 | s1 = word;
|
---|
486 | s2 = idx_ent->word;
|
---|
487 | len = *s1 + 1;
|
---|
488 | for (; len; len--)
|
---|
489 | if (*s1++ != *s2++)
|
---|
490 | break;
|
---|
491 |
|
---|
492 | if (len)
|
---|
493 | {
|
---|
494 | /* Entry is not the right one - move to next hash index */
|
---|
495 | hashval = (hashval + step);
|
---|
496 | if (hashval >= hsize)
|
---|
497 | hashval -= hsize;
|
---|
498 | }
|
---|
499 | else
|
---|
500 | {
|
---|
501 | /* Entry is correct - added PosEntry to word */
|
---|
502 | PosEntry PE;
|
---|
503 | PE.num_cases = 1;
|
---|
504 | PE.blk = block;
|
---|
505 | PE.blk_index = (unsigned short) blk_index;
|
---|
506 | PE.offset = res % sd->sdh.lookback;
|
---|
507 | AddPosEntry (&(idx_ent->PL), &PE);
|
---|
508 | prevIdx = idx_ent;
|
---|
509 | prevPE = &(idx_ent->PL->PE[idx_ent->PL->num_entries - 1]);
|
---|
510 | break;
|
---|
511 | }
|
---|
512 | }
|
---|
513 | }
|
---|
514 |
|
---|
515 | if (IdxHashUsed >= IdxHashSize >> 1)
|
---|
516 | {
|
---|
517 | idx_hash_rec **ht;
|
---|
518 | unsigned long size;
|
---|
519 | unsigned long i;
|
---|
520 | size = prime (IdxHashSize * 2);
|
---|
521 | if (!(ht = Xmalloc (sizeof (idx_hash_rec *) * size)))
|
---|
522 | FatalError (1, "Unable to allocate memory for idx table");
|
---|
523 | bzero ((char *) ht, sizeof (idx_hash_rec *) * size);
|
---|
524 | ChangeMem (sizeof (idx_hash_rec *) * size);
|
---|
525 |
|
---|
526 | for (i = 0; i < IdxHashSize; i++)
|
---|
527 | if (IdxHashTable[i])
|
---|
528 | {
|
---|
529 | register u_char *wptr;
|
---|
530 | idx_hash_rec *ent;
|
---|
531 | register unsigned long hashval, step;
|
---|
532 |
|
---|
533 | wptr = IdxHashTable[i]->word;
|
---|
534 | HASH (hashval, step, wptr, size);
|
---|
535 | ent = ht[hashval];
|
---|
536 | while (ent)
|
---|
537 | {
|
---|
538 | hashval += step;
|
---|
539 | if (hashval >= size)
|
---|
540 | hashval -= size;
|
---|
541 | ent = ht[hashval];
|
---|
542 | }
|
---|
543 | ht[hashval] = IdxHashTable[i];
|
---|
544 | }
|
---|
545 | Xfree (IdxHashTable);
|
---|
546 | ChangeMem (-sizeof (idx_hash_rec *) * IdxHashSize);
|
---|
547 | IdxHashTable = ht;
|
---|
548 | IdxHashSize = size;
|
---|
549 | }
|
---|
550 |
|
---|
551 | } /* end for each word */
|
---|
552 |
|
---|
553 | } /* end for each block */
|
---|
554 |
|
---|
555 | }
|
---|
556 |
|
---|
557 | static int
|
---|
558 | idx_comp (const void *A, const void *B)
|
---|
559 | {
|
---|
560 | u_char *s1 = (*((idx_hash_rec **) A))->word;
|
---|
561 | u_char *s2 = (*((idx_hash_rec **) B))->word;
|
---|
562 | return (casecompare (s1, s2));
|
---|
563 | }
|
---|
564 |
|
---|
565 |
|
---|
566 | void
|
---|
567 | save_idx (char * filename, int stem_method)
|
---|
568 | {
|
---|
569 | char *FName;
|
---|
570 | unsigned long i, j, pos, First_word, num;
|
---|
571 | struct stem_idx_header sih;
|
---|
572 | u_char *buffer, *last_word = NULL;
|
---|
573 | unsigned short *pointers;
|
---|
574 | int buf_in_use;
|
---|
575 | unsigned short ptrs_in_use, word_num;
|
---|
576 | FILE *idbi = NULL, *tmp = NULL;
|
---|
577 |
|
---|
578 | FName = make_name (filename, ".tmp", NULL);
|
---|
579 | if (!(tmp = fopen (FName, "w+b")))
|
---|
580 | FatalError (1, "Unable to open \"%s\".\n", FName);
|
---|
581 |
|
---|
582 | /* Delete the file now */
|
---|
583 | unlink (FName);
|
---|
584 |
|
---|
585 | /* Create appropriate stem index file */
|
---|
586 | switch (stem_method)
|
---|
587 | {
|
---|
588 | case (1):
|
---|
589 | {
|
---|
590 | idbi = create_file (filename, INVF_DICT_BLOCKED_1_SUFFIX, "wb", MAGIC_STEM_1,
|
---|
591 | MG_ABORT);
|
---|
592 | break;
|
---|
593 | }
|
---|
594 |
|
---|
595 | case (2):
|
---|
596 | {
|
---|
597 | idbi = create_file (filename, INVF_DICT_BLOCKED_2_SUFFIX, "wb", MAGIC_STEM_2,
|
---|
598 | MG_ABORT);
|
---|
599 | break;
|
---|
600 | }
|
---|
601 | case (3):
|
---|
602 | {
|
---|
603 | idbi = create_file (filename, INVF_DICT_BLOCKED_3_SUFFIX, "wb", MAGIC_STEM_3,
|
---|
604 | MG_ABORT);
|
---|
605 | break;
|
---|
606 | }
|
---|
607 | }
|
---|
608 |
|
---|
609 | if (!idbi)
|
---|
610 | FatalError (1, "Could NOT create .invf.blocked.%d file", stem_method);
|
---|
611 |
|
---|
612 | PackIdxHashTable();
|
---|
613 | qsort (IdxHashTable, IdxHashUsed, sizeof (idx_hash_rec *), idx_comp);
|
---|
614 |
|
---|
615 | sih.lookback = lookback;
|
---|
616 | sih.block_size = block_size;
|
---|
617 | sih.num_blocks = 0;
|
---|
618 | sih.blocks_start = 0;
|
---|
619 | sih.index_chars = 0;
|
---|
620 | sih.num_of_words = IdxHashUsed;
|
---|
621 |
|
---|
622 | fwrite ((char *) &sih, sizeof (sih), 1, idbi);
|
---|
623 |
|
---|
624 | if (!(buffer = Xmalloc (block_size + 512)))
|
---|
625 | FatalError (1, "Unable to allocate memory for \"buffer\"\n");
|
---|
626 | if (!(pointers = Xmalloc (block_size + 512)))
|
---|
627 | FatalError (1, "Unable to allocate memory for \"buffer\"\n");
|
---|
628 |
|
---|
629 | buf_in_use = 0;
|
---|
630 | pos = 0;
|
---|
631 | word_num = 0;
|
---|
632 | ptrs_in_use = 0;
|
---|
633 | First_word = 0;
|
---|
634 |
|
---|
635 | /* For each word in the hashtable... */
|
---|
636 | for (i = 0; i < IdxHashUsed; i++)
|
---|
637 | {
|
---|
638 | register unsigned long extra, copy, suff;
|
---|
639 | register struct idx_hash_rec *ent = IdxHashTable[i];
|
---|
640 |
|
---|
641 | /* build a new word on top of prev */
|
---|
642 | if (last_word != NULL)
|
---|
643 | copy = prefixlen (last_word, ent->word);
|
---|
644 | else
|
---|
645 | copy = 0;
|
---|
646 | suff = *(ent->word) - copy;
|
---|
647 | last_word = ent->word;
|
---|
648 |
|
---|
649 | if (word_num % sih.lookback == 0)
|
---|
650 | /* Will need copy chars to add + a pointer in index */
|
---|
651 | extra = copy + sizeof (*pointers);
|
---|
652 | else
|
---|
653 | extra = 0;
|
---|
654 | if ((ptrs_in_use + 1) * sizeof (*pointers) + sizeof (ptrs_in_use) + extra +
|
---|
655 | buf_in_use + sizeof (First_word) + suff + 1 + sizeof (ent->PL->num_entries) +
|
---|
656 | ent->PL->num_entries * sizeof (PosEntry) > block_size)
|
---|
657 | {
|
---|
658 | /* Dump buffer to tmp file */
|
---|
659 | int chunk;
|
---|
660 | HTONUL(First_word); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
661 | HTONUS(word_num); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
662 | fwrite (&First_word, sizeof (First_word), 1, tmp);
|
---|
663 | fwrite (&word_num, sizeof (word_num), 1, tmp);
|
---|
664 | fwrite (pointers, sizeof (*pointers), ptrs_in_use, tmp);
|
---|
665 | fwrite (buffer, sizeof (u_char), buf_in_use, tmp);
|
---|
666 | bzero ((char *) buffer, block_size);
|
---|
667 | chunk = buf_in_use + ptrs_in_use * sizeof (*pointers) +
|
---|
668 | sizeof (ptrs_in_use) + sizeof (First_word);
|
---|
669 | if (force && chunk < block_size)
|
---|
670 | {
|
---|
671 | fwrite (buffer, sizeof (u_char), block_size - chunk, tmp);
|
---|
672 | chunk = block_size;
|
---|
673 | }
|
---|
674 |
|
---|
675 | pos += chunk;
|
---|
676 |
|
---|
677 | buf_in_use = 0;
|
---|
678 | word_num = 0;
|
---|
679 | ptrs_in_use = 0;
|
---|
680 | sih.num_blocks++;
|
---|
681 |
|
---|
682 | /* Check that entry will fit into new block */
|
---|
683 | if (sizeof (*pointers) + sizeof (ptrs_in_use) + extra + sizeof (First_word) +
|
---|
684 | suff + 1 + sizeof (ent->PL->num_entries) +
|
---|
685 | ent->PL->num_entries * sizeof (PosEntry) > block_size)
|
---|
686 | FatalError (1, "Block size to small");
|
---|
687 |
|
---|
688 | }
|
---|
689 |
|
---|
690 | if (word_num % sih.lookback == 0)
|
---|
691 | {
|
---|
692 | HTONUS2(buf_in_use, pointers[ptrs_in_use++]); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
693 | suff += copy;
|
---|
694 | copy = 0;
|
---|
695 | }
|
---|
696 |
|
---|
697 | /* Output Word information */
|
---|
698 | buffer[buf_in_use++] = copy;
|
---|
699 | buffer[buf_in_use++] = suff;
|
---|
700 | bcopy ((char *) (ent->word + copy + 1), (char *) (buffer + buf_in_use), suff);
|
---|
701 | buf_in_use += suff;
|
---|
702 | HTONUI(ent->PL->num_entries); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
703 | bcopy ((char *) &(ent->PL->num_entries), (char *) (buffer + buf_in_use), sizeof (ent->PL->num_entries));
|
---|
704 | NTOHUI(ent->PL->num_entries); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
705 | buf_in_use += sizeof (ent->PL->num_entries);
|
---|
706 |
|
---|
707 | for (j = 0; j < ent->PL->num_entries; j++)
|
---|
708 | {
|
---|
709 | register PosEntry *pe = &(ent->PL->PE[j]);
|
---|
710 | HTONUI(pe->num_cases); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
711 | bcopy ((char *) &(pe->num_cases), (char *) (buffer + buf_in_use), sizeof (pe->num_cases));
|
---|
712 | buf_in_use += sizeof (pe->num_cases);
|
---|
713 | HTONUI(pe->blk); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
714 | bcopy ((char *) &(pe->blk), (char *) (buffer + buf_in_use), sizeof (pe->blk));
|
---|
715 | buf_in_use += sizeof (pe->blk);
|
---|
716 | HTONUS(pe->blk_index); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
717 | bcopy ((char *) &(pe->blk_index), (char *) (buffer + buf_in_use), sizeof (pe->blk_index));
|
---|
718 | buf_in_use += sizeof (pe->blk_index);
|
---|
719 | HTONUS(pe->offset); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
720 | bcopy ((char *) &(pe->offset), (char *) (buffer + buf_in_use), sizeof (pe->offset));
|
---|
721 | buf_in_use += sizeof (pe->offset);
|
---|
722 | }
|
---|
723 |
|
---|
724 | if (buf_in_use + ptrs_in_use * sizeof (*pointers) +
|
---|
725 | sizeof (ptrs_in_use) > block_size)
|
---|
726 | FatalError (1, "Fatal Internal Error # 64209258\n");
|
---|
727 |
|
---|
728 | if (word_num == 0)
|
---|
729 | {
|
---|
730 | /* Write word to main index */
|
---|
731 | fwrite (ent->word, sizeof (u_char), *(ent->word) + 1, idbi);
|
---|
732 | HTONUL(pos); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
733 | fwrite (&pos, sizeof (pos), 1, idbi);
|
---|
734 | NTOHUL(pos); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
735 | sih.index_chars += *(ent->word) + 1;
|
---|
736 | First_word = i;
|
---|
737 | }
|
---|
738 | word_num++;
|
---|
739 | } /* end for each word */
|
---|
740 |
|
---|
741 | if (buf_in_use)
|
---|
742 | {
|
---|
743 | /* Write last buffer to tmp file */
|
---|
744 | int chunk;
|
---|
745 |
|
---|
746 | /* [RPAP - Jan 97: Endian Ordering] */
|
---|
747 | HTONUL(First_word);
|
---|
748 | HTONUS(word_num);
|
---|
749 |
|
---|
750 | fwrite (&First_word, sizeof (First_word), 1, tmp);
|
---|
751 | fwrite (&word_num, sizeof (word_num), 1, tmp);
|
---|
752 | fwrite (pointers, sizeof (*pointers), ptrs_in_use, tmp);
|
---|
753 | fwrite (buffer, sizeof (u_char), buf_in_use, tmp);
|
---|
754 | bzero ((char *) buffer, block_size);
|
---|
755 | chunk = buf_in_use + ptrs_in_use * sizeof (*pointers) +
|
---|
756 | sizeof (ptrs_in_use) + sizeof (First_word);
|
---|
757 | if (force && chunk < block_size)
|
---|
758 | {
|
---|
759 | fwrite (buffer, sizeof (u_char), block_size - chunk, tmp);
|
---|
760 | chunk = block_size;
|
---|
761 | }
|
---|
762 |
|
---|
763 | sih.num_blocks++;
|
---|
764 | }
|
---|
765 |
|
---|
766 | rewind (tmp);
|
---|
767 | sih.blocks_start = sih.index_chars + sizeof (u_long) + sizeof (sih) +
|
---|
768 | sih.num_blocks * sizeof (pos);
|
---|
769 | if (force)
|
---|
770 | {
|
---|
771 | int amount;
|
---|
772 | amount = sih.blocks_start % block_size;
|
---|
773 | if (amount != 0)
|
---|
774 | {
|
---|
775 | bzero ((char *) buffer, block_size);
|
---|
776 | fwrite (buffer, sizeof (u_char), block_size - amount, idbi);
|
---|
777 | sih.blocks_start += block_size - amount;
|
---|
778 | }
|
---|
779 | }
|
---|
780 |
|
---|
781 | while ((num = fread (buffer, sizeof (u_char), block_size, tmp)) != 0)
|
---|
782 | fwrite (buffer, sizeof (u_char), num, idbi);
|
---|
783 | fclose (tmp);
|
---|
784 |
|
---|
785 | /* skip over the magic number */
|
---|
786 | fseek (idbi, sizeof (u_long), 0);
|
---|
787 |
|
---|
788 | /* [RPAP - Jan 97: Endian Ordering] */
|
---|
789 | HTONUL(sih.lookback);
|
---|
790 | HTONUL(sih.block_size);
|
---|
791 | HTONUL(sih.num_blocks);
|
---|
792 | HTONUL(sih.blocks_start);
|
---|
793 | HTONUL(sih.index_chars);
|
---|
794 | HTONUL(sih.num_of_words);
|
---|
795 |
|
---|
796 | fwrite (&sih, sizeof (sih), 1, idbi);
|
---|
797 | fclose (idbi);
|
---|
798 |
|
---|
799 | #ifndef SILENT
|
---|
800 | Message ("Stem %d:\n", stem_method);
|
---|
801 | Message (" Block size : %10d\n", block_size);
|
---|
802 | Message (" Num_blocks : %10d\n", NTOHUL(sih.num_blocks)); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
803 | Message (" Max mem used : %10.1f Mb\n", (double) MaxMemInUse / 1024.0 / 1024.0);
|
---|
804 | Message (" Num_of_words : %10d\n", NTOHUL(sih.num_of_words)); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
805 | #endif
|
---|
806 | }
|
---|
807 |
|
---|
808 |
|
---|
809 | void
|
---|
810 | UpdateStemDict (char * filename, int stem_method)
|
---|
811 | {
|
---|
812 | FILE *idb;
|
---|
813 | struct stem_dict_header sdh;
|
---|
814 |
|
---|
815 | if (!(idb = open_file (filename, INVF_DICT_BLOCKED_SUFFIX, "r+b",
|
---|
816 | MAGIC_STEM, MG_CONTINUE)))
|
---|
817 | FatalError (1, "Could not update stemmed dict");
|
---|
818 |
|
---|
819 | fread ((char *) &sdh, sizeof (sdh), 1, idb);
|
---|
820 | NTOHUL(sdh.indexed); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
821 | sdh.indexed |= 1 << (stem_method - 1);
|
---|
822 | HTONUL(sdh.indexed);
|
---|
823 | fseek (idb, sizeof (u_long), 0);
|
---|
824 | fwrite ((char *) &sdh, sizeof (sdh), 1, idb);
|
---|
825 | fclose (idb);
|
---|
826 | }
|
---|
827 |
|
---|
828 |
|
---|
829 |
|
---|
830 | /* Main */
|
---|
831 | int main (int argc, char **argv)
|
---|
832 | {
|
---|
833 | File *idb; /* File to .invf.dict.blocked */
|
---|
834 | char *filename = "";
|
---|
835 | stemmed_dict *sd; /* Stemmed dictionary */
|
---|
836 | int ch;
|
---|
837 | char path[512];
|
---|
838 | int stem_method = 0;
|
---|
839 |
|
---|
840 | msg_prefix = argv[0];
|
---|
841 | opterr = 0;
|
---|
842 | while ((ch = getopt (argc, argv, "f:d:b:hFs:")) != -1)
|
---|
843 | switch (ch)
|
---|
844 | {
|
---|
845 | case 'f': /* input file */
|
---|
846 | filename = optarg;
|
---|
847 | break;
|
---|
848 | case 'd':
|
---|
849 | set_basepath (optarg);
|
---|
850 | break;
|
---|
851 | case 'b':
|
---|
852 | block_size = atoi (optarg);
|
---|
853 | break;
|
---|
854 | case 'F':
|
---|
855 | force = 1;
|
---|
856 | break;
|
---|
857 | case 's':
|
---|
858 | stem_method = atoi (optarg);
|
---|
859 | break;
|
---|
860 | case 'h':
|
---|
861 | case '?':
|
---|
862 | fprintf (stderr, "usage: %s [-d directory] "
|
---|
863 | "[-b num] [-F] [-h] -s 1|2|3 -f name\n", argv[0]);
|
---|
864 | exit (1);
|
---|
865 | }
|
---|
866 |
|
---|
867 | if (stem_method < 1 || stem_method > 3)
|
---|
868 | FatalError (1, "Stem method must be 1, 2 or 3");
|
---|
869 |
|
---|
870 | /* Open required stem dict file */
|
---|
871 | sprintf (path, FILE_NAME_FORMAT, get_basepath (), filename, INVF_DICT_BLOCKED_SUFFIX);
|
---|
872 | if (!(idb = Fopen (path, "rb", MAGIC_STEM)))
|
---|
873 | FatalError (1, "Unable to open \"%s\"", path);
|
---|
874 |
|
---|
875 | /* Read in idb header and index to blocks */
|
---|
876 | if (!(sd = ReadStemDictBlk (idb)))
|
---|
877 | FatalError (1, "Could not read stemmed dictionary");
|
---|
878 |
|
---|
879 | /* Process stemmed dictionary */
|
---|
880 | mg_init_process ();
|
---|
881 | process_stem_dict (sd, stem_method);
|
---|
882 | save_idx (filename, stem_method);
|
---|
883 |
|
---|
884 | /* Close stemmed dict */
|
---|
885 | Fclose (idb);
|
---|
886 |
|
---|
887 | /* Update stemmed dict */
|
---|
888 | UpdateStemDict (filename, stem_method);
|
---|
889 |
|
---|
890 | return 0;
|
---|
891 | }
|
---|
892 |
|
---|