source: trunk/gsdl3/src/packages/mg/src/text/mg_stem_idx.c@ 3745

Last change on this file since 3745 was 3745, checked in by mdewsnip, 19 years ago

Addition of MG package for search and retrieval

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 23.9 KB
Line 
1/**************************************************************************
2 *
3 * mg_stem_idx.c -- Memory efficient stem index builder
4 * Copyright (C) 1997 Ross Peeters
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22#include "sysfuncs.h"
23
24#include "memlib.h"
25#include "messages.h"
26#include "filestats.h"
27#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
28
29#include "mg_files.h"
30#include "invf.h"
31#include "mg.h"
32#include "locallib.h"
33#include "backend.h" /* For struct stemmed_dict */
34#include "words.h"
35#include "stemmer.h"
36#include "hash.h"
37#include "local_strings.h"
38
/* One position record for a stemmed word: it locates a run of consecutive
 * dictionary words that all stem to the same string. */
struct PosEntry
{
  unsigned int num_cases;       /* number of consecutive words in the run */
  unsigned int blk;             /* dictionary block the run starts in */
  unsigned short blk_index;     /* lookback group inside that block */
  unsigned short offset;        /* word position inside the lookback group */
};
typedef struct PosEntry PosEntry;
47
48
49typedef struct PosList
50{
51 unsigned int list_size;
52 unsigned int num_entries;
53 PosEntry PE[1];
54}
55PosList;
56
57
58typedef struct idx_hash_rec
59{
60 u_char *word;
61 PosList *PL;
62}
63idx_hash_rec;
64
65
66#define POOL_SIZE 1024*1024
67#define HASH_POOL_SIZE 8192
68#define INITIAL_HASH_SIZE 7927
69
70
71static unsigned long MaxMemInUse = 0;
72static unsigned long MemInUse = 0;
73
74static idx_hash_rec **IdxHashTable;
75static unsigned long IdxHashSize;
76static unsigned long IdxHashUsed;
77static u_char *IdxPool;
78static int IdxPoolLeft;
79
80static idx_hash_rec *idx_hr_pool;
81static int idx_hr_PoolLeft;
82
83static idx_hash_rec **idx_first_occr;
84static int idx_max_first_occr;
85
86int block_size = 1024 * 4;
87
88int force = 0;
89
90static long lookback = 4;
91
92static void
93ChangeMem (int Change)
94{
95 MemInUse += Change;
96 if (MemInUse > MaxMemInUse)
97 MaxMemInUse = MemInUse;
98}
99
100
101/* =========================================================================
102 * Function: MakePosList
103 * Description:
104 * Input:
105 * Output:
106 * ========================================================================= */
107PosList *
108MakePosList (int n)
109{
110 PosList *pl;
111 int list_size = (n == 0 ? 1 : n); /* always allocate at least one node */
112
113 pl = Xmalloc (sizeof (PosList) + (list_size - 1) * sizeof (PosEntry));
114 if (!pl)
115 FatalError (1, "Unable to allocate term list");
116 ChangeMem (sizeof (PosList) + (list_size - 1) * sizeof (PosEntry));
117
118 pl->num_entries = n;
119 pl->list_size = list_size;
120
121 return pl;
122}
123
124/* =========================================================================
125 * Function: ResizePosList
126 * Description:
127 * Input:
128 * Output:
129 * ========================================================================= */
130
131#define GROWTH_FACTOR 2
132#define MIN_SIZE 2
133
134static void
135ResizePosList (PosList ** pos_list)
136{
137 PosList *pl = *pos_list;
138
139 ChangeMem (-(sizeof (PosList) + (pl->list_size - 1) * sizeof (PosEntry)));
140 if (pl->num_entries > pl->list_size)
141 {
142 if (pl->list_size)
143 pl->list_size *= GROWTH_FACTOR;
144 else
145 pl->list_size = MIN_SIZE;
146 }
147 pl = Xrealloc (pl, sizeof (PosList) + (pl->list_size - 1) * sizeof (PosEntry));
148
149 if (!pl)
150 FatalError (1, "Unable to resize pos list");
151 ChangeMem (sizeof (PosList) + (pl->list_size - 1) * sizeof (PosEntry));
152
153 *pos_list = pl;
154}
155
156
157/* =========================================================================
158 * Function: FreePosList
159 * Description:
160 * Input:
161 * Output:
162 * ========================================================================= */
163
164void
165FreePosList (PosList ** the_pl)
166{
167 PosList *pl = *the_pl;
168
169 ChangeMem (-(sizeof(PosList) + sizeof (PosEntry) * (pl->list_size - 1)));
170 Xfree (pl);
171
172 *the_pl = NULL;
173}
174
175
176/* =========================================================================
177 * Function: ResetPosList
178 * Description:
179 * Input:
180 * Output:
181 * ========================================================================= */
182
183void
184ResetPosList (PosList ** pl)
185{
186 if (*pl)
187 FreePosList (pl);
188 *pl = MakePosList (0);
189}
190
191/* =========================================================================
192 * Function: AddPosEntry
193 * Description:
194 * Input:
195 * Output:
196 * ========================================================================= */
197
198int
199AddPosEntry (PosList ** pos_list, PosEntry * pe)
200{
201 PosList *pl = *pos_list;
202
203 pl->num_entries++;
204 ResizePosList (pos_list);
205 pl = *pos_list;
206
207 /* copy the structure contents */
208 bcopy ((char *) pe, (char *) &(pl->PE[pl->num_entries - 1]), sizeof (PosEntry));
209
210 return pl->num_entries - 1;
211}
212
213
214/* Modified from stem_search.c */
215stemmed_dict *
216ReadStemDictBlk (File * stem_file)
217{
218 unsigned long i;
219 stemmed_dict *sd;
220 u_char *buffer;
221
222 if (!(sd = Xmalloc (sizeof (stemmed_dict))))
223 FatalError (1, "Could not allocate memory for stemmed dict");
224
225
226 sd->stem_file = stem_file;
227 sd->MemForStemDict = 0;
228
229 Fread (&sd->sdh, sizeof (sd->sdh), 1, stem_file);
230
231 /* [RPAP - Jan 97: Endian Ordering] */
232 NTOHUL(sd->sdh.lookback);
233 NTOHUL(sd->sdh.block_size);
234 NTOHUL(sd->sdh.num_blocks);
235 NTOHUL(sd->sdh.blocks_start);
236 NTOHUL(sd->sdh.index_chars);
237 NTOHUL(sd->sdh.num_of_docs);
238 NTOHUL(sd->sdh.static_num_of_docs);
239 NTOHUL(sd->sdh.num_of_words);
240 NTOHUL(sd->sdh.stemmer_num);
241 NTOHUL(sd->sdh.stem_method);
242 NTOHUL(sd->sdh.indexed);
243
244 if (!(buffer = Xmalloc (sd->sdh.index_chars)))
245 {
246 Xfree (sd);
247 FatalError (1, "Could not allocate memory for stemmed dict");
248 return (NULL);
249 };
250 sd->MemForStemDict += sd->sdh.index_chars;
251
252 if (!(sd->index = Xmalloc (sd->sdh.num_blocks * sizeof (*sd->index))))
253 {
254 Xfree (sd);
255 Xfree (buffer);
256 FatalError (1, "Could not allocate memory for stemmed dict");
257 return (NULL);
258 };
259 sd->MemForStemDict += sd->sdh.num_blocks * sizeof (*sd->index);
260
261 if (!(sd->pos = Xmalloc (sd->sdh.num_blocks * sizeof (*sd->pos))))
262 {
263 Xfree (sd);
264 Xfree (buffer);
265 Xfree (sd->index);
266 FatalError (1, "Could not allocate memory for stemmed dict");
267 return (NULL);
268 };
269 sd->MemForStemDict += sd->sdh.num_blocks * sizeof (*sd->pos);
270
271 if (!(sd->buffer = Xmalloc (sd->sdh.block_size * sizeof (*sd->buffer))))
272 {
273 Xfree (sd);
274 Xfree (buffer);
275 Xfree (sd->index);
276 Xfree (sd->buffer);
277 FatalError (1, "Could not allocate memory for stemmed dict");
278 return (NULL);
279 };
280 sd->MemForStemDict += sd->sdh.block_size * sizeof (*sd->buffer);
281
282 sd->active = -1;
283
284 for (i = 0; i < sd->sdh.num_blocks; i++)
285 {
286 register u_char len;
287 sd->index[i] = buffer;
288 len = Getc (stem_file);
289 *buffer++ = len;
290 Fread (buffer, sizeof (u_char), len, stem_file);
291 buffer += len;
292 Fread (&sd->pos[i], sizeof (*sd->pos), 1, stem_file);
293 NTOHUL(sd->pos[i]); /* [RPAP - Jan 97: Endian Ordering] */
294 }
295
296 return sd;
297}
298
299
300void
301mg_init_process ()
302{
303 /* Allocate memory for idx hash table */
304
305 if (!(IdxPool = Xmalloc (POOL_SIZE)))
306 FatalError (1, "Unable to allocate memory for idx pool");
307 IdxPoolLeft = POOL_SIZE;
308 ChangeMem (POOL_SIZE);
309
310 if (!(idx_hr_pool = Xmalloc (HASH_POOL_SIZE * sizeof (idx_hash_rec))))
311 FatalError (1, "Unable to allocate memory for idx pool");
312 idx_hr_PoolLeft = HASH_POOL_SIZE;
313 ChangeMem (HASH_POOL_SIZE * sizeof (idx_hash_rec));
314
315 IdxHashSize = INITIAL_HASH_SIZE;
316 IdxHashUsed = 0;
317 if (!(IdxHashTable = Xmalloc (sizeof (idx_hash_rec *) * IdxHashSize)))
318 FatalError (1, "Unable to allocate memory for idx table");
319 ChangeMem (sizeof (idx_hash_rec *) * IdxHashSize);
320 bzero ((char *) IdxHashTable, sizeof (idx_hash_rec *) * IdxHashSize);
321 idx_max_first_occr = 8192;
322 if (!(idx_first_occr = Xmalloc (sizeof (idx_hash_rec *) * idx_max_first_occr)))
323 FatalError (1, "Unable to allocate memory for idx_first_occur");
324 ChangeMem (sizeof (idx_hash_rec *) * idx_max_first_occr);
325}
326
327
328void
329PackIdxHashTable (void)
330{
331 int s, d;
332 for (s = d = 0; s < IdxHashSize; s++)
333 if (IdxHashTable[s])
334 IdxHashTable[d++] = IdxHashTable[s];
335 ChangeMem (-sizeof (idx_hash_rec *) * IdxHashSize);
336 ChangeMem (sizeof (idx_hash_rec *) * IdxHashUsed);
337 if (!(IdxHashTable = Xrealloc (IdxHashTable, sizeof (idx_hash_rec *) * IdxHashUsed)))
338 FatalError (1, "Out of memory");
339 IdxHashSize = IdxHashUsed;
340}
341
342
343void
344process_stem_dict (stemmed_dict * sd, int stem_method)
345{
346 int block;
347 short blk_index = -1;
348 int wordnum = -1;
349 PosEntry *prevPE = NULL;
350 idx_hash_rec *prevIdx = NULL;
351 u_char word[MAXSTEMLEN + 1];
352 u_char prev[MAXSTEMLEN + 1];
353
354 /* For each block in stem dict... */
355 for (block = 0; block < sd->sdh.num_blocks; block++)
356 {
357 register unsigned int res;
358 int num_indexes;
359 unsigned long *first_word, *last_invf_len;
360 unsigned short *num_words;
361 u_char *base;
362 unsigned short *index;
363
364 /* Read block into buffer */
365 Fseek (sd->stem_file, sd->pos[block] + sd->sdh.blocks_start, 0);
366 Fread (sd->buffer, sd->sdh.block_size, sizeof (u_char), sd->stem_file);
367 sd->active = sd->pos[block];
368
369 /* Move through block header */
370 first_word = (unsigned long *) (sd->buffer);
371 NTOHUL(*first_word); /* [RPAP - Jan 97: Endian Ordering] */
372 last_invf_len = (unsigned long *) (first_word + 1);
373 NTOHUL(*last_invf_len); /* [RPAP - Jan 97: Endian Ordering] */
374 num_words = (unsigned short *) (last_invf_len + 1);
375 NTOHUS(*num_words); /* [RPAP - Jan 97: Endian Ordering] */
376 index = num_words + 1;
377 num_indexes = ((*num_words - 1) / sd->sdh.lookback) + 1;
378
379 {
380 /* [RPAP - Jan 97: Endian Ordering] */
381 int i;
382 for (i = 0; i < num_indexes; i++)
383 NTOHUS(index[i]);
384 }
385
386 base = (u_char *) (index + num_indexes);
387 blk_index = -1;
388
389 /* For each word in block... */
390 for (res = 0; res < *num_words; res++)
391 {
392 unsigned copy, suff;
393 register idx_hash_rec *idx_ent = 0;
394
395 /* Update blk_index */
396 if (!(res % sd->sdh.lookback))
397 blk_index++;
398
399 copy = *base++;
400 suff = *base++;
401 bcopy ((char *) base, (char *) (prev + copy + 1), suff);
402 base += suff;
403 *prev = copy + suff;
404
405 /* Skip irrelevant word info */
406 base += sizeof (u_long);
407 base += sizeof (u_long);
408 base += sizeof (u_long);
409
410 /* Stem word */
411 bcopy ((char *) prev, (char *) word, *prev + 1);
412 stemmer (stem_method, sd->sdh.stemmer_num, word);
413
414 /* Check if word follows straight on from previous word */
415 if (prevIdx && !compare (word, prevIdx->word))
416 prevPE->num_cases++;
417
418 else
419 {
420 /* Search the idx hash table for word */
421 register unsigned long hashval, step;
422 register int hsize = IdxHashSize;
423 HASH (hashval, step, word, hsize);
424 for (;;)
425 {
426 register u_char *s1;
427 register u_char *s2;
428 register int len;
429 idx_ent = IdxHashTable[hashval];
430 if (!idx_ent)
431 {
432 /* Create a next entry in the hash table */
433 int len = *word + 1;
434 if (!idx_hr_PoolLeft)
435 {
436 if (!(idx_hr_pool = Xmalloc (HASH_POOL_SIZE *
437 sizeof (idx_hash_rec))))
438 FatalError (1, "Unable to allocate memory for pool");
439 idx_hr_PoolLeft = HASH_POOL_SIZE;
440 ChangeMem (HASH_POOL_SIZE * sizeof (idx_hash_rec));
441 }
442 idx_ent = idx_hr_pool++;
443 idx_hr_PoolLeft--;
444 if (len > IdxPoolLeft)
445 {
446 if (!(IdxPool = Xmalloc (POOL_SIZE)))
447 FatalError (1, "Unable to allocate memory for pool");
448 IdxPoolLeft = POOL_SIZE;
449 ChangeMem (POOL_SIZE);
450 }
451 wordnum++;
452
453 idx_ent->word = IdxPool;
454 idx_ent->PL = MakePosList (0);
455 {
456 PosEntry PE;
457 PE.num_cases = 1;
458 PE.blk = block;
459 PE.blk_index = (unsigned short) blk_index;
460 PE.offset = res % sd->sdh.lookback;
461 AddPosEntry (&(idx_ent->PL), &PE);
462 }
463 prevIdx = idx_ent;
464 prevPE = &(idx_ent->PL->PE[idx_ent->PL->num_entries - 1]);
465
466 bcopy ((char *) word, (char *) IdxPool, len);
467 IdxPool += len;
468 IdxPoolLeft -= len;
469 if (IdxHashUsed == idx_max_first_occr - 1)
470 {
471 ChangeMem (-sizeof (idx_hash_rec *) * idx_max_first_occr);
472 idx_max_first_occr *= 2;
473 if (!(idx_first_occr = Xrealloc (idx_first_occr, sizeof (idx_hash_rec *) *
474 idx_max_first_occr)))
475 FatalError (1, "Unable to allocate memory for idx_first_occr");
476 ChangeMem (sizeof (idx_hash_rec *) * idx_max_first_occr);
477 }
478 idx_first_occr[IdxHashUsed] = idx_ent;
479 IdxHashUsed++;
480 IdxHashTable[hashval] = idx_ent;
481 break;
482 }
483
484 /* Compare the words */
485 s1 = word;
486 s2 = idx_ent->word;
487 len = *s1 + 1;
488 for (; len; len--)
489 if (*s1++ != *s2++)
490 break;
491
492 if (len)
493 {
494 /* Entry is not the right one - move to next hash index */
495 hashval = (hashval + step);
496 if (hashval >= hsize)
497 hashval -= hsize;
498 }
499 else
500 {
501 /* Entry is correct - added PosEntry to word */
502 PosEntry PE;
503 PE.num_cases = 1;
504 PE.blk = block;
505 PE.blk_index = (unsigned short) blk_index;
506 PE.offset = res % sd->sdh.lookback;
507 AddPosEntry (&(idx_ent->PL), &PE);
508 prevIdx = idx_ent;
509 prevPE = &(idx_ent->PL->PE[idx_ent->PL->num_entries - 1]);
510 break;
511 }
512 }
513 }
514
515 if (IdxHashUsed >= IdxHashSize >> 1)
516 {
517 idx_hash_rec **ht;
518 unsigned long size;
519 unsigned long i;
520 size = prime (IdxHashSize * 2);
521 if (!(ht = Xmalloc (sizeof (idx_hash_rec *) * size)))
522 FatalError (1, "Unable to allocate memory for idx table");
523 bzero ((char *) ht, sizeof (idx_hash_rec *) * size);
524 ChangeMem (sizeof (idx_hash_rec *) * size);
525
526 for (i = 0; i < IdxHashSize; i++)
527 if (IdxHashTable[i])
528 {
529 register u_char *wptr;
530 idx_hash_rec *ent;
531 register unsigned long hashval, step;
532
533 wptr = IdxHashTable[i]->word;
534 HASH (hashval, step, wptr, size);
535 ent = ht[hashval];
536 while (ent)
537 {
538 hashval += step;
539 if (hashval >= size)
540 hashval -= size;
541 ent = ht[hashval];
542 }
543 ht[hashval] = IdxHashTable[i];
544 }
545 Xfree (IdxHashTable);
546 ChangeMem (-sizeof (idx_hash_rec *) * IdxHashSize);
547 IdxHashTable = ht;
548 IdxHashSize = size;
549 }
550
551 } /* end for each word */
552
553 } /* end for each block */
554
555}
556
557static int
558idx_comp (const void *A, const void *B)
559{
560 u_char *s1 = (*((idx_hash_rec **) A))->word;
561 u_char *s2 = (*((idx_hash_rec **) B))->word;
562 return (casecompare (s1, s2));
563}
564
565
566void
567save_idx (char * filename, int stem_method)
568{
569 char *FName;
570 unsigned long i, j, pos, First_word, num;
571 struct stem_idx_header sih;
572 u_char *buffer, *last_word = NULL;
573 unsigned short *pointers;
574 int buf_in_use;
575 unsigned short ptrs_in_use, word_num;
576 FILE *idbi = NULL, *tmp = NULL;
577
578 FName = make_name (filename, ".tmp", NULL);
579 if (!(tmp = fopen (FName, "w+b")))
580 FatalError (1, "Unable to open \"%s\".\n", FName);
581
582 /* Delete the file now */
583 unlink (FName);
584
585 /* Create appropriate stem index file */
586 switch (stem_method)
587 {
588 case (1):
589 {
590 idbi = create_file (filename, INVF_DICT_BLOCKED_1_SUFFIX, "wb", MAGIC_STEM_1,
591 MG_ABORT);
592 break;
593 }
594
595 case (2):
596 {
597 idbi = create_file (filename, INVF_DICT_BLOCKED_2_SUFFIX, "wb", MAGIC_STEM_2,
598 MG_ABORT);
599 break;
600 }
601 case (3):
602 {
603 idbi = create_file (filename, INVF_DICT_BLOCKED_3_SUFFIX, "wb", MAGIC_STEM_3,
604 MG_ABORT);
605 break;
606 }
607 }
608
609 if (!idbi)
610 FatalError (1, "Could NOT create .invf.blocked.%d file", stem_method);
611
612 PackIdxHashTable();
613 qsort (IdxHashTable, IdxHashUsed, sizeof (idx_hash_rec *), idx_comp);
614
615 sih.lookback = lookback;
616 sih.block_size = block_size;
617 sih.num_blocks = 0;
618 sih.blocks_start = 0;
619 sih.index_chars = 0;
620 sih.num_of_words = IdxHashUsed;
621
622 fwrite ((char *) &sih, sizeof (sih), 1, idbi);
623
624 if (!(buffer = Xmalloc (block_size + 512)))
625 FatalError (1, "Unable to allocate memory for \"buffer\"\n");
626 if (!(pointers = Xmalloc (block_size + 512)))
627 FatalError (1, "Unable to allocate memory for \"buffer\"\n");
628
629 buf_in_use = 0;
630 pos = 0;
631 word_num = 0;
632 ptrs_in_use = 0;
633 First_word = 0;
634
635 /* For each word in the hashtable... */
636 for (i = 0; i < IdxHashUsed; i++)
637 {
638 register unsigned long extra, copy, suff;
639 register struct idx_hash_rec *ent = IdxHashTable[i];
640
641 /* build a new word on top of prev */
642 if (last_word != NULL)
643 copy = prefixlen (last_word, ent->word);
644 else
645 copy = 0;
646 suff = *(ent->word) - copy;
647 last_word = ent->word;
648
649 if (word_num % sih.lookback == 0)
650 /* Will need copy chars to add + a pointer in index */
651 extra = copy + sizeof (*pointers);
652 else
653 extra = 0;
654 if ((ptrs_in_use + 1) * sizeof (*pointers) + sizeof (ptrs_in_use) + extra +
655 buf_in_use + sizeof (First_word) + suff + 1 + sizeof (ent->PL->num_entries) +
656 ent->PL->num_entries * sizeof (PosEntry) > block_size)
657 {
658 /* Dump buffer to tmp file */
659 int chunk;
660 HTONUL(First_word); /* [RPAP - Jan 97: Endian Ordering] */
661 HTONUS(word_num); /* [RPAP - Jan 97: Endian Ordering] */
662 fwrite (&First_word, sizeof (First_word), 1, tmp);
663 fwrite (&word_num, sizeof (word_num), 1, tmp);
664 fwrite (pointers, sizeof (*pointers), ptrs_in_use, tmp);
665 fwrite (buffer, sizeof (u_char), buf_in_use, tmp);
666 bzero ((char *) buffer, block_size);
667 chunk = buf_in_use + ptrs_in_use * sizeof (*pointers) +
668 sizeof (ptrs_in_use) + sizeof (First_word);
669 if (force && chunk < block_size)
670 {
671 fwrite (buffer, sizeof (u_char), block_size - chunk, tmp);
672 chunk = block_size;
673 }
674
675 pos += chunk;
676
677 buf_in_use = 0;
678 word_num = 0;
679 ptrs_in_use = 0;
680 sih.num_blocks++;
681
682 /* Check that entry will fit into new block */
683 if (sizeof (*pointers) + sizeof (ptrs_in_use) + extra + sizeof (First_word) +
684 suff + 1 + sizeof (ent->PL->num_entries) +
685 ent->PL->num_entries * sizeof (PosEntry) > block_size)
686 FatalError (1, "Block size to small");
687
688 }
689
690 if (word_num % sih.lookback == 0)
691 {
692 HTONUS2(buf_in_use, pointers[ptrs_in_use++]); /* [RPAP - Jan 97: Endian Ordering] */
693 suff += copy;
694 copy = 0;
695 }
696
697 /* Output Word information */
698 buffer[buf_in_use++] = copy;
699 buffer[buf_in_use++] = suff;
700 bcopy ((char *) (ent->word + copy + 1), (char *) (buffer + buf_in_use), suff);
701 buf_in_use += suff;
702 HTONUI(ent->PL->num_entries); /* [RPAP - Jan 97: Endian Ordering] */
703 bcopy ((char *) &(ent->PL->num_entries), (char *) (buffer + buf_in_use), sizeof (ent->PL->num_entries));
704 NTOHUI(ent->PL->num_entries); /* [RPAP - Jan 97: Endian Ordering] */
705 buf_in_use += sizeof (ent->PL->num_entries);
706
707 for (j = 0; j < ent->PL->num_entries; j++)
708 {
709 register PosEntry *pe = &(ent->PL->PE[j]);
710 HTONUI(pe->num_cases); /* [RPAP - Jan 97: Endian Ordering] */
711 bcopy ((char *) &(pe->num_cases), (char *) (buffer + buf_in_use), sizeof (pe->num_cases));
712 buf_in_use += sizeof (pe->num_cases);
713 HTONUI(pe->blk); /* [RPAP - Jan 97: Endian Ordering] */
714 bcopy ((char *) &(pe->blk), (char *) (buffer + buf_in_use), sizeof (pe->blk));
715 buf_in_use += sizeof (pe->blk);
716 HTONUS(pe->blk_index); /* [RPAP - Jan 97: Endian Ordering] */
717 bcopy ((char *) &(pe->blk_index), (char *) (buffer + buf_in_use), sizeof (pe->blk_index));
718 buf_in_use += sizeof (pe->blk_index);
719 HTONUS(pe->offset); /* [RPAP - Jan 97: Endian Ordering] */
720 bcopy ((char *) &(pe->offset), (char *) (buffer + buf_in_use), sizeof (pe->offset));
721 buf_in_use += sizeof (pe->offset);
722 }
723
724 if (buf_in_use + ptrs_in_use * sizeof (*pointers) +
725 sizeof (ptrs_in_use) > block_size)
726 FatalError (1, "Fatal Internal Error # 64209258\n");
727
728 if (word_num == 0)
729 {
730 /* Write word to main index */
731 fwrite (ent->word, sizeof (u_char), *(ent->word) + 1, idbi);
732 HTONUL(pos); /* [RPAP - Jan 97: Endian Ordering] */
733 fwrite (&pos, sizeof (pos), 1, idbi);
734 NTOHUL(pos); /* [RPAP - Jan 97: Endian Ordering] */
735 sih.index_chars += *(ent->word) + 1;
736 First_word = i;
737 }
738 word_num++;
739 } /* end for each word */
740
741 if (buf_in_use)
742 {
743 /* Write last buffer to tmp file */
744 int chunk;
745
746 /* [RPAP - Jan 97: Endian Ordering] */
747 HTONUL(First_word);
748 HTONUS(word_num);
749
750 fwrite (&First_word, sizeof (First_word), 1, tmp);
751 fwrite (&word_num, sizeof (word_num), 1, tmp);
752 fwrite (pointers, sizeof (*pointers), ptrs_in_use, tmp);
753 fwrite (buffer, sizeof (u_char), buf_in_use, tmp);
754 bzero ((char *) buffer, block_size);
755 chunk = buf_in_use + ptrs_in_use * sizeof (*pointers) +
756 sizeof (ptrs_in_use) + sizeof (First_word);
757 if (force && chunk < block_size)
758 {
759 fwrite (buffer, sizeof (u_char), block_size - chunk, tmp);
760 chunk = block_size;
761 }
762
763 sih.num_blocks++;
764 }
765
766 rewind (tmp);
767 sih.blocks_start = sih.index_chars + sizeof (u_long) + sizeof (sih) +
768 sih.num_blocks * sizeof (pos);
769 if (force)
770 {
771 int amount;
772 amount = sih.blocks_start % block_size;
773 if (amount != 0)
774 {
775 bzero ((char *) buffer, block_size);
776 fwrite (buffer, sizeof (u_char), block_size - amount, idbi);
777 sih.blocks_start += block_size - amount;
778 }
779 }
780
781 while ((num = fread (buffer, sizeof (u_char), block_size, tmp)) != 0)
782 fwrite (buffer, sizeof (u_char), num, idbi);
783 fclose (tmp);
784
785 /* skip over the magic number */
786 fseek (idbi, sizeof (u_long), 0);
787
788 /* [RPAP - Jan 97: Endian Ordering] */
789 HTONUL(sih.lookback);
790 HTONUL(sih.block_size);
791 HTONUL(sih.num_blocks);
792 HTONUL(sih.blocks_start);
793 HTONUL(sih.index_chars);
794 HTONUL(sih.num_of_words);
795
796 fwrite (&sih, sizeof (sih), 1, idbi);
797 fclose (idbi);
798
799#ifndef SILENT
800 Message ("Stem %d:\n", stem_method);
801 Message (" Block size : %10d\n", block_size);
802 Message (" Num_blocks : %10d\n", NTOHUL(sih.num_blocks)); /* [RPAP - Jan 97: Endian Ordering] */
803 Message (" Max mem used : %10.1f Mb\n", (double) MaxMemInUse / 1024.0 / 1024.0);
804 Message (" Num_of_words : %10d\n", NTOHUL(sih.num_of_words)); /* [RPAP - Jan 97: Endian Ordering] */
805#endif
806}
807
808
809void
810UpdateStemDict (char * filename, int stem_method)
811{
812 FILE *idb;
813 struct stem_dict_header sdh;
814
815 if (!(idb = open_file (filename, INVF_DICT_BLOCKED_SUFFIX, "r+b",
816 MAGIC_STEM, MG_CONTINUE)))
817 FatalError (1, "Could not update stemmed dict");
818
819 fread ((char *) &sdh, sizeof (sdh), 1, idb);
820 NTOHUL(sdh.indexed); /* [RPAP - Jan 97: Endian Ordering] */
821 sdh.indexed |= 1 << (stem_method - 1);
822 HTONUL(sdh.indexed);
823 fseek (idb, sizeof (u_long), 0);
824 fwrite ((char *) &sdh, sizeof (sdh), 1, idb);
825 fclose (idb);
826}
827
828
829
830/* Main */
831int main (int argc, char **argv)
832{
833 File *idb; /* File to .invf.dict.blocked */
834 char *filename = "";
835 stemmed_dict *sd; /* Stemmed dictionary */
836 int ch;
837 char path[512];
838 int stem_method = 0;
839
840 msg_prefix = argv[0];
841 opterr = 0;
842 while ((ch = getopt (argc, argv, "f:d:b:hFs:")) != -1)
843 switch (ch)
844 {
845 case 'f': /* input file */
846 filename = optarg;
847 break;
848 case 'd':
849 set_basepath (optarg);
850 break;
851 case 'b':
852 block_size = atoi (optarg);
853 break;
854 case 'F':
855 force = 1;
856 break;
857 case 's':
858 stem_method = atoi (optarg);
859 break;
860 case 'h':
861 case '?':
862 fprintf (stderr, "usage: %s [-d directory] "
863 "[-b num] [-F] [-h] -s 1|2|3 -f name\n", argv[0]);
864 exit (1);
865 }
866
867 if (stem_method < 1 || stem_method > 3)
868 FatalError (1, "Stem method must be 1, 2 or 3");
869
870 /* Open required stem dict file */
871 sprintf (path, FILE_NAME_FORMAT, get_basepath (), filename, INVF_DICT_BLOCKED_SUFFIX);
872 if (!(idb = Fopen (path, "rb", MAGIC_STEM)))
873 FatalError (1, "Unable to open \"%s\"", path);
874
875 /* Read in idb header and index to blocks */
876 if (!(sd = ReadStemDictBlk (idb)))
877 FatalError (1, "Could not read stemmed dictionary");
878
879 /* Process stemmed dictionary */
880 mg_init_process ();
881 process_stem_dict (sd, stem_method);
882 save_idx (filename, stem_method);
883
884 /* Close stemmed dict */
885 Fclose (idb);
886
887 /* Update stemmed dict */
888 UpdateStemDict (filename, stem_method);
889
890 return 0;
891}
892
Note: See TracBrowser for help on using the repository browser.