source: trunk/gsdl/packages/mg-1.3d/src/text/mg_stem_idx.c@ 30

Last change on this file since 30 was 13, checked in by rjmcnab, 26 years ago

* empty log message *

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 23.9 KB
Line 
1/**************************************************************************
2 *
3 * mg_stem_idx.c -- Memory efficient stem index builder
4 * Copyright (C) 1997 Ross Peeters
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22#include "sysfuncs.h"
23
24#include "memlib.h"
25#include "messages.h"
26#include "filestats.h"
27#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
28
29#include "mg_files.h"
30#include "invf.h"
31#include "mg.h"
32#include "locallib.h"
33#include "backend.h" /* For struct stemmed_dict */
34#include "words.h"
35#include "stemmer.h"
36#include "hash.h"
37#include "local_strings.h"
38
/* One position record for a stem: it locates the first word of a run of
 * consecutive dictionary words that all reduce to the same stem. */
typedef struct PosEntry {
  unsigned int num_cases;     /* number of consecutive words in the run */
  unsigned int blk;           /* dictionary block holding the first word */
  unsigned short blk_index;   /* lookback group of that word within the block */
  unsigned short offset;      /* position of the word inside its lookback group */
} PosEntry;
47
48
49typedef struct PosList
50{
51 unsigned int list_size;
52 unsigned int num_entries;
53 PosEntry PE[1];
54}
55PosList;
56
57
58typedef struct idx_hash_rec
59{
60 u_char *word;
61 PosList *PL;
62}
63idx_hash_rec;
64
65
/* Allocation sizes used while building the stem index.  POOL_SIZE is
 * parenthesised so that it expands correctly inside larger expressions. */
#define POOL_SIZE (1024*1024)     /* bytes per stem string pool */
#define HASH_POOL_SIZE 8192       /* idx_hash_rec records per record pool */
#define INITIAL_HASH_SIZE 7927    /* initial number of hash table slots */
69
70
71static unsigned long MaxMemInUse = 0;
72static unsigned long MemInUse = 0;
73
74static idx_hash_rec **IdxHashTable;
75static unsigned long IdxHashSize;
76static unsigned long IdxHashUsed;
77static u_char *IdxPool;
78static int IdxPoolLeft;
79
80static idx_hash_rec *idx_hr_pool;
81static int idx_hr_PoolLeft;
82
83static idx_hash_rec **idx_first_occr;
84static int idx_max_first_occr;
85
86int block_size = 1024 * 4;
87
88int force = 0;
89
90static long lookback = 4;
91
92static void
93ChangeMem (int Change)
94{
95 MemInUse += Change;
96 if (MemInUse > MaxMemInUse)
97 MaxMemInUse = MemInUse;
98}
99
100
101/* =========================================================================
102 * Function: MakePosList
103 * Description:
104 * Input:
105 * Output:
106 * ========================================================================= */
107PosList *
108MakePosList (int n)
109{
110 PosList *pl;
111 int list_size = (n == 0 ? 1 : n); /* always allocate at least one node */
112
113 pl = Xmalloc (sizeof (PosList) + (list_size - 1) * sizeof (PosEntry));
114 if (!pl)
115 FatalError (1, "Unable to allocate term list");
116 ChangeMem (sizeof (PosList) + (list_size - 1) * sizeof (PosEntry));
117
118 pl->num_entries = n;
119 pl->list_size = list_size;
120
121 return pl;
122}
123
124/* =========================================================================
125 * Function: ResizePosList
126 * Description:
127 * Input:
128 * Output:
129 * ========================================================================= */
130
131#define GROWTH_FACTOR 2
132#define MIN_SIZE 2
133
134static void
135ResizePosList (PosList ** pos_list)
136{
137 PosList *pl = *pos_list;
138
139 ChangeMem (-(sizeof (PosList) + (pl->list_size - 1) * sizeof (PosEntry)));
140 if (pl->num_entries > pl->list_size)
141 {
142 if (pl->list_size)
143 pl->list_size *= GROWTH_FACTOR;
144 else
145 pl->list_size = MIN_SIZE;
146 }
147 pl = Xrealloc (pl, sizeof (PosList) + (pl->list_size - 1) * sizeof (PosEntry));
148
149 if (!pl)
150 FatalError (1, "Unable to resize pos list");
151 ChangeMem (sizeof (PosList) + (pl->list_size - 1) * sizeof (PosEntry));
152
153 *pos_list = pl;
154}
155
156
157/* =========================================================================
158 * Function: FreePosList
159 * Description:
160 * Input:
161 * Output:
162 * ========================================================================= */
163
164void
165FreePosList (PosList ** the_pl)
166{
167 PosList *pl = *the_pl;
168
169 ChangeMem (-(sizeof(PosList) + sizeof (PosEntry) * (pl->list_size - 1)));
170 Xfree (pl);
171
172 *the_pl = NULL;
173}
174
175
176/* =========================================================================
177 * Function: ResetPosList
178 * Description:
179 * Input:
180 * Output:
181 * ========================================================================= */
182
183void
184ResetPosList (PosList ** pl)
185{
186 if (*pl)
187 FreePosList (pl);
188 *pl = MakePosList (0);
189}
190
191/* =========================================================================
192 * Function: AddPosEntry
193 * Description:
194 * Input:
195 * Output:
196 * ========================================================================= */
197
198int
199AddPosEntry (PosList ** pos_list, PosEntry * pe)
200{
201 PosList *pl = *pos_list;
202
203 pl->num_entries++;
204 ResizePosList (pos_list);
205 pl = *pos_list;
206
207 /* copy the structure contents */
208 bcopy ((char *) pe, (char *) &(pl->PE[pl->num_entries - 1]), sizeof (PosEntry));
209
210 return pl->num_entries - 1;
211}
212
213
214/* Modified from stem_search.c */
215stemmed_dict *
216ReadStemDictBlk (File * stem_file)
217{
218 unsigned long i;
219 stemmed_dict *sd;
220 u_char *buffer;
221
222 if (!(sd = Xmalloc (sizeof (stemmed_dict))))
223 FatalError (1, "Could not allocate memory for stemmed dict");
224
225
226 sd->stem_file = stem_file;
227 sd->MemForStemDict = 0;
228
229 Fread (&sd->sdh, sizeof (sd->sdh), 1, stem_file);
230
231 /* [RPAP - Jan 97: Endian Ordering] */
232 NTOHUL(sd->sdh.lookback);
233 NTOHUL(sd->sdh.block_size);
234 NTOHUL(sd->sdh.num_blocks);
235 NTOHUL(sd->sdh.blocks_start);
236 NTOHUL(sd->sdh.index_chars);
237 NTOHUL(sd->sdh.num_of_docs);
238 NTOHUL(sd->sdh.static_num_of_docs);
239 NTOHUL(sd->sdh.num_of_words);
240 NTOHUL(sd->sdh.stem_method);
241 NTOHUL(sd->sdh.indexed);
242
243 if (!(buffer = Xmalloc (sd->sdh.index_chars)))
244 {
245 Xfree (sd);
246 FatalError (1, "Could not allocate memory for stemmed dict");
247 return (NULL);
248 };
249 sd->MemForStemDict += sd->sdh.index_chars;
250
251 if (!(sd->index = Xmalloc (sd->sdh.num_blocks * sizeof (*sd->index))))
252 {
253 Xfree (sd);
254 Xfree (buffer);
255 FatalError (1, "Could not allocate memory for stemmed dict");
256 return (NULL);
257 };
258 sd->MemForStemDict += sd->sdh.num_blocks * sizeof (*sd->index);
259
260 if (!(sd->pos = Xmalloc (sd->sdh.num_blocks * sizeof (*sd->pos))))
261 {
262 Xfree (sd);
263 Xfree (buffer);
264 Xfree (sd->index);
265 FatalError (1, "Could not allocate memory for stemmed dict");
266 return (NULL);
267 };
268 sd->MemForStemDict += sd->sdh.num_blocks * sizeof (*sd->pos);
269
270 if (!(sd->buffer = Xmalloc (sd->sdh.block_size * sizeof (*sd->buffer))))
271 {
272 Xfree (sd);
273 Xfree (buffer);
274 Xfree (sd->index);
275 Xfree (sd->buffer);
276 FatalError (1, "Could not allocate memory for stemmed dict");
277 return (NULL);
278 };
279 sd->MemForStemDict += sd->sdh.block_size * sizeof (*sd->buffer);
280
281 sd->active = -1;
282
283 for (i = 0; i < sd->sdh.num_blocks; i++)
284 {
285 register u_char len;
286 sd->index[i] = buffer;
287 len = Getc (stem_file);
288 *buffer++ = len;
289 Fread (buffer, sizeof (u_char), len, stem_file);
290 buffer += len;
291 Fread (&sd->pos[i], sizeof (*sd->pos), 1, stem_file);
292 NTOHUL(sd->pos[i]); /* [RPAP - Jan 97: Endian Ordering] */
293 }
294
295 return sd;
296}
297
298
299void
300init_process ()
301{
302 /* Allocate memory for idx hash table */
303
304 if (!(IdxPool = Xmalloc (POOL_SIZE)))
305 FatalError (1, "Unable to allocate memory for idx pool");
306 IdxPoolLeft = POOL_SIZE;
307 ChangeMem (POOL_SIZE);
308
309 if (!(idx_hr_pool = Xmalloc (HASH_POOL_SIZE * sizeof (idx_hash_rec))))
310 FatalError (1, "Unable to allocate memory for idx pool");
311 idx_hr_PoolLeft = HASH_POOL_SIZE;
312 ChangeMem (HASH_POOL_SIZE * sizeof (idx_hash_rec));
313
314 IdxHashSize = INITIAL_HASH_SIZE;
315 IdxHashUsed = 0;
316 if (!(IdxHashTable = Xmalloc (sizeof (idx_hash_rec *) * IdxHashSize)))
317 FatalError (1, "Unable to allocate memory for idx table");
318 ChangeMem (sizeof (idx_hash_rec *) * IdxHashSize);
319 bzero ((char *) IdxHashTable, sizeof (idx_hash_rec *) * IdxHashSize);
320 idx_max_first_occr = 8192;
321 if (!(idx_first_occr = Xmalloc (sizeof (idx_hash_rec *) * idx_max_first_occr)))
322 FatalError (1, "Unable to allocate memory for idx_first_occur");
323 ChangeMem (sizeof (idx_hash_rec *) * idx_max_first_occr);
324}
325
326
327void
328PackIdxHashTable (void)
329{
330 int s, d;
331 for (s = d = 0; s < IdxHashSize; s++)
332 if (IdxHashTable[s])
333 IdxHashTable[d++] = IdxHashTable[s];
334 ChangeMem (-sizeof (idx_hash_rec *) * IdxHashSize);
335 ChangeMem (sizeof (idx_hash_rec *) * IdxHashUsed);
336 if (!(IdxHashTable = Xrealloc (IdxHashTable, sizeof (idx_hash_rec *) * IdxHashUsed)))
337 FatalError (1, "Out of memory");
338 IdxHashSize = IdxHashUsed;
339}
340
341
342void
343process_stem_dict (stemmed_dict * sd, int stem_method)
344{
345 int block;
346 short blk_index = -1;
347 int wordnum = -1;
348 PosEntry *prevPE = NULL;
349 idx_hash_rec *prevIdx = NULL;
350 u_char word[MAXSTEMLEN + 1];
351 u_char prev[MAXSTEMLEN + 1];
352
353 /* For each block in stem dict... */
354 for (block = 0; block < sd->sdh.num_blocks; block++)
355 {
356 register unsigned int res;
357 int num_indexes;
358 unsigned long *first_word, *last_invf_len;
359 unsigned short *num_words;
360 u_char *base;
361 unsigned short *index;
362
363 /* Read block into buffer */
364 Fseek (sd->stem_file, sd->pos[block] + sd->sdh.blocks_start, 0);
365 Fread (sd->buffer, sd->sdh.block_size, sizeof (u_char), sd->stem_file);
366 sd->active = sd->pos[block];
367
368 /* Move through block header */
369 first_word = (unsigned long *) (sd->buffer);
370 NTOHUL(*first_word); /* [RPAP - Jan 97: Endian Ordering] */
371 last_invf_len = (unsigned long *) (first_word + 1);
372 NTOHUL(*last_invf_len); /* [RPAP - Jan 97: Endian Ordering] */
373 num_words = (unsigned short *) (last_invf_len + 1);
374 NTOHUS(*num_words); /* [RPAP - Jan 97: Endian Ordering] */
375 index = num_words + 1;
376 num_indexes = ((*num_words - 1) / sd->sdh.lookback) + 1;
377
378 {
379 /* [RPAP - Jan 97: Endian Ordering] */
380 int i;
381 for (i = 0; i < num_indexes; i++)
382 NTOHUS(index[i]);
383 }
384
385 base = (u_char *) (index + num_indexes);
386 blk_index = -1;
387
388 /* For each word in block... */
389 for (res = 0; res < *num_words; res++)
390 {
391 unsigned copy, suff;
392 register idx_hash_rec *idx_ent = 0;
393
394 /* Update blk_index */
395 if (!(res % sd->sdh.lookback))
396 blk_index++;
397
398 copy = *base++;
399 suff = *base++;
400 bcopy ((char *) base, (char *) (prev + copy + 1), suff);
401 base += suff;
402 *prev = copy + suff;
403
404 /* Skip irrelevant word info */
405 base += sizeof (u_long);
406 base += sizeof (u_long);
407 base += sizeof (u_long);
408
409 /* Stem word */
410 bcopy ((char *) prev, (char *) word, *prev + 1);
411 stemmer (stem_method, word);
412
413 /* Check if word follows straight on from previous word */
414 if (prevIdx && !compare (word, prevIdx->word))
415 prevPE->num_cases++;
416
417 else
418 {
419 /* Search the idx hash table for word */
420 register unsigned long hashval, step;
421 register int hsize = IdxHashSize;
422 HASH (hashval, step, word, hsize);
423 for (;;)
424 {
425 register u_char *s1;
426 register u_char *s2;
427 register int len;
428 idx_ent = IdxHashTable[hashval];
429 if (!idx_ent)
430 {
431 /* Create a next entry in the hash table */
432 int len = *word + 1;
433 if (!idx_hr_PoolLeft)
434 {
435 if (!(idx_hr_pool = Xmalloc (HASH_POOL_SIZE *
436 sizeof (idx_hash_rec))))
437 FatalError (1, "Unable to allocate memory for pool");
438 idx_hr_PoolLeft = HASH_POOL_SIZE;
439 ChangeMem (HASH_POOL_SIZE * sizeof (idx_hash_rec));
440 }
441 idx_ent = idx_hr_pool++;
442 idx_hr_PoolLeft--;
443 if (len > IdxPoolLeft)
444 {
445 if (!(IdxPool = Xmalloc (POOL_SIZE)))
446 FatalError (1, "Unable to allocate memory for pool");
447 IdxPoolLeft = POOL_SIZE;
448 ChangeMem (POOL_SIZE);
449 }
450 wordnum++;
451
452 idx_ent->word = IdxPool;
453 idx_ent->PL = MakePosList (0);
454 {
455 PosEntry PE;
456 PE.num_cases = 1;
457 PE.blk = block;
458 PE.blk_index = (unsigned short) blk_index;
459 PE.offset = res % sd->sdh.lookback;
460 AddPosEntry (&(idx_ent->PL), &PE);
461 }
462 prevIdx = idx_ent;
463 prevPE = &(idx_ent->PL->PE[idx_ent->PL->num_entries - 1]);
464
465 bcopy ((char *) word, (char *) IdxPool, len);
466 IdxPool += len;
467 IdxPoolLeft -= len;
468 if (IdxHashUsed == idx_max_first_occr - 1)
469 {
470 ChangeMem (-sizeof (idx_hash_rec *) * idx_max_first_occr);
471 idx_max_first_occr *= 2;
472 if (!(idx_first_occr = Xrealloc (idx_first_occr, sizeof (idx_hash_rec *) *
473 idx_max_first_occr)))
474 FatalError (1, "Unable to allocate memory for idx_first_occr");
475 ChangeMem (sizeof (idx_hash_rec *) * idx_max_first_occr);
476 }
477 idx_first_occr[IdxHashUsed] = idx_ent;
478 IdxHashUsed++;
479 IdxHashTable[hashval] = idx_ent;
480 break;
481 }
482
483 /* Compare the words */
484 s1 = word;
485 s2 = idx_ent->word;
486 len = *s1 + 1;
487 for (; len; len--)
488 if (*s1++ != *s2++)
489 break;
490
491 if (len)
492 {
493 /* Entry is not the right one - move to next hash index */
494 hashval = (hashval + step);
495 if (hashval >= hsize)
496 hashval -= hsize;
497 }
498 else
499 {
500 /* Entry is correct - added PosEntry to word */
501 PosEntry PE;
502 PE.num_cases = 1;
503 PE.blk = block;
504 PE.blk_index = (unsigned short) blk_index;
505 PE.offset = res % sd->sdh.lookback;
506 AddPosEntry (&(idx_ent->PL), &PE);
507 prevIdx = idx_ent;
508 prevPE = &(idx_ent->PL->PE[idx_ent->PL->num_entries - 1]);
509 break;
510 }
511 }
512 }
513
514 if (IdxHashUsed >= IdxHashSize >> 1)
515 {
516 idx_hash_rec **ht;
517 unsigned long size;
518 unsigned long i;
519 size = prime (IdxHashSize * 2);
520 if (!(ht = Xmalloc (sizeof (idx_hash_rec *) * size)))
521 FatalError (1, "Unable to allocate memory for idx table");
522 bzero ((char *) ht, sizeof (idx_hash_rec *) * size);
523 ChangeMem (sizeof (idx_hash_rec *) * size);
524
525 for (i = 0; i < IdxHashSize; i++)
526 if (IdxHashTable[i])
527 {
528 register u_char *wptr;
529 idx_hash_rec *ent;
530 register unsigned long hashval, step;
531
532 wptr = IdxHashTable[i]->word;
533 HASH (hashval, step, wptr, size);
534 ent = ht[hashval];
535 while (ent)
536 {
537 hashval += step;
538 if (hashval >= size)
539 hashval -= size;
540 ent = ht[hashval];
541 }
542 ht[hashval] = IdxHashTable[i];
543 }
544 Xfree (IdxHashTable);
545 ChangeMem (-sizeof (idx_hash_rec *) * IdxHashSize);
546 IdxHashTable = ht;
547 IdxHashSize = size;
548 }
549
550 } /* end for each word */
551
552 } /* end for each block */
553
554}
555
556static int
557idx_comp (const void *A, const void *B)
558{
559 u_char *s1 = (*((idx_hash_rec **) A))->word;
560 u_char *s2 = (*((idx_hash_rec **) B))->word;
561 return (casecompare (s1, s2));
562}
563
564
565void
566save_idx (char * filename, int stem_method)
567{
568 char *FName;
569 unsigned long i, j, pos, First_word, num;
570 struct stem_idx_header sih;
571 u_char *buffer, *last_word = NULL;
572 unsigned short *pointers;
573 int buf_in_use;
574 unsigned short ptrs_in_use, word_num;
575 FILE *idbi = NULL, *tmp = NULL;
576
577 FName = make_name (filename, ".tmp", NULL);
578 if (!(tmp = fopen (FName, "w+b")))
579 FatalError (1, "Unable to open \"%s\".\n", FName);
580
581 /* Delete the file now */
582 unlink (FName);
583
584 /* Create appropriate stem index file */
585 switch (stem_method)
586 {
587 case (1):
588 {
589 idbi = create_file (filename, INVF_DICT_BLOCKED_1_SUFFIX, "wb", MAGIC_STEM_1,
590 MG_ABORT);
591 break;
592 }
593
594 case (2):
595 {
596 idbi = create_file (filename, INVF_DICT_BLOCKED_2_SUFFIX, "wb", MAGIC_STEM_2,
597 MG_ABORT);
598 break;
599 }
600 case (3):
601 {
602 idbi = create_file (filename, INVF_DICT_BLOCKED_3_SUFFIX, "wb", MAGIC_STEM_3,
603 MG_ABORT);
604 break;
605 }
606 }
607
608 if (!idbi)
609 FatalError (1, "Could NOT create .invf.blocked.%d file", stem_method);
610
611 PackIdxHashTable();
612 qsort (IdxHashTable, IdxHashUsed, sizeof (idx_hash_rec *), idx_comp);
613
614 sih.lookback = lookback;
615 sih.block_size = block_size;
616 sih.num_blocks = 0;
617 sih.blocks_start = 0;
618 sih.index_chars = 0;
619 sih.num_of_words = IdxHashUsed;
620
621 fwrite ((char *) &sih, sizeof (sih), 1, idbi);
622
623 if (!(buffer = Xmalloc (block_size + 512)))
624 FatalError (1, "Unable to allocate memory for \"buffer\"\n");
625 if (!(pointers = Xmalloc (block_size + 512)))
626 FatalError (1, "Unable to allocate memory for \"buffer\"\n");
627
628 buf_in_use = 0;
629 pos = 0;
630 word_num = 0;
631 ptrs_in_use = 0;
632 First_word = 0;
633
634 /* For each word in the hashtable... */
635 for (i = 0; i < IdxHashUsed; i++)
636 {
637 register unsigned long extra, copy, suff;
638 register struct idx_hash_rec *ent = IdxHashTable[i];
639
640 /* build a new word on top of prev */
641 if (last_word != NULL)
642 copy = prefixlen (last_word, ent->word);
643 else
644 copy = 0;
645 suff = *(ent->word) - copy;
646 last_word = ent->word;
647
648 if (word_num % sih.lookback == 0)
649 /* Will need copy chars to add + a pointer in index */
650 extra = copy + sizeof (*pointers);
651 else
652 extra = 0;
653 if ((ptrs_in_use + 1) * sizeof (*pointers) + sizeof (ptrs_in_use) + extra +
654 buf_in_use + sizeof (First_word) + suff + 1 + sizeof (ent->PL->num_entries) +
655 ent->PL->num_entries * sizeof (PosEntry) > block_size)
656 {
657 /* Dump buffer to tmp file */
658 int chunk;
659 HTONUL(First_word); /* [RPAP - Jan 97: Endian Ordering] */
660 HTONUS(word_num); /* [RPAP - Jan 97: Endian Ordering] */
661 fwrite (&First_word, sizeof (First_word), 1, tmp);
662 fwrite (&word_num, sizeof (word_num), 1, tmp);
663 fwrite (pointers, sizeof (*pointers), ptrs_in_use, tmp);
664 fwrite (buffer, sizeof (u_char), buf_in_use, tmp);
665 bzero ((char *) buffer, block_size);
666 chunk = buf_in_use + ptrs_in_use * sizeof (*pointers) +
667 sizeof (ptrs_in_use) + sizeof (First_word);
668 if (force && chunk < block_size)
669 {
670 fwrite (buffer, sizeof (u_char), block_size - chunk, tmp);
671 chunk = block_size;
672 }
673
674 pos += chunk;
675
676 buf_in_use = 0;
677 word_num = 0;
678 ptrs_in_use = 0;
679 sih.num_blocks++;
680
681 /* Check that entry will fit into new block */
682 if (sizeof (*pointers) + sizeof (ptrs_in_use) + extra + sizeof (First_word) +
683 suff + 1 + sizeof (ent->PL->num_entries) +
684 ent->PL->num_entries * sizeof (PosEntry) > block_size)
685 FatalError (1, "Block size to small");
686
687 }
688
689 if (word_num % sih.lookback == 0)
690 {
691 HTONUS2(buf_in_use, pointers[ptrs_in_use++]); /* [RPAP - Jan 97: Endian Ordering] */
692 suff += copy;
693 copy = 0;
694 }
695
696 /* Output Word information */
697 buffer[buf_in_use++] = copy;
698 buffer[buf_in_use++] = suff;
699 bcopy ((char *) (ent->word + copy + 1), (char *) (buffer + buf_in_use), suff);
700 buf_in_use += suff;
701 HTONUI(ent->PL->num_entries); /* [RPAP - Jan 97: Endian Ordering] */
702 bcopy ((char *) &(ent->PL->num_entries), (char *) (buffer + buf_in_use), sizeof (ent->PL->num_entries));
703 NTOHUI(ent->PL->num_entries); /* [RPAP - Jan 97: Endian Ordering] */
704 buf_in_use += sizeof (ent->PL->num_entries);
705
706 for (j = 0; j < ent->PL->num_entries; j++)
707 {
708 register PosEntry *pe = &(ent->PL->PE[j]);
709 HTONUI(pe->num_cases); /* [RPAP - Jan 97: Endian Ordering] */
710 bcopy ((char *) &(pe->num_cases), (char *) (buffer + buf_in_use), sizeof (pe->num_cases));
711 buf_in_use += sizeof (pe->num_cases);
712 HTONUI(pe->blk); /* [RPAP - Jan 97: Endian Ordering] */
713 bcopy ((char *) &(pe->blk), (char *) (buffer + buf_in_use), sizeof (pe->blk));
714 buf_in_use += sizeof (pe->blk);
715 HTONUS(pe->blk_index); /* [RPAP - Jan 97: Endian Ordering] */
716 bcopy ((char *) &(pe->blk_index), (char *) (buffer + buf_in_use), sizeof (pe->blk_index));
717 buf_in_use += sizeof (pe->blk_index);
718 HTONUS(pe->offset); /* [RPAP - Jan 97: Endian Ordering] */
719 bcopy ((char *) &(pe->offset), (char *) (buffer + buf_in_use), sizeof (pe->offset));
720 buf_in_use += sizeof (pe->offset);
721 }
722
723 if (buf_in_use + ptrs_in_use * sizeof (*pointers) +
724 sizeof (ptrs_in_use) > block_size)
725 FatalError (1, "Fatal Internal Error # 64209258\n");
726
727 if (word_num == 0)
728 {
729 /* Write word to main index */
730 fwrite (ent->word, sizeof (u_char), *(ent->word) + 1, idbi);
731 HTONUL(pos); /* [RPAP - Jan 97: Endian Ordering] */
732 fwrite (&pos, sizeof (pos), 1, idbi);
733 NTOHUL(pos); /* [RPAP - Jan 97: Endian Ordering] */
734 sih.index_chars += *(ent->word) + 1;
735 First_word = i;
736 }
737 word_num++;
738 } /* end for each word */
739
740 if (buf_in_use)
741 {
742 /* Write last buffer to tmp file */
743 int chunk;
744
745 /* [RPAP - Jan 97: Endian Ordering] */
746 HTONUL(First_word);
747 HTONUS(word_num);
748
749 fwrite (&First_word, sizeof (First_word), 1, tmp);
750 fwrite (&word_num, sizeof (word_num), 1, tmp);
751 fwrite (pointers, sizeof (*pointers), ptrs_in_use, tmp);
752 fwrite (buffer, sizeof (u_char), buf_in_use, tmp);
753 bzero ((char *) buffer, block_size);
754 chunk = buf_in_use + ptrs_in_use * sizeof (*pointers) +
755 sizeof (ptrs_in_use) + sizeof (First_word);
756 if (force && chunk < block_size)
757 {
758 fwrite (buffer, sizeof (u_char), block_size - chunk, tmp);
759 chunk = block_size;
760 }
761
762 sih.num_blocks++;
763 }
764
765 rewind (tmp);
766 sih.blocks_start = sih.index_chars + sizeof (u_long) + sizeof (sih) +
767 sih.num_blocks * sizeof (pos);
768 if (force)
769 {
770 int amount;
771 amount = sih.blocks_start % block_size;
772 if (amount != 0)
773 {
774 bzero ((char *) buffer, block_size);
775 fwrite (buffer, sizeof (u_char), block_size - amount, idbi);
776 sih.blocks_start += block_size - amount;
777 }
778 }
779
780 while ((num = fread (buffer, sizeof (u_char), block_size, tmp)) != 0)
781 fwrite (buffer, sizeof (u_char), num, idbi);
782 fclose (tmp);
783
784 /* skip over the magic number */
785 fseek (idbi, sizeof (u_long), 0);
786
787 /* [RPAP - Jan 97: Endian Ordering] */
788 HTONUL(sih.lookback);
789 HTONUL(sih.block_size);
790 HTONUL(sih.num_blocks);
791 HTONUL(sih.blocks_start);
792 HTONUL(sih.index_chars);
793 HTONUL(sih.num_of_words);
794
795 fwrite (&sih, sizeof (sih), 1, idbi);
796 fclose (idbi);
797
798#ifndef SILENT
799 Message ("Stem %d:\n", stem_method);
800 Message (" Block size : %10d\n", block_size);
801 Message (" Num_blocks : %10d\n", NTOHUL(sih.num_blocks)); /* [RPAP - Jan 97: Endian Ordering] */
802 Message (" Max mem used : %10.1f Mb\n", (double) MaxMemInUse / 1024.0 / 1024.0);
803 Message (" Num_of_words : %10d\n", NTOHUL(sih.num_of_words)); /* [RPAP - Jan 97: Endian Ordering] */
804#endif
805}
806
807
808void
809UpdateStemDict (char * filename, int stem_method)
810{
811 FILE *idb;
812 struct stem_dict_header sdh;
813
814 if (!(idb = open_file (filename, INVF_DICT_BLOCKED_SUFFIX, "r+b",
815 MAGIC_STEM, MG_CONTINUE)))
816 FatalError (1, "Could not update stemmed dict");
817
818 fread ((char *) &sdh, sizeof (sdh), 1, idb);
819 NTOHUL(sdh.indexed); /* [RPAP - Jan 97: Endian Ordering] */
820 sdh.indexed |= 1 << (stem_method - 1);
821 HTONUL(sdh.indexed);
822 fseek (idb, sizeof (u_long), 0);
823 fwrite ((char *) &sdh, sizeof (sdh), 1, idb);
824 fclose (idb);
825}
826
827
828
829/* Main */
830void main (int argc, char **argv)
831{
832 File *idb; /* File to .invf.dict.blocked */
833 char *filename = "";
834 stemmed_dict *sd; /* Stemmed dictionary */
835 int ch;
836 char path[512];
837 int stem_method = 0;
838
839 msg_prefix = argv[0];
840 opterr = 0;
841 while ((ch = getopt (argc, argv, "f:d:b:hFs:")) != -1)
842 switch (ch)
843 {
844 case 'f': /* input file */
845 filename = optarg;
846 break;
847 case 'd':
848 set_basepath (optarg);
849 break;
850 case 'b':
851 block_size = atoi (optarg);
852 break;
853 case 'F':
854 force = 1;
855 break;
856 case 's':
857 stem_method = atoi (optarg);
858 break;
859 case 'h':
860 case '?':
861 fprintf (stderr, "usage: %s [-d directory] "
862 "[-b num] [-F] [-h] -s 1|2|3 -f name\n", argv[0]);
863 exit (1);
864 }
865
866 if (stem_method < 1 || stem_method > 3)
867 FatalError (1, "Stem method must be 1, 2 or 3");
868
869 /* Open required stem dict file */
870 sprintf (path, FILE_NAME_FORMAT, get_basepath (), filename, INVF_DICT_BLOCKED_SUFFIX);
871 if (!(idb = Fopen (path, "rb", MAGIC_STEM)))
872 FatalError (1, "Unable to open \"%s\"", path);
873
874 /* Read in idb header and index to blocks */
875 if (!(sd = ReadStemDictBlk (idb)))
876 FatalError (1, "Could not read stemmed dictionary");
877
878 /* Process stemmed dictionary */
879 init_process ();
880 process_stem_dict (sd, stem_method);
881 save_idx (filename, stem_method);
882
883 /* Close stemmed dict */
884 Fclose (idb);
885
886 /* Update stemmed dict */
887 UpdateStemDict (filename, stem_method);
888
889 exit (0);
890}
891
Note: See TracBrowser for help on using the repository browser.