root/trunk/gsdl/packages/mg/src/text/mg_stem_idx.c @ 2378

Revision 2378, 23.9 KB (checked in by jrm21, 19 years ago)

renamed function init_process to mg_init_process as OS X happens to have an
internal library function called that...

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1/**************************************************************************
2 *
3 * mg_stem_idx.c -- Memory efficient stem index builder
4 * Copyright (C) 1997  Ross Peeters
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22#include "sysfuncs.h"
23
24#include "memlib.h"
25#include "messages.h"
26#include "filestats.h"
27#include "netorder.h"  /* [RPAP - Jan 97: Endian Ordering] */
28
29#include "mg_files.h"
30#include "invf.h"
31#include "mg.h"
32#include "locallib.h"
33#include "backend.h"     /* For struct stemmed_dict */
34#include "words.h"
35#include "stemmer.h"
36#include "hash.h"
37#include "local_strings.h"
38
/* One position record for a stemmed word: where in the stemmed dictionary
   a run of words producing this stem begins. */
struct PosEntry
{
  unsigned int num_cases;     /* consecutive dictionary words sharing the stem */
  unsigned int blk;           /* dictionary block number of the first word */
  unsigned short blk_index;   /* lookback group within that block */
  unsigned short offset;      /* word position inside the lookback group */
};

typedef struct PosEntry PosEntry;
47
48
49typedef struct PosList
50{
51  unsigned int list_size;
52  unsigned int num_entries;
53  PosEntry PE[1];
54}
55PosList;
56
57
58typedef struct idx_hash_rec
59{
60  u_char *word;
61  PosList *PL;
62}
63idx_hash_rec;
64
65
/* Size in bytes of each pool that word strings are carved from.
   Parenthesised so the macro is safe inside larger expressions. */
#define POOL_SIZE (1024 * 1024)
/* Number of idx_hash_rec records allocated per record pool */
#define HASH_POOL_SIZE 8192
/* Initial number of slots in the idx hash table */
#define INITIAL_HASH_SIZE 7927
69
70
/* Memory accounting maintained by ChangeMem: MemInUse is the running total,
   MaxMemInUse the highest total seen (reported by save_idx). */
71static unsigned long MaxMemInUse = 0;
72static unsigned long MemInUse = 0;
73
/* Hash table of stemmed words: IdxHashSize slots, IdxHashUsed of them
   occupied.  Each occupied slot points to an idx_hash_rec. */
74static idx_hash_rec **IdxHashTable;
75static unsigned long IdxHashSize;
76static unsigned long IdxHashUsed;
/* Current pool that word strings are copied into, and the bytes left in it */
77static u_char *IdxPool;
78static int IdxPoolLeft;
79
/* Current pool of idx_hash_rec records, and the records left in it */
80static idx_hash_rec *idx_hr_pool;
81static int idx_hr_PoolLeft;
82
/* Hash records in the order they were first created, and the allocated
   length of that array (doubled by process_stem_dict when it fills up) */
83static idx_hash_rec **idx_first_occr;
84static int idx_max_first_occr;
85
/* Output block size in bytes; overridden by the -b option in main */
86int block_size = 1024 * 4;
87
/* Non-zero (-F option) makes save_idx pad each block out to block_size */
88int force = 0;
89
/* Lookback value written to the index header; save_idx stores every
   lookback-th word in full and adds an index pointer for it */
90static long lookback = 4;
91
92static void
93ChangeMem (int Change)
94{
95  MemInUse += Change;
96  if (MemInUse > MaxMemInUse)
97    MaxMemInUse = MemInUse;
98}
99
100
101/* =========================================================================
102 * Function: MakePosList
103 * Description:
104 * Input:
105 * Output:
106 * ========================================================================= */
107PosList *
108MakePosList (int n)
109{
110  PosList *pl;
111  int list_size = (n == 0 ? 1 : n); /* always allocate at least one node */
112
113  pl = Xmalloc (sizeof (PosList) + (list_size - 1) * sizeof (PosEntry));
114  if (!pl)
115    FatalError (1, "Unable to allocate term list");
116  ChangeMem (sizeof (PosList) + (list_size - 1) * sizeof (PosEntry));
117
118  pl->num_entries = n;
119  pl->list_size = list_size;
120
121  return pl;
122}
123
124/* =========================================================================
125 * Function: ResizePosList
126 * Description:
127 * Input:
128 * Output:
129 * ========================================================================= */
130
131#define GROWTH_FACTOR 2
132#define MIN_SIZE 2
133
134static void
135ResizePosList (PosList ** pos_list)
136{
137  PosList *pl = *pos_list;
138
139  ChangeMem (-(sizeof (PosList) + (pl->list_size - 1) * sizeof (PosEntry)));
140  if (pl->num_entries > pl->list_size)
141    {
142      if (pl->list_size)
143    pl->list_size *= GROWTH_FACTOR;
144      else
145    pl->list_size = MIN_SIZE;
146    }
147  pl = Xrealloc (pl, sizeof (PosList) + (pl->list_size - 1) * sizeof (PosEntry));
148
149  if (!pl)
150    FatalError (1, "Unable to resize pos list");
151  ChangeMem (sizeof (PosList) + (pl->list_size - 1) * sizeof (PosEntry));
152
153  *pos_list = pl;
154}
155
156
157/* =========================================================================
158 * Function: FreePosList
159 * Description:
160 * Input:
161 * Output:
162 * ========================================================================= */
163
164void
165FreePosList (PosList ** the_pl)
166{
167  PosList *pl = *the_pl;
168
169  ChangeMem (-(sizeof(PosList) + sizeof (PosEntry) * (pl->list_size - 1)));
170  Xfree (pl);
171
172  *the_pl = NULL;
173}
174
175
176/* =========================================================================
177 * Function: ResetPosList
178 * Description:
179 * Input:
180 * Output:
181 * ========================================================================= */
182
183void
184ResetPosList (PosList ** pl)
185{
186  if (*pl)
187    FreePosList (pl);
188  *pl = MakePosList (0);
189}
190
191/* =========================================================================
192 * Function: AddPosEntry
193 * Description:
194 * Input:
195 * Output:
196 * ========================================================================= */
197
198int
199AddPosEntry (PosList ** pos_list, PosEntry * pe)
200{
201  PosList *pl = *pos_list;
202
203  pl->num_entries++;
204  ResizePosList (pos_list);
205  pl = *pos_list;
206
207  /* copy the structure contents */
208  bcopy ((char *) pe, (char *) &(pl->PE[pl->num_entries - 1]), sizeof (PosEntry));
209
210  return pl->num_entries - 1;
211}
212
213
214/* Modified from stem_search.c */
215stemmed_dict *
216ReadStemDictBlk (File * stem_file)
217{
218  unsigned long i;
219  stemmed_dict *sd;
220  u_char *buffer;
221
222  if (!(sd = Xmalloc (sizeof (stemmed_dict))))
223    FatalError (1, "Could not allocate memory for stemmed dict");
224
225
226  sd->stem_file = stem_file;
227  sd->MemForStemDict = 0;
228
229  Fread (&sd->sdh, sizeof (sd->sdh), 1, stem_file);
230
231  /* [RPAP - Jan 97: Endian Ordering] */
232  NTOHUL(sd->sdh.lookback);
233  NTOHUL(sd->sdh.block_size);
234  NTOHUL(sd->sdh.num_blocks);
235  NTOHUL(sd->sdh.blocks_start);
236  NTOHUL(sd->sdh.index_chars);
237  NTOHUL(sd->sdh.num_of_docs);
238  NTOHUL(sd->sdh.static_num_of_docs);
239  NTOHUL(sd->sdh.num_of_words);
240  NTOHUL(sd->sdh.stemmer_num);
241  NTOHUL(sd->sdh.stem_method);
242  NTOHUL(sd->sdh.indexed);
243
244  if (!(buffer = Xmalloc (sd->sdh.index_chars)))
245    {
246      Xfree (sd);
247      FatalError (1, "Could not allocate memory for stemmed dict");
248      return (NULL);
249    };
250  sd->MemForStemDict += sd->sdh.index_chars;
251
252  if (!(sd->index = Xmalloc (sd->sdh.num_blocks * sizeof (*sd->index))))
253    {
254      Xfree (sd);
255      Xfree (buffer);
256      FatalError (1, "Could not allocate memory for stemmed dict");
257      return (NULL);
258    };
259  sd->MemForStemDict += sd->sdh.num_blocks * sizeof (*sd->index);
260
261  if (!(sd->pos = Xmalloc (sd->sdh.num_blocks * sizeof (*sd->pos))))
262    {
263      Xfree (sd);
264      Xfree (buffer);
265      Xfree (sd->index);
266      FatalError (1, "Could not allocate memory for stemmed dict");
267      return (NULL);
268    };
269  sd->MemForStemDict += sd->sdh.num_blocks * sizeof (*sd->pos);
270
271  if (!(sd->buffer = Xmalloc (sd->sdh.block_size * sizeof (*sd->buffer))))
272    {
273      Xfree (sd);
274      Xfree (buffer);
275      Xfree (sd->index);
276      Xfree (sd->buffer);
277      FatalError (1, "Could not allocate memory for stemmed dict");
278      return (NULL);
279    };
280  sd->MemForStemDict += sd->sdh.block_size * sizeof (*sd->buffer);
281
282  sd->active = -1;
283
284  for (i = 0; i < sd->sdh.num_blocks; i++)
285    {
286      register u_char len;
287      sd->index[i] = buffer;
288      len = Getc (stem_file);
289      *buffer++ = len;
290      Fread (buffer, sizeof (u_char), len, stem_file);
291      buffer += len;
292      Fread (&sd->pos[i], sizeof (*sd->pos), 1, stem_file);
293      NTOHUL(sd->pos[i]);  /* [RPAP - Jan 97: Endian Ordering] */
294    }
295
296  return sd;
297}
298
299
300void
301mg_init_process ()
302{
303  /* Allocate memory for idx hash table */
304
305  if (!(IdxPool = Xmalloc (POOL_SIZE)))
306    FatalError (1, "Unable to allocate memory for idx pool");
307  IdxPoolLeft = POOL_SIZE;
308  ChangeMem (POOL_SIZE);
309
310  if (!(idx_hr_pool = Xmalloc (HASH_POOL_SIZE * sizeof (idx_hash_rec))))
311    FatalError (1, "Unable to allocate memory for idx pool");
312  idx_hr_PoolLeft = HASH_POOL_SIZE;
313  ChangeMem (HASH_POOL_SIZE * sizeof (idx_hash_rec));
314
315  IdxHashSize = INITIAL_HASH_SIZE;
316  IdxHashUsed = 0;
317  if (!(IdxHashTable = Xmalloc (sizeof (idx_hash_rec *) * IdxHashSize)))
318    FatalError (1, "Unable to allocate memory for idx table");
319  ChangeMem (sizeof (idx_hash_rec *) * IdxHashSize);
320  bzero ((char *) IdxHashTable, sizeof (idx_hash_rec *) * IdxHashSize);
321  idx_max_first_occr = 8192;
322  if (!(idx_first_occr = Xmalloc (sizeof (idx_hash_rec *) * idx_max_first_occr)))
323    FatalError (1, "Unable to allocate memory for idx_first_occur");
324  ChangeMem (sizeof (idx_hash_rec *) * idx_max_first_occr);
325}
326
327
328void
329PackIdxHashTable (void)
330{
331  int s, d;
332  for (s = d = 0; s < IdxHashSize; s++)
333    if (IdxHashTable[s])
334      IdxHashTable[d++] = IdxHashTable[s];
335  ChangeMem (-sizeof (idx_hash_rec *) * IdxHashSize);
336  ChangeMem (sizeof (idx_hash_rec *) * IdxHashUsed);
337  if (!(IdxHashTable = Xrealloc (IdxHashTable, sizeof (idx_hash_rec *) * IdxHashUsed)))
338    FatalError (1, "Out of memory");
339  IdxHashSize = IdxHashUsed;
340}
341
342
/* process_stem_dict
 *
 * Walks every block of the stemmed dictionary `sd`, rebuilds each
 * front-coded word, stems it with `stem_method`, and records in the global
 * idx hash table where the stemmed form occurs: the block number, the
 * lookback group within the block, and the offset inside that group.
 *
 * When a word stems to the same term as the word immediately before it, no
 * new PosEntry is added; the num_cases count of the previous PosEntry is
 * incremented instead.
 *
 * Words and hash records are carved out of the pools set up by
 * mg_init_process; a fresh pool is allocated when the current one runs out.
 * After each word the hash table is rebuilt in a larger table once it is at
 * least half full.
 */
343void
344process_stem_dict (stemmed_dict * sd, int stem_method)
345{
346  int block;
347  short blk_index = -1;
  /* wordnum is incremented for each new hash entry but not read in this function */
348  int wordnum = -1;
  /* Last PosEntry / hash record touched; used to detect runs of equal stems */
349  PosEntry *prevPE = NULL;
350  idx_hash_rec *prevIdx = NULL;
  /* Length-prefixed buffers: byte 0 holds the length, the characters follow */
351  u_char word[MAXSTEMLEN + 1];
352  u_char prev[MAXSTEMLEN + 1];
353
354  /* For each block in stem dict... */
355  for (block = 0; block < sd->sdh.num_blocks; block++)
356    {
357      register unsigned int res;
358      int num_indexes;
359      unsigned long *first_word, *last_invf_len;
360      unsigned short *num_words;
361      u_char *base;
362      unsigned short *index;
363
364      /* Read block into buffer */
365      Fseek (sd->stem_file, sd->pos[block] + sd->sdh.blocks_start, 0);
366      Fread (sd->buffer, sd->sdh.block_size, sizeof (u_char), sd->stem_file);
367      sd->active = sd->pos[block];
368
369      /* Move through block header */
      /* NOTE(review): the header is read through `unsigned long *`, so the
         assumed on-disk layout follows sizeof (unsigned long) on the build
         platform - confirm this matches the file format on LP64 targets. */
370      first_word = (unsigned long *) (sd->buffer);
371      NTOHUL(*first_word);  /* [RPAP - Jan 97: Endian Ordering] */
372      last_invf_len = (unsigned long *) (first_word + 1);
373      NTOHUL(*last_invf_len);  /* [RPAP - Jan 97: Endian Ordering] */
374      num_words = (unsigned short *) (last_invf_len + 1);
375      NTOHUS(*num_words);  /* [RPAP - Jan 97: Endian Ordering] */
376      index = num_words + 1;
      /* One index entry per group of `lookback` words */
377      num_indexes = ((*num_words - 1) / sd->sdh.lookback) + 1;
378
379      {
380    /* [RPAP - Jan 97: Endian Ordering] */
381    int i;
382    for (i = 0; i < num_indexes; i++)
383      NTOHUS(index[i]);
384      }
385
      /* Word data starts right after the index entries */
386      base = (u_char *) (index + num_indexes);
387      blk_index = -1;
388
389      /* For each word in block... */
390      for (res = 0; res < *num_words; res++)
391    {
392      unsigned copy, suff;
393      register idx_hash_rec *idx_ent = 0;
394
395      /* Update blk_index */
396      if (!(res % sd->sdh.lookback))
397        blk_index++;
398
      /* Front-coded word: keep `copy` characters of the previous word and
         append the `suff` characters that follow in the block.
         NOTE(review): copy + suff is not checked against MAXSTEMLEN before
         writing into prev - confirm the file format guarantees the bound. */
399      copy = *base++;
400      suff = *base++;
401      bcopy ((char *) base, (char *) (prev + copy + 1), suff);
402      base += suff;
403      *prev = copy + suff;
404
405      /* Skip irrelevant word info */
406      base += sizeof (u_long);
407      base += sizeof (u_long);
408      base += sizeof (u_long);
409
410      /* Stem word */
411      bcopy ((char *) prev, (char *) word, *prev + 1);
412      stemmer (stem_method, sd->sdh.stemmer_num, word);
413
414      /* Check if word follows straight on from previous word */
415      if (prevIdx && !compare (word, prevIdx->word))
416        prevPE->num_cases++;
417
418      else
419        {
420          /* Search the idx hash table for word */
421          register unsigned long hashval, step;
422          register int hsize = IdxHashSize;
423          HASH (hashval, step, word, hsize);
424          for (;;)
425        {
426          register u_char *s1;
427          register u_char *s2;
428          register int len;
429          idx_ent = IdxHashTable[hashval];
430          if (!idx_ent)
431            {
432              /* Create a next entry in the hash table */
              /* (this `len` shadows the one declared at the top of the loop) */
433              int len = *word + 1;
              /* Record pool exhausted: start a new one */
434              if (!idx_hr_PoolLeft)
435            {
436              if (!(idx_hr_pool = Xmalloc (HASH_POOL_SIZE *
437                               sizeof (idx_hash_rec))))
438                FatalError (1, "Unable to allocate memory for pool");
439              idx_hr_PoolLeft = HASH_POOL_SIZE;
440              ChangeMem (HASH_POOL_SIZE * sizeof (idx_hash_rec));
441            }
442              idx_ent = idx_hr_pool++;
443              idx_hr_PoolLeft--;
              /* Word does not fit in what is left of the string pool: start
                 a new pool (the remainder of the old one is left unused) */
444              if (len > IdxPoolLeft)
445            {
446              if (!(IdxPool = Xmalloc (POOL_SIZE)))
447                FatalError (1, "Unable to allocate memory for pool");
448              IdxPoolLeft = POOL_SIZE;
449              ChangeMem (POOL_SIZE);
450            }
451              wordnum++;
452
453              idx_ent->word = IdxPool;
454              idx_ent->PL = MakePosList (0);
455              {
456            PosEntry PE;
457            PE.num_cases = 1;
458            PE.blk = block;
459            PE.blk_index = (unsigned short) blk_index;
460            PE.offset = res % sd->sdh.lookback;
461            AddPosEntry (&(idx_ent->PL), &PE);
462              }
463              prevIdx = idx_ent;
464              prevPE = &(idx_ent->PL->PE[idx_ent->PL->num_entries - 1]);
465
              /* Copy the length-prefixed word into the pool */
466              bcopy ((char *) word, (char *) IdxPool, len);
467              IdxPool += len;
468              IdxPoolLeft -= len;
              /* Double the first-occurrence array just before it fills up */
469              if (IdxHashUsed == idx_max_first_occr - 1)
470            {
471              ChangeMem (-sizeof (idx_hash_rec *) * idx_max_first_occr);
472              idx_max_first_occr *= 2;
473              if (!(idx_first_occr = Xrealloc (idx_first_occr, sizeof (idx_hash_rec *) *
474                               idx_max_first_occr)))
475                FatalError (1, "Unable to allocate memory for idx_first_occr");
476              ChangeMem (sizeof (idx_hash_rec *) * idx_max_first_occr);
477            }
478              idx_first_occr[IdxHashUsed] = idx_ent;
479              IdxHashUsed++;
480              IdxHashTable[hashval] = idx_ent;
481              break;
482            }
483         
484          /* Compare the words */
          /* Byte-wise comparison including the length byte; len reaches 0
             only when every byte matched */
485          s1 = word;
486          s2 = idx_ent->word;
487          len = *s1 + 1;
488          for (; len; len--)
489            if (*s1++ != *s2++)
490              break;
491         
492          if (len)
493            {
494              /* Entry is not the right one - move to next hash index */
495              hashval = (hashval + step);
496              if (hashval >= hsize)
497            hashval -= hsize;
498            }
499          else
500            {
501              /* Entry is correct - add a PosEntry to word */
502              PosEntry PE;
503              PE.num_cases = 1;
504              PE.blk = block;
505              PE.blk_index = (unsigned short) blk_index;
506              PE.offset = res % sd->sdh.lookback;
507              AddPosEntry (&(idx_ent->PL), &PE);
508              prevIdx = idx_ent;
509              prevPE = &(idx_ent->PL->PE[idx_ent->PL->num_entries - 1]);
510              break;
511            }
512        }     
513        }
514
      /* Table at least half full: re-insert every record into a table of
         prime (IdxHashSize * 2) slots and discard the old table */
515      if (IdxHashUsed >= IdxHashSize >> 1)
516        {
517          idx_hash_rec **ht;
518          unsigned long size;
519          unsigned long i;
520          size = prime (IdxHashSize * 2);
521          if (!(ht = Xmalloc (sizeof (idx_hash_rec *) * size)))
522        FatalError (1, "Unable to allocate memory for idx table");
523          bzero ((char *) ht, sizeof (idx_hash_rec *) * size);
524          ChangeMem (sizeof (idx_hash_rec *) * size);
525
526          for (i = 0; i < IdxHashSize; i++)
527        if (IdxHashTable[i])
528          {
529            register u_char *wptr;
530            idx_hash_rec *ent;
531            register unsigned long hashval, step;
532           
533            wptr = IdxHashTable[i]->word;
534            HASH (hashval, step, wptr, size);
            /* Probe until a free slot is found */
535            ent = ht[hashval];
536            while (ent)
537              {
538            hashval += step;
539            if (hashval >= size)
540              hashval -= size;
541            ent = ht[hashval];
542              }
543            ht[hashval] = IdxHashTable[i];
544          }
545          Xfree (IdxHashTable);
546          ChangeMem (-sizeof (idx_hash_rec *) * IdxHashSize);
547          IdxHashTable = ht;
548          IdxHashSize = size;
549        }
550     
551    } /* end for each word */
552
553    } /* end for each block */
554
555}
556 
557static int
558idx_comp (const void *A, const void *B)
559{
560  u_char *s1 = (*((idx_hash_rec **) A))->word;
561  u_char *s2 = (*((idx_hash_rec **) B))->word;
562  return (casecompare (s1, s2));
563}
564
565
/* save_idx
 *
 * Writes the blocked stem index file for `filename` (suffix chosen by
 * `stem_method`, 1 to 3) from the contents of the global idx hash table.
 *
 * Steps, as performed below:
 *   1. open a temporary file for the blocks and create the output file;
 *   2. pack the hash table into a dense array and sort it with idx_comp;
 *   3. write a placeholder header to the output file;
 *   4. front-code the sorted words, each followed by its PosEntry list,
 *      into blocks of at most block_size bytes that are appended to the
 *      temporary file, while the first word of every block and the block's
 *      position are written to the output file as the main index;
 *   5. copy the blocks from the temporary file after the main index;
 *   6. seek back and write the final header, then report statistics.
 */
566void
567save_idx (char * filename, int stem_method)
568{
569  char *FName;
570  unsigned long i, j, pos, First_word, num;
571  struct stem_idx_header sih;
572  u_char *buffer, *last_word = NULL;
573  unsigned short *pointers;
574  int buf_in_use;
575  unsigned short ptrs_in_use, word_num;
576  FILE *idbi = NULL, *tmp = NULL;
577
578  FName = make_name (filename, ".tmp", NULL);
579  if (!(tmp = fopen (FName, "w+b")))
580    FatalError (1, "Unable to open \"%s\".\n", FName);
581
582  /* Delete the file now */
  /* The file is still used through `tmp` below.
     NOTE(review): presumably relies on unlink-while-open semantics -
     confirm for non-POSIX targets. */
583  unlink (FName);
584
585  /* Create appropriate stem index file */
  /* Any other stem_method leaves idbi NULL and is caught just below */
586  switch (stem_method)
587    {
588    case (1):
589      {
590    idbi = create_file (filename, INVF_DICT_BLOCKED_1_SUFFIX, "wb", MAGIC_STEM_1,
591                MG_ABORT);
592    break;
593      }
594
595    case (2):
596      {
597    idbi = create_file (filename, INVF_DICT_BLOCKED_2_SUFFIX, "wb", MAGIC_STEM_2,
598                MG_ABORT);
599    break;
600      }
601    case (3):
602      {
603    idbi = create_file (filename, INVF_DICT_BLOCKED_3_SUFFIX, "wb", MAGIC_STEM_3,
604                MG_ABORT);
605    break;
606      }
607    }
608
609  if (!idbi)
610    FatalError (1, "Could NOT create .invf.blocked.%d file", stem_method);
611
  /* After packing, IdxHashTable is a dense array of IdxHashUsed records */
612  PackIdxHashTable();
613  qsort (IdxHashTable, IdxHashUsed, sizeof (idx_hash_rec *), idx_comp);
614
  /* Placeholder header; the real values are written at the end */
615  sih.lookback = lookback;
616  sih.block_size = block_size;
617  sih.num_blocks = 0;
618  sih.blocks_start = 0;
619  sih.index_chars = 0;
620  sih.num_of_words = IdxHashUsed;
621
622  fwrite ((char *) &sih, sizeof (sih), 1, idbi);
623
  /* buffer collects the word data of the current block, pointers its
     index entries; neither is freed before the function returns */
624  if (!(buffer = Xmalloc (block_size + 512)))
625    FatalError (1, "Unable to allocate memory for \"buffer\"\n");
626  if (!(pointers = Xmalloc (block_size + 512)))
627    FatalError (1, "Unable to allocate memory for \"buffer\"\n");
628
629  buf_in_use = 0;
630  pos = 0;
631  word_num = 0;
632  ptrs_in_use = 0;
633  First_word = 0;
634
635  /* For each word in the hashtable... */
636  for (i = 0; i < IdxHashUsed; i++)
637    {
638      register unsigned long extra, copy, suff;
639      register struct idx_hash_rec *ent = IdxHashTable[i];
640
641      /* build a new word on top of prev */
642      if (last_word != NULL)
643    copy = prefixlen (last_word, ent->word);
644      else
645    copy = 0;
646      suff = *(ent->word) - copy;
647      last_word = ent->word;
648
649      if (word_num % sih.lookback == 0)
650    /* Will need copy chars to add + a pointer in index */
651    extra = copy + sizeof (*pointers);
652      else
653    extra = 0;
      /* Would this word, with its PosEntry list, overflow the current block? */
654      if ((ptrs_in_use + 1) * sizeof (*pointers) + sizeof (ptrs_in_use) + extra +
655      buf_in_use + sizeof (First_word) + suff + 1 + sizeof (ent->PL->num_entries) +
656      ent->PL->num_entries * sizeof (PosEntry) > block_size)
657    {
658      /* Dump buffer to tmp file */
659      int chunk;
660      HTONUL(First_word);  /* [RPAP - Jan 97: Endian Ordering] */
661      HTONUS(word_num);  /* [RPAP - Jan 97: Endian Ordering] */
662      fwrite (&First_word, sizeof (First_word), 1, tmp);
663      fwrite (&word_num, sizeof (word_num), 1, tmp);
664      fwrite (pointers, sizeof (*pointers), ptrs_in_use, tmp);
665      fwrite (buffer, sizeof (u_char), buf_in_use, tmp);
      /* Zeroed buffer doubles as padding when -F is in effect */
666      bzero ((char *) buffer, block_size);
667      chunk = buf_in_use + ptrs_in_use * sizeof (*pointers) +
668        sizeof (ptrs_in_use) + sizeof (First_word);
669      if (force && chunk < block_size)
670        {
671          fwrite (buffer, sizeof (u_char), block_size - chunk, tmp);
672          chunk = block_size;
673        }
674
      /* pos is the offset of the next block within the block area */
675      pos += chunk;
676
677      buf_in_use = 0;
678      word_num = 0;
679      ptrs_in_use = 0;
680      sih.num_blocks++;
681
682      /* Check that entry will fit into new block */
      /* NOTE(review): `extra` was computed from word_num before the flush;
         the word now starts a block and is stored in full, so `extra` may
         not include `copy` here - confirm this check cannot under-count. */
683      if (sizeof (*pointers) + sizeof (ptrs_in_use) + extra + sizeof (First_word) +
684          suff + 1 + sizeof (ent->PL->num_entries) +
685          ent->PL->num_entries * sizeof (PosEntry) > block_size)
686        FatalError (1, "Block size to small");
687
688    }
689
      /* Every lookback-th word gets an index pointer to its offset in the
         buffer and is stored in full rather than front-coded */
690      if (word_num % sih.lookback == 0)
691    {
692      HTONUS2(buf_in_use, pointers[ptrs_in_use++]);  /* [RPAP - Jan 97: Endian Ordering] */
693      suff += copy;
694      copy = 0;
695    }
696
697      /* Output Word information */
698      buffer[buf_in_use++] = copy;
699      buffer[buf_in_use++] = suff;
700      bcopy ((char *) (ent->word + copy + 1), (char *) (buffer + buf_in_use), suff);
701      buf_in_use += suff;
      /* num_entries is swapped for the copy and swapped back straight after */
702      HTONUI(ent->PL->num_entries);  /* [RPAP - Jan 97: Endian Ordering] */
703      bcopy ((char *) &(ent->PL->num_entries), (char *) (buffer + buf_in_use), sizeof (ent->PL->num_entries));
704      NTOHUI(ent->PL->num_entries);  /* [RPAP - Jan 97: Endian Ordering] */
705      buf_in_use += sizeof (ent->PL->num_entries);
706
      /* The PosEntry fields are converted in place and not restored */
707      for (j = 0; j < ent->PL->num_entries; j++)
708    {
709      register PosEntry *pe = &(ent->PL->PE[j]);
710      HTONUI(pe->num_cases);  /* [RPAP - Jan 97: Endian Ordering] */
711      bcopy ((char *) &(pe->num_cases), (char *) (buffer + buf_in_use), sizeof (pe->num_cases));
712      buf_in_use += sizeof (pe->num_cases);
713      HTONUI(pe->blk);  /* [RPAP - Jan 97: Endian Ordering] */
714      bcopy ((char *) &(pe->blk), (char *) (buffer + buf_in_use), sizeof (pe->blk));
715      buf_in_use += sizeof (pe->blk);
716      HTONUS(pe->blk_index);  /* [RPAP - Jan 97: Endian Ordering] */
717      bcopy ((char *) &(pe->blk_index), (char *) (buffer + buf_in_use), sizeof (pe->blk_index));
718      buf_in_use += sizeof (pe->blk_index);
719      HTONUS(pe->offset);  /* [RPAP - Jan 97: Endian Ordering] */
720      bcopy ((char *) &(pe->offset), (char *) (buffer + buf_in_use), sizeof (pe->offset));
721      buf_in_use += sizeof (pe->offset);
722    }
723
      /* Sanity check on the block just extended */
724      if (buf_in_use + ptrs_in_use * sizeof (*pointers) +
725      sizeof (ptrs_in_use) > block_size)
726    FatalError (1, "Fatal Internal Error # 64209258\n");
727
      /* First word of a block: add it, with the block position, to the
         main index in the output file */
728      if (word_num == 0)
729    {
730      /* Write word to main index */
731      fwrite (ent->word, sizeof (u_char), *(ent->word) + 1, idbi);
732      HTONUL(pos);  /* [RPAP - Jan 97: Endian Ordering] */
733      fwrite (&pos, sizeof (pos), 1, idbi);
734      NTOHUL(pos);  /* [RPAP - Jan 97: Endian Ordering] */
735      sih.index_chars += *(ent->word) + 1;
736      First_word = i;
737    }
738      word_num++;
739    } /* end for each word */
740
741  if (buf_in_use)
742    {
743      /* Write last buffer to tmp file */
744      int chunk;
745
746      /* [RPAP - Jan 97: Endian Ordering] */
747      HTONUL(First_word);
748      HTONUS(word_num);
749
750      fwrite (&First_word, sizeof (First_word), 1, tmp);
751      fwrite (&word_num, sizeof (word_num), 1, tmp);
752      fwrite (pointers, sizeof (*pointers), ptrs_in_use, tmp);
753      fwrite (buffer, sizeof (u_char), buf_in_use, tmp);
754      bzero ((char *) buffer, block_size);
755      chunk = buf_in_use + ptrs_in_use * sizeof (*pointers) +
756    sizeof (ptrs_in_use) + sizeof (First_word);
757      if (force && chunk < block_size)
758    {
759      fwrite (buffer, sizeof (u_char), block_size - chunk, tmp);
760      chunk = block_size;
761    }
762
763      sih.num_blocks++;
764    }
765
766  rewind (tmp);
  /* Blocks follow the magic number, the header and the main index (the
     index words plus one position per block) */
767  sih.blocks_start = sih.index_chars + sizeof (u_long) + sizeof (sih) +
768    sih.num_blocks * sizeof (pos);
  /* With -F, pad so that the block area starts on a block_size boundary */
769  if (force)
770    {
771      int amount;
772      amount = sih.blocks_start % block_size;
773      if (amount != 0)
774    {
775      bzero ((char *) buffer, block_size);
776      fwrite (buffer, sizeof (u_char), block_size - amount, idbi);
777      sih.blocks_start += block_size - amount;
778    }
779    }
780
  /* Append the staged blocks to the output file */
781  while ((num = fread (buffer, sizeof (u_char), block_size, tmp)) != 0)
782    fwrite (buffer, sizeof (u_char), num, idbi);
783  fclose (tmp);
784
785  /* skip over the magic number */
786  fseek (idbi, sizeof (u_long), 0);
787
788  /* [RPAP - Jan 97: Endian Ordering] */
789  HTONUL(sih.lookback);
790  HTONUL(sih.block_size);
791  HTONUL(sih.num_blocks);
792  HTONUL(sih.blocks_start);
793  HTONUL(sih.index_chars);
794  HTONUL(sih.num_of_words);
795
  /* Overwrite the placeholder header with the final values */
796  fwrite (&sih, sizeof (sih), 1, idbi);
797  fclose (idbi);
798
799#ifndef SILENT
800  Message ("Stem %d:\n", stem_method);
801  Message ("     Block size   : %10d\n", block_size);
802  Message ("     Num_blocks   : %10d\n", NTOHUL(sih.num_blocks));  /* [RPAP - Jan 97: Endian Ordering] */
803  Message ("     Max mem used : %10.1f Mb\n", (double) MaxMemInUse / 1024.0 / 1024.0);
804  Message ("     Num_of_words : %10d\n", NTOHUL(sih.num_of_words));  /* [RPAP - Jan 97: Endian Ordering] */
805#endif
806}
807
808
809void
810UpdateStemDict (char * filename, int stem_method)
811{
812  FILE *idb;
813  struct stem_dict_header sdh;
814
815  if (!(idb = open_file (filename, INVF_DICT_BLOCKED_SUFFIX, "r+b",
816             MAGIC_STEM, MG_CONTINUE)))
817    FatalError (1, "Could not update stemmed dict");
818
819  fread ((char *) &sdh, sizeof (sdh), 1, idb);
820  NTOHUL(sdh.indexed);  /* [RPAP - Jan 97: Endian Ordering] */
821  sdh.indexed |= 1 << (stem_method - 1);
822  HTONUL(sdh.indexed);
823  fseek (idb, sizeof (u_long), 0);
824  fwrite ((char *) &sdh, sizeof (sdh), 1, idb);
825  fclose (idb);
826}
827
828
829
830/* Main */
831int main (int argc, char **argv)
832{
833  File *idb;    /* File to .invf.dict.blocked */
834  char *filename = "";
835  stemmed_dict *sd;   /* Stemmed dictionary */
836  int ch;
837  char path[512];
838  int stem_method = 0;
839
840  msg_prefix = argv[0];
841  opterr = 0;
842  while ((ch = getopt (argc, argv, "f:d:b:hFs:")) != -1)
843    switch (ch)
844      {
845      case 'f':     /* input file */
846    filename = optarg;
847    break;
848      case 'd':
849    set_basepath (optarg);
850    break;
851      case 'b':
852    block_size = atoi (optarg);
853    break;
854      case 'F':
855    force = 1;
856    break;
857      case 's':
858    stem_method = atoi (optarg);
859    break;
860      case 'h':
861      case '?':
862    fprintf (stderr, "usage: %s [-d directory] "
863         "[-b num] [-F] [-h] -s 1|2|3 -f name\n", argv[0]);
864    exit (1);
865      }
866
867  if (stem_method < 1 || stem_method > 3)
868    FatalError (1, "Stem method must be 1, 2 or 3");
869
870  /* Open required stem dict file */
871  sprintf (path, FILE_NAME_FORMAT, get_basepath (), filename, INVF_DICT_BLOCKED_SUFFIX);
872  if (!(idb = Fopen (path, "rb", MAGIC_STEM)))
873    FatalError (1, "Unable to open \"%s\"", path);
874
875  /* Read in idb header and index to blocks */
876  if (!(sd = ReadStemDictBlk (idb)))
877    FatalError (1, "Could not read stemmed dictionary");
878
879  /* Process stemmed dictionary */
880  mg_init_process ();
881  process_stem_dict (sd, stem_method);
882  save_idx (filename, stem_method);
883 
884  /* Close stemmed dict */
885  Fclose (idb);
886
887  /* Update stemmed dict */
888  UpdateStemDict (filename, stem_method);
889
890  return 0;
891}
892
Note: See TracBrowser for help on using the browser.