source: trunk/gsdl/packages/mg/src/text/stem_search.c@ 1014

Last change on this file since 1014 was 439, checked in by sjboddie, 25 years ago

renamed mg-1.3d directory mg

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 20.8 KB
Line 
1/**************************************************************************
2 *
3 * stem_search.c -- Functions for searching the blocked stemmed dictionary
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: stem_search.c 439 1999-08-10 21:23:37Z sjboddie $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25
26#include "memlib.h"
27#include "messages.h"
28#include "filestats.h"
29#include "timing.h"
30#include "local_strings.h"
31#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
32
33#include "mg.h"
34#include "invf.h"
35#include "text.h"
36#include "lists.h"
37#include "backend.h"
38#include "words.h"
39#include "locallib.h"
40#include "stem_search.h"
41#include "mg_errors.h"
42#include "term_lists.h"
43#include "stemmer.h"
44
45
46/*
47 $Log$
48 Revision 1.1 1999/08/10 21:18:22 sjboddie
49 renamed mg-1.3d directory mg
50
51 Revision 1.3 1999/07/02 00:18:55 rjmcnab
52 Changed so FindWords could be used in new ways.
53
54 Revision 1.2 1998/11/25 07:55:51 rjmcnab
55
56 Modified mg to that you can specify the stemmer you want
57 to use via a command line option. You specify it to
58 mg_passes during the build process. The number of the
59 stemmer that you used is stored within the inverted
60 dictionary header and the stemmed dictionary header so
61 the correct stemmer is used in later stages of building
62 and querying.
63
64 Revision 1.1 1998/11/17 09:35:39 rjmcnab
65 *** empty log message ***
66
67 * Revision 1.3 1994/10/20 03:57:04 tes
68 * I have rewritten the boolean query optimiser and abstracted out the
69 * components of the boolean query.
70 *
71 * Revision 1.2 1994/09/20 04:42:08 tes
72 * For version 1.1
73 *
74 */
75
76static char *RCSID = "$Id: stem_search.c 439 1999-08-10 21:23:37Z sjboddie $";
77
78
79stemmed_dict *
80ReadStemDictBlk (File * stem_file)
81{
82 unsigned long i;
83 stemmed_dict *sd;
84 u_char *buffer;
85
86 if (!(sd = Xmalloc (sizeof (stemmed_dict))))
87 {
88 mg_errno = MG_NOMEM;
89 return (NULL);
90 }
91
92 sd->stem_file = stem_file;
93 sd->MemForStemDict = 0;
94
95 Fread (&sd->sdh, sizeof (sd->sdh), 1, stem_file);
96 /* [RPAP - Jan 97: Endian Ordering] */
97 NTOHUL(sd->sdh.lookback);
98 NTOHUL(sd->sdh.block_size);
99 NTOHUL(sd->sdh.num_blocks);
100 NTOHUL(sd->sdh.blocks_start);
101 NTOHUL(sd->sdh.index_chars);
102 NTOHUL(sd->sdh.num_of_docs);
103 NTOHUL(sd->sdh.static_num_of_docs);
104 NTOHUL(sd->sdh.num_of_words);
105 NTOHUL(sd->sdh.stem_method);
106 NTOHUL(sd->sdh.indexed);
107
108 if (!(buffer = Xmalloc (sd->sdh.index_chars)))
109 {
110 Xfree (sd);
111 mg_errno = MG_NOMEM;
112 return (NULL);
113 };
114 sd->MemForStemDict += sd->sdh.index_chars;
115
116 if (!(sd->index = Xmalloc (sd->sdh.num_blocks * sizeof (*sd->index))))
117 {
118 Xfree (sd);
119 Xfree (buffer);
120 mg_errno = MG_NOMEM;
121 return (NULL);
122 };
123 sd->MemForStemDict += sd->sdh.num_blocks * sizeof (*sd->index);
124
125 if (!(sd->pos = Xmalloc (sd->sdh.num_blocks * sizeof (*sd->pos))))
126 {
127 Xfree (sd);
128 Xfree (buffer);
129 Xfree (sd->index);
130 mg_errno = MG_NOMEM;
131 return (NULL);
132 };
133 sd->MemForStemDict += sd->sdh.num_blocks * sizeof (*sd->pos);
134
135 if (!(sd->buffer = Xmalloc (sd->sdh.block_size * sizeof (*sd->buffer))))
136 {
137 Xfree (sd);
138 Xfree (buffer);
139 Xfree (sd->index);
140 Xfree (sd->buffer);
141 mg_errno = MG_NOMEM;
142 return (NULL);
143 };
144 sd->MemForStemDict += sd->sdh.block_size * sizeof (*sd->buffer);
145
146 sd->active = -1;
147
148 for (i = 0; i < sd->sdh.num_blocks; i++)
149 {
150 register u_char len;
151 sd->index[i] = buffer;
152 len = Getc (stem_file);
153 *buffer++ = len;
154 Fread (buffer, sizeof (u_char), len, stem_file);
155 buffer += len;
156 Fread (&sd->pos[i], sizeof (*sd->pos), 1, stem_file);
157 NTOHUL(sd->pos[i]); /* [RPAP - Jan 97: Endian Ordering] */
158 }
159
160 mg_errno = MG_NOERROR;
161
162 /* fprintf (stderr, "mem for stem dict = %i\n", sd->MemForStemDict); */
163
164 return sd;
165}
166
167
168/* [RPAP - Jan 97: Stem Index Change] */
169stemmed_idx *
170ReadStemIdxBlk (File * stem_idx_file)
171{
172 unsigned long i;
173 stemmed_idx *si;
174 u_char *buffer;
175
176 if (!(si = Xmalloc (sizeof (stemmed_idx))))
177 {
178 mg_errno = MG_NOMEM;
179 return (NULL);
180 }
181
182 si->stem_idx_file = stem_idx_file;
183 si->MemForStemIdx = 0;
184
185 Fread (&si->sih, sizeof (si->sih), 1, stem_idx_file);
186 /* [RPAP - Jan 97: Endian Ordering] */
187 NTOHUL(si->sih.lookback);
188 NTOHUL(si->sih.block_size);
189 NTOHUL(si->sih.num_blocks);
190 NTOHUL(si->sih.blocks_start);
191 NTOHUL(si->sih.index_chars);
192 NTOHUL(si->sih.num_of_words);
193
194 if (!(buffer = Xmalloc (si->sih.index_chars)))
195 {
196 Xfree (si);
197 mg_errno = MG_NOMEM;
198 return (NULL);
199 };
200 si->MemForStemIdx += si->sih.index_chars;
201
202 if (!(si->index = Xmalloc (si->sih.num_blocks * sizeof (*si->index))))
203 {
204 Xfree (si);
205 Xfree (buffer);
206 mg_errno = MG_NOMEM;
207 return (NULL);
208 };
209 si->MemForStemIdx += si->sih.num_blocks * sizeof (*si->index);
210
211 if (!(si->pos = Xmalloc (si->sih.num_blocks * sizeof (*si->pos))))
212 {
213 Xfree (si->index);
214 Xfree (si);
215 Xfree (buffer);
216 mg_errno = MG_NOMEM;
217 return (NULL);
218 };
219 si->MemForStemIdx += si->sih.num_blocks * sizeof (*si->pos);
220
221 if (!(si->buffer = Xmalloc (si->sih.block_size * sizeof (*si->buffer))))
222 {
223 Xfree (buffer);
224 Xfree (si->index);
225 Xfree (si->buffer);
226 Xfree (si);
227 mg_errno = MG_NOMEM;
228 return (NULL);
229 };
230 si->MemForStemIdx += si->sih.block_size * sizeof (*si->buffer);
231
232 si->active = -1;
233
234 for (i = 0; i < si->sih.num_blocks; i++)
235 {
236 register u_char len;
237 si->index[i] = buffer;
238 len = Getc (stem_idx_file);
239 *buffer++ = len;
240 Fread (buffer, sizeof (u_char), len, stem_idx_file);
241 buffer += len;
242 Fread (&si->pos[i], sizeof (*si->pos), 1, stem_idx_file);
243 NTOHUL(si->pos[i]); /* [RPAP - Jan 97: Endian Ordering] */
244 }
245 mg_errno = MG_NOERROR;
246
247 /* fprintf (stderr, "mem for stem idx = %i\n", si->MemForStemIdx); */
248
249 return si;
250}
251
252
253/* [RPAP - Jan 97: Stem Index Change] */
254/* word should be appropriately stemed */
255static int
256GetIdxBlock (stemmed_idx * si, u_char * word)
257{
258 register int lo = 0, hi = si->sih.num_blocks - 1;
259 register int mid = 0, c = 0;
260
261 while (lo <= hi)
262 {
263 mid = (lo + hi) / 2;
264 c = casecompare (word, si->index[mid]);
265 if (c < 0)
266 hi = mid - 1;
267 else if (c > 0)
268 lo = mid + 1;
269 else
270 return mid;
271 }
272 return hi < 0 ? 0 : (c < 0 ? mid - 1 : mid);
273}
274
275
276static int
277GetBlock (stemmed_dict * sd, u_char * Word)
278{
279 register int lo = 0, hi = sd->sdh.num_blocks - 1;
280 register int mid = 0, c = 0;
281 while (lo <= hi)
282 {
283 mid = (lo + hi) / 2;
284 c = casecompare (Word, sd->index[mid]); /* [RPAP - Jan 97: Stem Index Change] */
285 if (c < 0)
286 hi = mid - 1;
287 else if (c > 0)
288 lo = mid + 1;
289 else
290 return mid;
291 }
292 return hi < 0 ? 0 : (c < 0 ? mid - 1 : mid);
293}
294
295
296/*
297 * This function looks up a word in the stemmed dictionary, it returns -1
298 * if the word cound not be found, and 0 if it successfully finds the word.
299 * If count is non-null the ulong it is pointing to is set to the number of
300 * occurances of the stemmed word in the collection. i.e wcnt.
301 * If doc_count is non-null the ulong it is pointing to is set to the number
302 * of documents that the word occurs in. i.e fcnt
303 * If invf_ptr is non-null the ulong it is pointing to is set to the position
304 * of the inverted file where the entry for this word start.
305 */
306int
307FindWord (stemmed_dict * sd, u_char * Word, unsigned long *count,
308 unsigned long *doc_count, unsigned long *invf_ptr,
309 unsigned long *invf_len)
310{
311 register int lo, hi, mid, c;
312 register unsigned int res;
313 int block, num_indexes;
314 unsigned long *first_word, *last_invf_len;
315 unsigned short *num_words;
316 u_char *base;
317 unsigned short *index;
318 u_char prev[MAXSTEMLEN + 1];
319
320 block = GetBlock (sd, Word);
321 /* [RPAP - Jan 97: Endian Ordering] */
322 if (sd->active != sd->pos[block])
323 {
324 int i;
325
326 Fseek (sd->stem_file, sd->pos[block] + sd->sdh.blocks_start, 0);
327 Fread (sd->buffer, sd->sdh.block_size, sizeof (u_char), sd->stem_file);
328 sd->active = sd->pos[block];
329
330 /* [RPAP - Jan 97: Endian Ordering] */
331 first_word = (unsigned long *) (sd->buffer);
332 NTOHUL(*first_word);
333 last_invf_len = (unsigned long *) (first_word + 1);
334 NTOHUL(*last_invf_len);
335 num_words = (unsigned short *) (last_invf_len + 1);
336 NTOHUS(*num_words);
337 index = num_words + 1;
338 num_indexes = ((*num_words - 1) / sd->sdh.lookback) + 1;
339
340 for (i = 0; i < num_indexes; i++)
341 NTOHUS(index[i]);
342 }
343 else
344 {
345 first_word = (unsigned long *) (sd->buffer);
346 last_invf_len = (unsigned long *) (first_word + 1);
347 num_words = (unsigned short *) (last_invf_len + 1);
348 index = num_words + 1;
349 num_indexes = ((*num_words - 1) / sd->sdh.lookback) + 1;
350 }
351 base = (u_char *) (index + num_indexes);
352
353 lo = 0;
354 hi = num_indexes - 1;
355 while (lo <= hi)
356 {
357 mid = (lo + hi) / 2;
358 c = casecompare (Word, base + index[mid] + 1); /* [RPAP - Jan 97: Stem Index Change] */
359 if (c < 0)
360 hi = mid - 1;
361 else if (c > 0)
362 lo = mid + 1;
363 else
364 {
365 hi = mid;
366 break;
367 }
368 }
369 if (hi < 0)
370 hi = 0;
371
372 res = hi * sd->sdh.lookback;
373 base += index[hi];
374
375 for (;;)
376 {
377 unsigned copy, suff;
378 unsigned long invfp;
379 if (res >= *num_words)
380 return (-1);
381 copy = *base++;
382 suff = *base++;
383 bcopy ((char *) base, (char *) (prev + copy + 1), suff);
384 base += suff;
385 *prev = copy + suff;
386
387 c = casecompare (Word, prev); /* [RPAP - Jan 97: Stem Index Change] */
388 if (c < 0)
389 return (-1);
390
391 if (c == 0 && doc_count)
392 {
393 bcopy ((char *) base, (char *) doc_count, sizeof (*doc_count));
394 NTOHUL(*doc_count); /* [RPAP - Jan 97: Endian Ordering] */
395 }
396 base += sizeof (*doc_count);
397
398 if (c == 0 && count)
399 {
400 bcopy ((char *) base, (char *) count, sizeof (*count));
401 NTOHUL(*count); /* [RPAP - Jan 97: Endian Ordering] */
402 }
403 base += sizeof (*count);
404
405 if (c == 0 && invf_ptr)
406 {
407 bcopy ((char *) base, (char *) &invfp, sizeof (invf_ptr));
408 NTOHUL(invfp); /* [RPAP - Jan 97: Endian Ordering] */
409 *invf_ptr = invfp;
410 }
411 base += sizeof (*invf_ptr);
412
413 if (c == 0)
414 {
415 /* Calculate invf_len is necessary */
416 unsigned long next_invfp;
417 if (!invf_len)
418 return (*first_word + res);
419
420 /* If the current word is the last word of the block the get the
421 length from last_invf_len */
422 if (res == *num_words - 1)
423 {
424 *invf_len = *last_invf_len;
425 return (*first_word + res);
426 }
427
428 /* Skip over most of the next word to get to the invf_ptr */
429 base++;
430 suff = *base++;
431 base += suff + sizeof (unsigned long) * 2;
432 bcopy ((char *) base, (char *) &next_invfp, sizeof (next_invfp));
433 NTOHUL(next_invfp); /* [RPAP - Jan 97: Endian Ordering] */
434 *invf_len = next_invfp - invfp;
435 return (*first_word + res);
436 }
437 res++;
438 }
439}
440
441
442/* [RPAP - Jan 97: Stem Index Change] */
443int
444FindWords (stemmed_dict * sd, u_char * sWord, int stem_method, TermList ** tl)
445{
446 register unsigned int res;
447 unsigned int idx_res;
448 unsigned copy, suff;
449 int j, k;
450
451 int block, num_indexes;
452 unsigned long *first_word, *last_invf_len;
453 unsigned short *num_words;
454 u_char *base;
455 unsigned short *index;
456 u_char prev[MAXSTEMLEN + 1];
457
458 int idx_block, idx_num_indexes;
459 unsigned long *idx_first_word;
460 unsigned short *idx_num_words;
461 u_char *idx_base;
462 unsigned short *idx_index;
463 u_char idx_prev[MAXSTEMLEN + 1];
464
465 unsigned int num_entries, num_cases;
466 unsigned short blk_index, offset;
467 stemmed_idx * si = NULL;
468
469 /* handle stem_method 0 seperately */
470 if (stem_method == 0) {
471 TermEntry te;
472 if ((te.WE.word_num = FindWord (sd, sWord, &te.WE.count, &te.WE.doc_count,
473 &te.WE.invf_ptr, &te.WE.invf_len)) != -1) {
474 te.WE.max_doc_count = te.WE.doc_count;
475
476 te.Count = 1;
477 te.Word = copy_string (sWord);
478 if (!te.Word)
479 FatalError (1, "Could NOT create memory to add term");
480 te.Stem = copy_string (sWord);
481 if (!te.Stem)
482 FatalError (1, "Could NOT create memory to add term");
483 /* te.query_mask = NULL;*/
484
485 AddTermEntry (tl, &te);
486 return (*tl)->num;
487
488 } else {
489 /* didn't match */
490 return 0;
491 }
492 }
493
494 if (stem_method == 1)
495 si = sd->stem1;
496 else if (stem_method == 2)
497 si = sd->stem2;
498 else
499 si = sd->stem3;
500
501 /* Locate block */
502 idx_block = GetIdxBlock (si, sWord);
503
504 /* [RPAP - Jan 97: Endian Ordering] */
505 if (si->active != si->pos[idx_block])
506 {
507 Fseek (si->stem_idx_file, si->pos[idx_block] + si->sih.blocks_start, 0);
508 Fread (si->buffer, si->sih.block_size, sizeof (u_char), si->stem_idx_file);
509 si->active = si->pos[idx_block];
510
511 idx_first_word = (unsigned long *) (si->buffer);
512 NTOHUL(*idx_first_word); /* [RPAP - Jan 97: Endian Ordering] */
513 idx_num_words = (unsigned short *) (idx_first_word + 1);
514 NTOHUS(*idx_num_words); /* [RPAP - Jan 97: Endian Ordering] */
515 idx_index = idx_num_words + 1;
516 idx_num_indexes = ((*idx_num_words - 1) / si->sih.lookback) + 1;
517
518 /* [RPAP - Jan 97: Endian Ordering] */
519 for (j = 0; j < idx_num_indexes; j++)
520 NTOHUS(idx_index[j]);
521 }
522 else
523 {
524 idx_first_word = (unsigned long *) (si->buffer);
525 idx_num_words = (unsigned short *) (idx_first_word + 1);
526 idx_index = idx_num_words + 1;
527 idx_num_indexes = ((*idx_num_words - 1) / si->sih.lookback) + 1;
528 }
529 idx_base = (u_char *) (idx_index + idx_num_indexes);
530
531 {
532 /* Locate 3-in-4 block */
533 register int lo, hi, mid, c;
534 lo = 0;
535 hi = idx_num_indexes - 1;
536 while (lo <= hi)
537 {
538 mid = (lo + hi) / 2;
539 c = casecompare (sWord, idx_base + idx_index[mid] + 1);
540 if (c < 0)
541 hi = mid - 1;
542 else if (c > 0)
543 lo = mid + 1;
544 else
545 {
546 hi = mid;
547 break;
548 }
549 }
550 if (hi < 0)
551 hi = 0;
552
553 idx_res = hi * si->sih.lookback;
554 idx_base += idx_index[hi];
555 }
556
557 /* Locate actual word entry */
558 for (;;)
559 {
560 int c;
561 if (idx_res >= *idx_num_words)
562 return (-1);
563 copy = *idx_base++;
564 suff = *idx_base++;
565 bcopy ((char *) idx_base, (char *) (idx_prev + copy + 1), suff);
566 idx_base += suff;
567 *idx_prev = copy + suff;
568
569 c = casecompare (sWord, idx_prev);
570 if (c < 0)
571 return (-1);
572
573 bcopy ((char *) idx_base, (char *) &num_entries, sizeof (num_entries));
574 NTOHUI(num_entries); /* [RPAP - Jan 97: Endian Ordering] */
575 idx_base += sizeof (num_entries);
576
577 if (c > 0)
578 idx_base += num_entries * (sizeof (num_cases) + sizeof (block) +
579 sizeof (blk_index) + sizeof (offset));
580
581 else
582 break;
583
584 idx_res++;
585 }
586
587 for (k = 0; k < num_entries; k++)
588 {
589 unsigned copy, suff;
590 unsigned long invfp;
591 /* Read next stem index pos */
592 bcopy ((char *) idx_base, (char *) &num_cases, sizeof (num_cases));
593 NTOHUI(num_cases); /* [RPAP - Jan 97: Endian Ordering] */
594 idx_base += sizeof (num_cases);
595 bcopy ((char *) idx_base, (char *) &block, sizeof (block));
596 NTOHUI(block); /* [RPAP - Jan 97: Endian Ordering] */
597 idx_base += sizeof (block);
598 bcopy ((char *) idx_base, (char *) &blk_index, sizeof (blk_index));
599 NTOHUS(blk_index); /* [RPAP - Jan 97: Endian Ordering] */
600 idx_base += sizeof (blk_index);
601 bcopy ((char *) idx_base, (char *) &offset, sizeof (offset));
602 NTOHUS(offset); /* [RPAP - Jan 97: Endian Ordering] */
603 idx_base += sizeof (offset);
604
605 /* [RPAP - Jan 97: Endian Ordering] */
606 if (sd->active != sd->pos[block])
607 {
608 Fseek (sd->stem_file, sd->pos[block] + sd->sdh.blocks_start, 0);
609 Fread (sd->buffer, sd->sdh.block_size, sizeof (u_char), sd->stem_file);
610 sd->active = sd->pos[block];
611
612 first_word = (unsigned long *) (sd->buffer);
613 NTOHUL(*first_word); /* [RPAP - Jan 97: Endian Ordering] */
614 last_invf_len = (unsigned long *) (first_word + 1);
615 NTOHUL(*last_invf_len); /* [RPAP - Jan 97: Endian Ordering] */
616 num_words = (unsigned short *) (last_invf_len + 1);
617 NTOHUS(*num_words); /* [RPAP - Jan 97: Endian Ordering] */
618 index = num_words + 1;
619 num_indexes = ((*num_words - 1) / sd->sdh.lookback) + 1;
620
621 /* [RPAP - Jan 97: Endian Ordering] */
622 for (j = 0; j < num_indexes; j++)
623 NTOHUS(index[j]);
624 }
625 else
626 {
627 first_word = (unsigned long *) (sd->buffer);
628 last_invf_len = (unsigned long *) (first_word + 1);
629 num_words = (unsigned short *) (last_invf_len + 1);
630 index = num_words + 1;
631 num_indexes = ((*num_words - 1) / sd->sdh.lookback) + 1;
632 }
633 base = (u_char *) (index + num_indexes);
634
635 res = blk_index * sd->sdh.lookback;
636 base += index[blk_index];
637
638 for (j = 0; j < offset; j++)
639 {
640 copy = *base++;
641 suff = *base++;
642 bcopy ((char *) base, (char *) (prev + copy + 1), suff);
643 base += suff;
644 *prev = copy + suff;
645 base += sizeof (unsigned long); /* skip doc_count */
646 base += sizeof (unsigned long); /* skip count */
647 base += sizeof (unsigned long); /* skip invf_ptr */
648 res++;
649 }
650
651 for (j = 0; j < num_cases; j++)
652 {
653 TermEntry te;
654
655 if (res >= *num_words)
656 return (-1);
657 copy = *base++;
658 suff = *base++;
659 bcopy ((char *) base, (char *) (prev + copy + 1), suff);
660 base += suff;
661 *prev = copy + suff;
662
663 te.Word = copy_string (prev);
664 if (!te.Word)
665 FatalError (1, "Could NOT create memory to add term");
666 te.Stem = copy_string (prev);
667 if (!te.Stem)
668 FatalError (1, "Could NOT create memory to add term");
669 stemmer (2, sd->sdh.stemmer_num, te.Stem);
670
671 te.Count = 1;
672 te.WE.word_num = *first_word + res;
673 bcopy ((char *) base, (char *) &te.WE.doc_count, sizeof (te.WE.doc_count));
674 NTOHUL(te.WE.doc_count); /* [RPAP - Jan 97: Endian Ordering] */
675 te.WE.max_doc_count = te.WE.doc_count;
676 base += sizeof (te.WE.doc_count);
677
678 bcopy ((char *) base, (char *) &te.WE.count, sizeof (te.WE.count));
679 NTOHUL(te.WE.count);
680 base += sizeof (te.WE.count);
681
682 bcopy ((char *) base, (char *) &invfp, sizeof (te.WE.invf_ptr));
683 NTOHUL(invfp); /* [RPAP - Jan 97: Endian Ordering] */
684 te.WE.invf_ptr = invfp;
685 base += sizeof (te.WE.invf_ptr);
686
687 /* If the current word is the last word of the block the get the
688 length from last_invf_len */
689 if (res == *num_words - 1)
690 te.WE.invf_len = *last_invf_len;
691 else
692 {
693 unsigned long next_invfp;
694 u_char *oldbase = base;
695
696 /* Skip over most of the next word to get to the invf_ptr */
697 base++;
698 suff = *base++;
699 base += suff + sizeof (unsigned long) * 2;
700 bcopy ((char *) base, (char *) &next_invfp, sizeof (next_invfp));
701 NTOHUL(next_invfp); /* [RPAP - Jan 97: Endian Ordering] */
702 te.WE.invf_len = next_invfp - invfp;
703 base = oldbase;
704 }
705
706 /* Add term entry to term list */
707 /* te.query_mask = NULL;*/
708 AddTermEntry (tl, &te);
709
710 if (res == *num_words - 1 && j + 1 < num_cases)
711 {
712 int ii;
713 /* Read in next block */
714 block++;
715 Fseek (sd->stem_file, sd->pos[block] + sd->sdh.blocks_start, 0);
716 Fread (sd->buffer, sd->sdh.block_size, sizeof (u_char), sd->stem_file);
717 sd->active = sd->pos[block];
718
719 first_word = (unsigned long *) (sd->buffer);
720 NTOHUL(*first_word); /* [RPAP - Jan 97: Endian Ordering] */
721 last_invf_len = (unsigned long *) (first_word + 1);
722 NTOHUL(*last_invf_len); /* [RPAP - Jan 97: Endian Ordering] */
723 num_words = (unsigned short *) (last_invf_len + 1);
724 NTOHUS(*num_words); /* [RPAP - Jan 97: Endian Ordering] */
725 index = num_words + 1;
726 num_indexes = ((*num_words - 1) / sd->sdh.lookback) + 1;
727
728 /* [RPAP - Jan 97: Endian Ordering] */
729 for (ii = 0; ii < num_indexes; ii++)
730 NTOHUS(index[ii]);
731
732 base = (u_char *) (index + num_indexes);
733 base += index[0];
734 res = 0;
735 blk_index = 0;
736 }
737 else
738 res++;
739 } /* end for num_cases */
740 } /* end for num_entries */
741 return (*tl)->num;
742}
743
744
745void
746FreeStemDict (stemmed_dict * sd)
747{
748 /* [RPAP - Jan 97: Stem Index Change] */
749 if (sd->stem1)
750 FreeStemIdx (sd->stem1);
751 if (sd->stem2)
752 FreeStemIdx (sd->stem2);
753 if (sd->stem3)
754 FreeStemIdx (sd->stem3);
755
756 Xfree (sd->index[0]);
757 Xfree (sd->index);
758 Xfree (sd->buffer);
759 Xfree (sd->pos);
760 Xfree (sd);
761}
762
763/* [RPAP - Jan 97: Stem Index Change] */
764void
765FreeStemIdx (stemmed_idx * si)
766{
767 Xfree (si->index[0]);
768 Xfree (si->index);
769 Xfree (si->buffer);
770 Xfree (si->pos);
771 Xfree (si);
772}
Note: See TracBrowser for help on using the repository browser.