source: main/tags/2.80/indexers/mg/src/text/stem_search.c@ 24541

Last change on this file since 24541 was 3745, checked in by mdewsnip, 21 years ago

Addition of MG package for search and retrieval

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 20.9 KB
Line 
1/**************************************************************************
2 *
3 * stem_search.c -- Functions for searching the blocked stemmed dictionary
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: stem_search.c 3745 2003-02-20 21:20:24Z mdewsnip $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25
26#include "memlib.h"
27#include "messages.h"
28#include "filestats.h"
29#include "timing.h"
30#include "local_strings.h"
31#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
32
33#include "mg.h"
34#include "invf.h"
35#include "text.h"
36#include "lists.h"
37#include "backend.h"
38#include "words.h"
39#include "locallib.h"
40#include "stem_search.h"
41#include "mg_errors.h"
42#include "term_lists.h"
43#include "stemmer.h"
44
45
46/*
47 $Log$
48 Revision 1.1 2003/02/20 21:18:24 mdewsnip
49 Addition of MG package for search and retrieval
50
51 Revision 1.1 1999/08/10 21:18:22 sjboddie
52 renamed mg-1.3d directory mg
53
54 Revision 1.3 1999/07/02 00:18:55 rjmcnab
55 Changed so FindWords could be used in new ways.
56
57 Revision 1.2 1998/11/25 07:55:51 rjmcnab
58
59 Modified mg to that you can specify the stemmer you want
60 to use via a command line option. You specify it to
61 mg_passes during the build process. The number of the
62 stemmer that you used is stored within the inverted
63 dictionary header and the stemmed dictionary header so
64 the correct stemmer is used in later stages of building
65 and querying.
66
67 Revision 1.1 1998/11/17 09:35:39 rjmcnab
68 *** empty log message ***
69
70 * Revision 1.3 1994/10/20 03:57:04 tes
71 * I have rewritten the boolean query optimiser and abstracted out the
72 * components of the boolean query.
73 *
74 * Revision 1.2 1994/09/20 04:42:08 tes
75 * For version 1.1
76 *
77 */
78
79static char *RCSID = "$Id: stem_search.c 3745 2003-02-20 21:20:24Z mdewsnip $";
80
81
82stemmed_dict *
83ReadStemDictBlk (File * stem_file)
84{
85 unsigned long i;
86 stemmed_dict *sd;
87 u_char *buffer;
88
89 if (!(sd = Xmalloc (sizeof (stemmed_dict))))
90 {
91 mg_errno = MG_NOMEM;
92 return (NULL);
93 }
94
95 sd->stem_file = stem_file;
96 sd->MemForStemDict = 0;
97
98 Fread (&sd->sdh, sizeof (sd->sdh), 1, stem_file);
99 /* [RPAP - Jan 97: Endian Ordering] */
100 NTOHUL(sd->sdh.lookback);
101 NTOHUL(sd->sdh.block_size);
102 NTOHUL(sd->sdh.num_blocks);
103 NTOHUL(sd->sdh.blocks_start);
104 NTOHUL(sd->sdh.index_chars);
105 NTOHUL(sd->sdh.num_of_docs);
106 NTOHUL(sd->sdh.static_num_of_docs);
107 NTOHUL(sd->sdh.num_of_words);
108 NTOHUL(sd->sdh.stem_method);
109 NTOHUL(sd->sdh.indexed);
110
111 if (!(buffer = Xmalloc (sd->sdh.index_chars)))
112 {
113 Xfree (sd);
114 mg_errno = MG_NOMEM;
115 return (NULL);
116 };
117 sd->MemForStemDict += sd->sdh.index_chars;
118
119 if (!(sd->index = Xmalloc (sd->sdh.num_blocks * sizeof (*sd->index))))
120 {
121 Xfree (sd);
122 Xfree (buffer);
123 mg_errno = MG_NOMEM;
124 return (NULL);
125 };
126 sd->MemForStemDict += sd->sdh.num_blocks * sizeof (*sd->index);
127
128 if (!(sd->pos = Xmalloc (sd->sdh.num_blocks * sizeof (*sd->pos))))
129 {
130 Xfree (sd);
131 Xfree (buffer);
132 Xfree (sd->index);
133 mg_errno = MG_NOMEM;
134 return (NULL);
135 };
136 sd->MemForStemDict += sd->sdh.num_blocks * sizeof (*sd->pos);
137
138 if (!(sd->buffer = Xmalloc (sd->sdh.block_size * sizeof (*sd->buffer))))
139 {
140 Xfree (sd);
141 Xfree (buffer);
142 Xfree (sd->index);
143 Xfree (sd->buffer);
144 mg_errno = MG_NOMEM;
145 return (NULL);
146 };
147 sd->MemForStemDict += sd->sdh.block_size * sizeof (*sd->buffer);
148
149 sd->active = -1;
150
151 for (i = 0; i < sd->sdh.num_blocks; i++)
152 {
153 register u_char len;
154 sd->index[i] = buffer;
155 len = Getc (stem_file);
156 *buffer++ = len;
157 Fread (buffer, sizeof (u_char), len, stem_file);
158 buffer += len;
159 Fread (&sd->pos[i], sizeof (*sd->pos), 1, stem_file);
160 NTOHUL(sd->pos[i]); /* [RPAP - Jan 97: Endian Ordering] */
161 }
162
163 mg_errno = MG_NOERROR;
164
165 /* fprintf (stderr, "mem for stem dict = %i\n", sd->MemForStemDict); */
166
167 return sd;
168}
169
170
171/* [RPAP - Jan 97: Stem Index Change] */
172stemmed_idx *
173ReadStemIdxBlk (File * stem_idx_file)
174{
175 unsigned long i;
176 stemmed_idx *si;
177 u_char *buffer;
178
179 if (!(si = Xmalloc (sizeof (stemmed_idx))))
180 {
181 mg_errno = MG_NOMEM;
182 return (NULL);
183 }
184
185 si->stem_idx_file = stem_idx_file;
186 si->MemForStemIdx = 0;
187
188 Fread (&si->sih, sizeof (si->sih), 1, stem_idx_file);
189 /* [RPAP - Jan 97: Endian Ordering] */
190 NTOHUL(si->sih.lookback);
191 NTOHUL(si->sih.block_size);
192 NTOHUL(si->sih.num_blocks);
193 NTOHUL(si->sih.blocks_start);
194 NTOHUL(si->sih.index_chars);
195 NTOHUL(si->sih.num_of_words);
196
197 if (!(buffer = Xmalloc (si->sih.index_chars)))
198 {
199 Xfree (si);
200 mg_errno = MG_NOMEM;
201 return (NULL);
202 };
203 si->MemForStemIdx += si->sih.index_chars;
204
205 if (!(si->index = Xmalloc (si->sih.num_blocks * sizeof (*si->index))))
206 {
207 Xfree (si);
208 Xfree (buffer);
209 mg_errno = MG_NOMEM;
210 return (NULL);
211 };
212 si->MemForStemIdx += si->sih.num_blocks * sizeof (*si->index);
213
214 if (!(si->pos = Xmalloc (si->sih.num_blocks * sizeof (*si->pos))))
215 {
216 Xfree (si->index);
217 Xfree (si);
218 Xfree (buffer);
219 mg_errno = MG_NOMEM;
220 return (NULL);
221 };
222 si->MemForStemIdx += si->sih.num_blocks * sizeof (*si->pos);
223
224 if (!(si->buffer = Xmalloc (si->sih.block_size * sizeof (*si->buffer))))
225 {
226 Xfree (buffer);
227 Xfree (si->index);
228 Xfree (si->buffer);
229 Xfree (si);
230 mg_errno = MG_NOMEM;
231 return (NULL);
232 };
233 si->MemForStemIdx += si->sih.block_size * sizeof (*si->buffer);
234
235 si->active = -1;
236
237 for (i = 0; i < si->sih.num_blocks; i++)
238 {
239 register u_char len;
240 si->index[i] = buffer;
241 len = Getc (stem_idx_file);
242 *buffer++ = len;
243 Fread (buffer, sizeof (u_char), len, stem_idx_file);
244 buffer += len;
245 Fread (&si->pos[i], sizeof (*si->pos), 1, stem_idx_file);
246 NTOHUL(si->pos[i]); /* [RPAP - Jan 97: Endian Ordering] */
247 }
248 mg_errno = MG_NOERROR;
249
250 /* fprintf (stderr, "mem for stem idx = %i\n", si->MemForStemIdx); */
251
252 return si;
253}
254
255
256/* [RPAP - Jan 97: Stem Index Change] */
257/* word should be appropriately stemed */
258static int
259GetIdxBlock (stemmed_idx * si, u_char * word)
260{
261 register int lo = 0, hi = si->sih.num_blocks - 1;
262 register int mid = 0, c = 0;
263
264 while (lo <= hi)
265 {
266 mid = (lo + hi) / 2;
267 c = casecompare (word, si->index[mid]);
268 if (c < 0)
269 hi = mid - 1;
270 else if (c > 0)
271 lo = mid + 1;
272 else
273 return mid;
274 }
275 return hi < 0 ? 0 : (c < 0 ? mid - 1 : mid);
276}
277
278
279static int
280GetBlock (stemmed_dict * sd, u_char * Word)
281{
282 register int lo = 0, hi = sd->sdh.num_blocks - 1;
283 register int mid = 0, c = 0;
284 while (lo <= hi)
285 {
286 mid = (lo + hi) / 2;
287 c = casecompare (Word, sd->index[mid]); /* [RPAP - Jan 97: Stem Index Change] */
288 if (c < 0)
289 hi = mid - 1;
290 else if (c > 0)
291 lo = mid + 1;
292 else
293 return mid;
294 }
295 return hi < 0 ? 0 : (c < 0 ? mid - 1 : mid);
296}
297
298
299/*
300 * This function looks up a word in the stemmed dictionary, it returns -1
301 * if the word cound not be found, and 0 if it successfully finds the word.
302 * If count is non-null the ulong it is pointing to is set to the number of
303 * occurances of the stemmed word in the collection. i.e wcnt.
304 * If doc_count is non-null the ulong it is pointing to is set to the number
305 * of documents that the word occurs in. i.e fcnt
306 * If invf_ptr is non-null the ulong it is pointing to is set to the position
307 * of the inverted file where the entry for this word start.
308 */
309int
310FindWord (stemmed_dict * sd, u_char * Word, unsigned long *count,
311 unsigned long *doc_count, unsigned long *invf_ptr,
312 unsigned long *invf_len)
313{
314 register int lo, hi, mid, c;
315 register unsigned int res;
316 int block, num_indexes;
317 unsigned long *first_word, *last_invf_len;
318 unsigned short *num_words;
319 u_char *base;
320 unsigned short *index;
321 u_char prev[MAXSTEMLEN + 1];
322
323 block = GetBlock (sd, Word);
324 /* [RPAP - Jan 97: Endian Ordering] */
325 if (sd->active != sd->pos[block])
326 {
327 int i;
328
329 Fseek (sd->stem_file, sd->pos[block] + sd->sdh.blocks_start, 0);
330 Fread (sd->buffer, sd->sdh.block_size, sizeof (u_char), sd->stem_file);
331 sd->active = sd->pos[block];
332
333 /* [RPAP - Jan 97: Endian Ordering] */
334 first_word = (unsigned long *) (sd->buffer);
335 NTOHUL(*first_word);
336 last_invf_len = (unsigned long *) (first_word + 1);
337 NTOHUL(*last_invf_len);
338 num_words = (unsigned short *) (last_invf_len + 1);
339 NTOHUS(*num_words);
340 index = num_words + 1;
341 num_indexes = ((*num_words - 1) / sd->sdh.lookback) + 1;
342
343 for (i = 0; i < num_indexes; i++)
344 NTOHUS(index[i]);
345 }
346 else
347 {
348 first_word = (unsigned long *) (sd->buffer);
349 last_invf_len = (unsigned long *) (first_word + 1);
350 num_words = (unsigned short *) (last_invf_len + 1);
351 index = num_words + 1;
352 num_indexes = ((*num_words - 1) / sd->sdh.lookback) + 1;
353 }
354 base = (u_char *) (index + num_indexes);
355
356 lo = 0;
357 hi = num_indexes - 1;
358 while (lo <= hi)
359 {
360 mid = (lo + hi) / 2;
361 c = casecompare (Word, base + index[mid] + 1); /* [RPAP - Jan 97: Stem Index Change] */
362 if (c < 0)
363 hi = mid - 1;
364 else if (c > 0)
365 lo = mid + 1;
366 else
367 {
368 hi = mid;
369 break;
370 }
371 }
372 if (hi < 0)
373 hi = 0;
374
375 res = hi * sd->sdh.lookback;
376 base += index[hi];
377
378 for (;;)
379 {
380 unsigned copy, suff;
381 unsigned long invfp;
382 if (res >= *num_words)
383 return (-1);
384 copy = *base++;
385 suff = *base++;
386 bcopy ((char *) base, (char *) (prev + copy + 1), suff);
387 base += suff;
388 *prev = copy + suff;
389
390 c = casecompare (Word, prev); /* [RPAP - Jan 97: Stem Index Change] */
391 if (c < 0)
392 return (-1);
393
394 if (c == 0 && doc_count)
395 {
396 bcopy ((char *) base, (char *) doc_count, sizeof (*doc_count));
397 NTOHUL(*doc_count); /* [RPAP - Jan 97: Endian Ordering] */
398 }
399 base += sizeof (*doc_count);
400
401 if (c == 0 && count)
402 {
403 bcopy ((char *) base, (char *) count, sizeof (*count));
404 NTOHUL(*count); /* [RPAP - Jan 97: Endian Ordering] */
405 }
406 base += sizeof (*count);
407
408 if (c == 0 && invf_ptr)
409 {
410 bcopy ((char *) base, (char *) &invfp, sizeof (invf_ptr));
411 NTOHUL(invfp); /* [RPAP - Jan 97: Endian Ordering] */
412 *invf_ptr = invfp;
413 }
414 base += sizeof (*invf_ptr);
415
416 if (c == 0)
417 {
418 /* Calculate invf_len is necessary */
419 unsigned long next_invfp;
420 if (!invf_len)
421 return (*first_word + res);
422
423 /* If the current word is the last word of the block the get the
424 length from last_invf_len */
425 if (res == *num_words - 1)
426 {
427 *invf_len = *last_invf_len;
428 return (*first_word + res);
429 }
430
431 /* Skip over most of the next word to get to the invf_ptr */
432 base++;
433 suff = *base++;
434 base += suff + sizeof (unsigned long) * 2;
435 bcopy ((char *) base, (char *) &next_invfp, sizeof (next_invfp));
436 NTOHUL(next_invfp); /* [RPAP - Jan 97: Endian Ordering] */
437 *invf_len = next_invfp - invfp;
438 return (*first_word + res);
439 }
440 res++;
441 }
442}
443
444
445/* [RPAP - Jan 97: Stem Index Change] */
446int
447FindWords (stemmed_dict * sd, u_char * sWord, int stem_method, TermList ** tl)
448{
449 register unsigned int res;
450 unsigned int idx_res;
451 unsigned copy, suff;
452 int j, k;
453
454 int block, num_indexes;
455 unsigned long *first_word, *last_invf_len;
456 unsigned short *num_words;
457 u_char *base;
458 unsigned short *index;
459 u_char prev[MAXSTEMLEN + 1];
460
461 int idx_block, idx_num_indexes;
462 unsigned long *idx_first_word;
463 unsigned short *idx_num_words;
464 u_char *idx_base;
465 unsigned short *idx_index;
466 u_char idx_prev[MAXSTEMLEN + 1];
467
468 unsigned int num_entries, num_cases;
469 unsigned short blk_index, offset;
470 stemmed_idx * si = NULL;
471
472 /* handle stem_method 0 seperately */
473 if (stem_method == 0) {
474 TermEntry te;
475 if ((te.WE.word_num = FindWord (sd, sWord, &te.WE.count, &te.WE.doc_count,
476 &te.WE.invf_ptr, &te.WE.invf_len)) != -1) {
477 te.WE.max_doc_count = te.WE.doc_count;
478
479 te.Count = 1;
480 te.Word = copy_string (sWord);
481 if (!te.Word)
482 FatalError (1, "Could NOT create memory to add term");
483 te.Stem = copy_string (sWord);
484 if (!te.Stem)
485 FatalError (1, "Could NOT create memory to add term");
486 /* te.query_mask = NULL;*/
487
488 AddTermEntry (tl, &te);
489 return (*tl)->num;
490
491 } else {
492 /* didn't match */
493 return 0;
494 }
495 }
496
497 if (stem_method == 1)
498 si = sd->stem1;
499 else if (stem_method == 2)
500 si = sd->stem2;
501 else
502 si = sd->stem3;
503
504 /* Locate block */
505 idx_block = GetIdxBlock (si, sWord);
506
507 /* [RPAP - Jan 97: Endian Ordering] */
508 if (si->active != si->pos[idx_block])
509 {
510 Fseek (si->stem_idx_file, si->pos[idx_block] + si->sih.blocks_start, 0);
511 Fread (si->buffer, si->sih.block_size, sizeof (u_char), si->stem_idx_file);
512 si->active = si->pos[idx_block];
513
514 idx_first_word = (unsigned long *) (si->buffer);
515 NTOHUL(*idx_first_word); /* [RPAP - Jan 97: Endian Ordering] */
516 idx_num_words = (unsigned short *) (idx_first_word + 1);
517 NTOHUS(*idx_num_words); /* [RPAP - Jan 97: Endian Ordering] */
518 idx_index = idx_num_words + 1;
519 idx_num_indexes = ((*idx_num_words - 1) / si->sih.lookback) + 1;
520
521 /* [RPAP - Jan 97: Endian Ordering] */
522 for (j = 0; j < idx_num_indexes; j++)
523 NTOHUS(idx_index[j]);
524 }
525 else
526 {
527 idx_first_word = (unsigned long *) (si->buffer);
528 idx_num_words = (unsigned short *) (idx_first_word + 1);
529 idx_index = idx_num_words + 1;
530 idx_num_indexes = ((*idx_num_words - 1) / si->sih.lookback) + 1;
531 }
532 idx_base = (u_char *) (idx_index + idx_num_indexes);
533
534 {
535 /* Locate 3-in-4 block */
536 register int lo, hi, mid, c;
537 lo = 0;
538 hi = idx_num_indexes - 1;
539 while (lo <= hi)
540 {
541 mid = (lo + hi) / 2;
542 c = casecompare (sWord, idx_base + idx_index[mid] + 1);
543 if (c < 0)
544 hi = mid - 1;
545 else if (c > 0)
546 lo = mid + 1;
547 else
548 {
549 hi = mid;
550 break;
551 }
552 }
553 if (hi < 0)
554 hi = 0;
555
556 idx_res = hi * si->sih.lookback;
557 idx_base += idx_index[hi];
558 }
559
560 /* Locate actual word entry */
561 for (;;)
562 {
563 int c;
564 if (idx_res >= *idx_num_words)
565 return (-1);
566 copy = *idx_base++;
567 suff = *idx_base++;
568 bcopy ((char *) idx_base, (char *) (idx_prev + copy + 1), suff);
569 idx_base += suff;
570 *idx_prev = copy + suff;
571
572 c = casecompare (sWord, idx_prev);
573 if (c < 0)
574 return (-1);
575
576 bcopy ((char *) idx_base, (char *) &num_entries, sizeof (num_entries));
577 NTOHUI(num_entries); /* [RPAP - Jan 97: Endian Ordering] */
578 idx_base += sizeof (num_entries);
579
580 if (c > 0)
581 idx_base += num_entries * (sizeof (num_cases) + sizeof (block) +
582 sizeof (blk_index) + sizeof (offset));
583
584 else
585 break;
586
587 idx_res++;
588 }
589
590 for (k = 0; k < num_entries; k++)
591 {
592 unsigned copy, suff;
593 unsigned long invfp;
594 /* Read next stem index pos */
595 bcopy ((char *) idx_base, (char *) &num_cases, sizeof (num_cases));
596 NTOHUI(num_cases); /* [RPAP - Jan 97: Endian Ordering] */
597 idx_base += sizeof (num_cases);
598 bcopy ((char *) idx_base, (char *) &block, sizeof (block));
599 NTOHUI(block); /* [RPAP - Jan 97: Endian Ordering] */
600 idx_base += sizeof (block);
601 bcopy ((char *) idx_base, (char *) &blk_index, sizeof (blk_index));
602 NTOHUS(blk_index); /* [RPAP - Jan 97: Endian Ordering] */
603 idx_base += sizeof (blk_index);
604 bcopy ((char *) idx_base, (char *) &offset, sizeof (offset));
605 NTOHUS(offset); /* [RPAP - Jan 97: Endian Ordering] */
606 idx_base += sizeof (offset);
607
608 /* [RPAP - Jan 97: Endian Ordering] */
609 if (sd->active != sd->pos[block])
610 {
611 Fseek (sd->stem_file, sd->pos[block] + sd->sdh.blocks_start, 0);
612 Fread (sd->buffer, sd->sdh.block_size, sizeof (u_char), sd->stem_file);
613 sd->active = sd->pos[block];
614
615 first_word = (unsigned long *) (sd->buffer);
616 NTOHUL(*first_word); /* [RPAP - Jan 97: Endian Ordering] */
617 last_invf_len = (unsigned long *) (first_word + 1);
618 NTOHUL(*last_invf_len); /* [RPAP - Jan 97: Endian Ordering] */
619 num_words = (unsigned short *) (last_invf_len + 1);
620 NTOHUS(*num_words); /* [RPAP - Jan 97: Endian Ordering] */
621 index = num_words + 1;
622 num_indexes = ((*num_words - 1) / sd->sdh.lookback) + 1;
623
624 /* [RPAP - Jan 97: Endian Ordering] */
625 for (j = 0; j < num_indexes; j++)
626 NTOHUS(index[j]);
627 }
628 else
629 {
630 first_word = (unsigned long *) (sd->buffer);
631 last_invf_len = (unsigned long *) (first_word + 1);
632 num_words = (unsigned short *) (last_invf_len + 1);
633 index = num_words + 1;
634 num_indexes = ((*num_words - 1) / sd->sdh.lookback) + 1;
635 }
636 base = (u_char *) (index + num_indexes);
637
638 res = blk_index * sd->sdh.lookback;
639 base += index[blk_index];
640
641 for (j = 0; j < offset; j++)
642 {
643 copy = *base++;
644 suff = *base++;
645 bcopy ((char *) base, (char *) (prev + copy + 1), suff);
646 base += suff;
647 *prev = copy + suff;
648 base += sizeof (unsigned long); /* skip doc_count */
649 base += sizeof (unsigned long); /* skip count */
650 base += sizeof (unsigned long); /* skip invf_ptr */
651 res++;
652 }
653
654 for (j = 0; j < num_cases; j++)
655 {
656 TermEntry te;
657
658 if (res >= *num_words)
659 return (-1);
660 copy = *base++;
661 suff = *base++;
662 bcopy ((char *) base, (char *) (prev + copy + 1), suff);
663 base += suff;
664 *prev = copy + suff;
665
666 te.Word = copy_string (prev);
667 if (!te.Word)
668 FatalError (1, "Could NOT create memory to add term");
669 te.Stem = copy_string (prev);
670 if (!te.Stem)
671 FatalError (1, "Could NOT create memory to add term");
672 stemmer (2, sd->sdh.stemmer_num, te.Stem);
673
674 te.Count = 1;
675 te.WE.word_num = *first_word + res;
676 bcopy ((char *) base, (char *) &te.WE.doc_count, sizeof (te.WE.doc_count));
677 NTOHUL(te.WE.doc_count); /* [RPAP - Jan 97: Endian Ordering] */
678 te.WE.max_doc_count = te.WE.doc_count;
679 base += sizeof (te.WE.doc_count);
680
681 bcopy ((char *) base, (char *) &te.WE.count, sizeof (te.WE.count));
682 NTOHUL(te.WE.count);
683 base += sizeof (te.WE.count);
684
685 bcopy ((char *) base, (char *) &invfp, sizeof (te.WE.invf_ptr));
686 NTOHUL(invfp); /* [RPAP - Jan 97: Endian Ordering] */
687 te.WE.invf_ptr = invfp;
688 base += sizeof (te.WE.invf_ptr);
689
690 /* If the current word is the last word of the block the get the
691 length from last_invf_len */
692 if (res == *num_words - 1)
693 te.WE.invf_len = *last_invf_len;
694 else
695 {
696 unsigned long next_invfp;
697 u_char *oldbase = base;
698
699 /* Skip over most of the next word to get to the invf_ptr */
700 base++;
701 suff = *base++;
702 base += suff + sizeof (unsigned long) * 2;
703 bcopy ((char *) base, (char *) &next_invfp, sizeof (next_invfp));
704 NTOHUL(next_invfp); /* [RPAP - Jan 97: Endian Ordering] */
705 te.WE.invf_len = next_invfp - invfp;
706 base = oldbase;
707 }
708
709 /* Add term entry to term list */
710 /* te.query_mask = NULL;*/
711 AddTermEntry (tl, &te);
712
713 if (res == *num_words - 1 && j + 1 < num_cases)
714 {
715 int ii;
716 /* Read in next block */
717 block++;
718 Fseek (sd->stem_file, sd->pos[block] + sd->sdh.blocks_start, 0);
719 Fread (sd->buffer, sd->sdh.block_size, sizeof (u_char), sd->stem_file);
720 sd->active = sd->pos[block];
721
722 first_word = (unsigned long *) (sd->buffer);
723 NTOHUL(*first_word); /* [RPAP - Jan 97: Endian Ordering] */
724 last_invf_len = (unsigned long *) (first_word + 1);
725 NTOHUL(*last_invf_len); /* [RPAP - Jan 97: Endian Ordering] */
726 num_words = (unsigned short *) (last_invf_len + 1);
727 NTOHUS(*num_words); /* [RPAP - Jan 97: Endian Ordering] */
728 index = num_words + 1;
729 num_indexes = ((*num_words - 1) / sd->sdh.lookback) + 1;
730
731 /* [RPAP - Jan 97: Endian Ordering] */
732 for (ii = 0; ii < num_indexes; ii++)
733 NTOHUS(index[ii]);
734
735 base = (u_char *) (index + num_indexes);
736 base += index[0];
737 res = 0;
738 blk_index = 0;
739 }
740 else
741 res++;
742 } /* end for num_cases */
743 } /* end for num_entries */
744 return (*tl)->num;
745}
746
747
748void
749FreeStemDict (stemmed_dict * sd)
750{
751 /* [RPAP - Jan 97: Stem Index Change] */
752 if (sd->stem1)
753 FreeStemIdx (sd->stem1);
754 if (sd->stem2)
755 FreeStemIdx (sd->stem2);
756 if (sd->stem3)
757 FreeStemIdx (sd->stem3);
758
759 Xfree (sd->index[0]);
760 Xfree (sd->index);
761 Xfree (sd->buffer);
762 Xfree (sd->pos);
763 Xfree (sd);
764}
765
766/* [RPAP - Jan 97: Stem Index Change] */
767void
768FreeStemIdx (stemmed_idx * si)
769{
770 Xfree (si->index[0]);
771 Xfree (si->index);
772 Xfree (si->buffer);
773 Xfree (si->pos);
774 Xfree (si);
775}
Note: See TracBrowser for help on using the repository browser.