source: trunk/gsdl/src/mgpp/text/stem_search.cpp@ 711

Last change on this file since 711 was 711, checked in by cs025, 25 years ago

Changes to eradicate Xmalloc

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 15.7 KB
Line 
1/**************************************************************************
2 *
3 * stem_search.c -- Functions for searching the blocked stemmed dictionary
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: stem_search.cpp 711 1999-10-17 23:43:31Z cs025 $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25
26#include "memlib.h"
27#include "messages.h"
28#include "filestats.h"
29#include "timing.h"
30#include "local_strings.h"
31#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
32
33#include "mg.h"
34#include "invf.h"
35#include "text.h"
36#include "lists.h"
37#include "backend.h"
38#include "words.h"
39#include "locallib.h"
40#include "stem_search.h"
41#include "StemBlock.h"
42#include "StemIdx.h"
43#include "mg_errors.h"
44#include "term_lists.h"
45#include "stemmer.h"
46
47
48/*
49 $Log$
50 Revision 1.2 1999/10/17 23:43:29 cs025
51 Changes to eradicate Xmalloc
52
53 Revision 1.1 1999/10/11 02:58:32 cs025
54 Base install of MG-PP
55
56 Revision 1.1 1999/08/10 21:18:22 sjboddie
57 renamed mg-1.3d directory mg
58
59 Revision 1.3 1999/07/02 00:18:55 rjmcnab
60 Changed so FindWords could be used in new ways.
61
62 Revision 1.2 1998/11/25 07:55:51 rjmcnab
63
64 Modified mg to that you can specify the stemmer you want
65 to use via a command line option. You specify it to
66 mg_passes during the build process. The number of the
67 stemmer that you used is stored within the inverted
68 dictionary header and the stemmed dictionary header so
69 the correct stemmer is used in later stages of building
70 and querying.
71
72 Revision 1.1 1998/11/17 09:35:39 rjmcnab
73 *** empty log message ***
74
75 * Revision 1.3 1994/10/20 03:57:04 tes
76 * I have rewritten the boolean query optimiser and abstracted out the
77 * components of the boolean query.
78 *
79 * Revision 1.2 1994/09/20 04:42:08 tes
80 * For version 1.1
81 *
82 */
83
84static char *RCSID = "$Id: stem_search.cpp 711 1999-10-17 23:43:31Z cs025 $";
85
86
87size_t stemAbstractReader(void *ptr, size_t size, size_t nitems, void *file)
88{ return Fread(ptr, size, nitems, (File *) file);
89}
90
91char stemAbstractGetc(void *file)
92{ return Getc(((File *) file));
93}
94
95int stemAbstractSeeker(void *file, long offset, int ptrname)
96{ return Fseek((File *) file, offset, ptrname);
97}
98
99/* [RPAP - Jan 97: Stem Index Change] */
100stemmed_idx *
101ReadStemIdxBlk (File * stem_idx_file)
102{
103 unsigned long i;
104 stemmed_idx *si;
105 u_char *buffer;
106
107 si = StemIdx_header_read((void *) stem_idx_file, 1, stemAbstractReader, stemAbstractGetc);
108 if (si == NULL)
109 {
110 mg_errno = MG_NOMEM;
111 return (NULL);
112 }
113 mg_errno = MG_NOERROR;
114
115 /* fprintf (stderr, "mem for stem idx = %i\n", si->MemForStemIdx); */
116
117 return si;
118}
119
120static int
121Word_BaseIndexNo(int num_indexes, u_char *base, unsigned short *index, u_char *word)
122{
123 register int lo, hi, mid, c;
124
125 lo = 0;
126 hi = num_indexes - 1;
127 while (lo <= hi)
128 {
129 mid = (lo + hi) / 2;
130 c = casecompare (word, base + index[mid] + 1); /* [RPAP - Jan 97: Stem Index Change] */
131 if (c < 0)
132 hi = mid - 1;
133 else if (c > 0)
134 lo = mid + 1;
135 else
136 {
137 hi = mid;
138 break;
139 }
140 }
141 if (hi < 0)
142 hi = 0;
143
144 return hi;
145}
146
147
148/* [RPAP - Jan 97: Stem Index Change] */
149/* word should be appropriately stemed */
150static int
151Word_IdxBlockNo (stemmed_idx * si, u_char * word)
152{
153 register int lo = 0, hi = si->sih.num_blocks - 1;
154 register int mid = 0, c = 0;
155
156 while (lo <= hi)
157 {
158 mid = (lo + hi) / 2;
159 c = casecompare (word, si->index[mid]);
160 if (c < 0)
161 hi = mid - 1;
162 else if (c > 0)
163 lo = mid + 1;
164 else
165 return mid;
166 }
167 return hi < 0 ? 0 : (c < 0 ? mid - 1 : mid);
168}
169
170
171static int
172Word_BlockNo (stemmed_dict * sd, u_char * word)
173{
174 register int lo = 0, hi = sd->sdh.num_blocks - 1;
175 register int mid = 0, c = 0;
176 while (lo <= hi)
177 {
178 mid = (lo + hi) / 2;
179 c = casecompare (word, sd->index[mid]); /* [RPAP - Jan 97: Stem Index Change] */
180 if (c < 0)
181 hi = mid - 1;
182 else if (c > 0)
183 lo = mid + 1;
184 else
185 return mid;
186 }
187 return hi < 0 ? 0 : (c < 0 ? mid - 1 : mid);
188}
189
190unsigned long Word_SkipToNextInvfPtr(u_char **base, unsigned long invfp)
191{
192 unsigned suff;
193 unsigned long next_invfp;
194
195 *base = *base + 1;
196 suff = **base;
197 *base = *base + 1;
198 *base += suff + sizeof (unsigned long) * 2;
199 bcopy ((void *) *base, (void *) &next_invfp, sizeof (next_invfp));
200 NTOHUL(next_invfp); /* [RPAP - Jan 97: Endian Ordering] */
201 return next_invfp - invfp;
202}
203
204/*
205 * This function looks up a word in the stemmed dictionary, it returns -1
206 * if the word cound not be found, and 0 if it successfully finds the word.
207 * If count is non-null the ulong it is pointing to is set to the number of
208 * occurances of the stemmed word in the collection. i.e wcnt.
209 * If doc_count is non-null the ulong it is pointing to is set to the number
210 * of documents that the word occurs in. i.e fcnt
211 * If invf_ptr is non-null the ulong it is pointing to is set to the position
212 * of the inverted file where the entry for this word start.
213 */
214int
215FindWord (stemmed_dict * sd, u_char * Word, unsigned long *count,
216 unsigned long *doc_count, unsigned long *invf_ptr,
217 unsigned long *invf_len)
218{
219 register int lo, hi, mid, c;
220 register unsigned int res;
221 int block, num_indexes;
222 unsigned long *first_word, *last_invf_len;
223 unsigned short *num_words;
224 u_char *base;
225 unsigned short *index;
226 u_char prev[MAXSTEMLEN + 1];
227
228 /**
229 * GRB : Get the index for the block of words in the stemmed dictionary;
230 * If the block required is not the current cached (active) block,
231 * then load that block in now.
232 */
233
234 block = Word_BlockNo (sd, Word);
235 /* [RPAP - Jan 97: Endian Ordering] */
236 if (sd->active != sd->pos[block])
237 {
238 StemBlock_read(sd, block, &first_word, &last_invf_len, &num_words,
239 &index, &num_indexes);
240 }
241 else
242 {
243 first_word = (unsigned long *) (sd->buffer);
244 last_invf_len = (unsigned long *) (first_word + 1);
245 num_words = (unsigned short *) (last_invf_len + 1);
246 index = num_words + 1;
247 num_indexes = ((*num_words - 1) / sd->sdh.lookback) + 1;
248 }
249 base = (u_char *) (index + num_indexes);
250
251 /* --
252 -- GRB: search current block for the given word using binary chop
253 --
254 */
255 hi = Word_BaseIndexNo(num_indexes, base, index, Word);
256
257 res = hi * sd->sdh.lookback;
258 base += index[hi];
259
260 /**
261 *
262 * GRB: stream over the words which come from the same stem, and
263 * check for matches amongst them
264 */
265 while (res < *num_words)
266 {
267 unsigned long invfp;
268 StemBlock_ReadWordString(&base, prev);
269
270 c = casecompare (Word, prev); /* [RPAP - Jan 97: Stem Index Change] */
271 if (c < 0)
272 return (-1);
273
274 if (c == 0 && doc_count)
275 {
276 bcopy ((void *) base, (void *) doc_count, sizeof (*doc_count));
277 NTOHUL(*doc_count); /* [RPAP - Jan 97: Endian Ordering] */
278 }
279 base += sizeof (*doc_count);
280
281 if (c == 0 && count)
282 {
283 bcopy ((void *) base, (void *) count, sizeof (*count));
284 NTOHUL(*count); /* [RPAP - Jan 97: Endian Ordering] */
285 }
286 base += sizeof (*count);
287
288 if (c == 0 && invf_ptr)
289 {
290 bcopy ((void *) base, (void *) &invfp, sizeof (invf_ptr));
291 NTOHUL(invfp); /* [RPAP - Jan 97: Endian Ordering] */
292 *invf_ptr = invfp;
293 }
294 base += sizeof (*invf_ptr);
295
296 if (c == 0)
297 {
298 /* Calculate invf_len is necessary */
299 if (!invf_len)
300 return (*first_word + res);
301
302 /* If the current word is the last word of the block then get the
303 length from last_invf_len */
304 if (res == *num_words - 1)
305 {
306 *invf_len = *last_invf_len;
307 return (*first_word + res);
308 }
309
310 /* Skip over most of the next word to get to the invf_ptr */
311 *invf_len = Word_SkipToNextInvfPtr(&base, invfp);
312 return (*first_word + res);
313 }
314 res++;
315 }
316 /* -- GRB return a negative response; word was not actually found after all */
317 return (-1);
318}
319
320
321/* [RPAP - Jan 97: Stem Index Change] */
322int
323FindWords (stemmed_dict * sd, u_char * sWord, int stem_method, TermList ** tl)
324{
325 register unsigned int res;
326 unsigned int idx_res;
327 unsigned copy, suff;
328 int j, k;
329
330 int block, num_indexes;
331 unsigned long *first_word, *last_invf_len;
332 unsigned short *num_words;
333 u_char *base;
334 unsigned short *index;
335 u_char prev[MAXSTEMLEN + 1];
336
337 int idx_block, idx_num_indexes;
338 unsigned long *idx_first_word;
339 unsigned short *idx_num_words;
340 u_char *idx_base;
341 unsigned short *idx_index;
342 u_char idx_prev[MAXSTEMLEN + 1];
343
344 unsigned int num_entries, num_cases;
345 unsigned short blk_index, offset;
346 stemmed_idx * si = NULL;
347
348 /* handle stem_method 0 seperately */
349 if (stem_method == 0) {
350 TermEntry te;
351 if ((te.WE.word_num = FindWord (sd, sWord, &te.WE.count, &te.WE.doc_count,
352 &te.WE.invf_ptr, &te.WE.invf_len)) != -1) {
353 te.WE.max_doc_count = te.WE.doc_count;
354
355 te.Count = 1;
356 te.Word = copy_string (sWord);
357 if (!te.Word)
358 FatalError (1, "Could NOT create memory to add term");
359 te.Stem = copy_string (sWord);
360 if (!te.Stem)
361 FatalError (1, "Could NOT create memory to add term");
362 /* te.query_mask = NULL;*/
363
364 TermList_AddTermEntry (*tl, &te);
365 return (*tl)->num;
366
367 } else {
368 /* didn't match */
369 return 0;
370 }
371 }
372
373 if (stem_method == 1)
374 si = sd->stem1;
375 else if (stem_method == 2)
376 si = sd->stem2;
377 else
378 si = sd->stem3;
379
380 /**
381 * Locate block: If this block is not the active one, then make it active.
382 */
383 idx_block = Word_IdxBlockNo (si, sWord);
384
385 /* [RPAP - Jan 97: Endian Ordering] */
386 if (si->active != si->pos[idx_block])
387 {
388 StemIdx_block_readNext((void *) si->stem_idx_file, si, idx_block, &idx_first_word, &idx_num_words,
389 &idx_index, &idx_num_indexes, stemAbstractSeeker, stemAbstractReader);
390 }
391 else
392 {
393 idx_first_word = (unsigned long *) (si->buffer);
394 idx_num_words = (unsigned short *) (idx_first_word + 1);
395 idx_index = idx_num_words + 1;
396 idx_num_indexes = ((*idx_num_words - 1) / si->sih.lookback) + 1;
397 }
398 idx_base = (u_char *) (idx_index + idx_num_indexes);
399
400 /**
401 * Search in this stem index for the word using binary chop
402 */
403 {
404 /* Locate 3-in-4 block */
405 //register int lo, hi, mid, c;
406 int hi;
407
408 hi = Word_BaseIndexNo(idx_num_indexes, idx_base, idx_index, sWord);
409 idx_res = hi * si->sih.lookback;
410 idx_base += idx_index[hi];
411 }
412
413 /* Locate word entry by streaming forwards through the index */
414 for (;;)
415 {
416 int c;
417 if (idx_res >= *idx_num_words)
418 return (-1);
419 StemIdx_ReadWordString(&idx_base, idx_prev);
420
421 c = casecompare (sWord, idx_prev);
422 if (c < 0)
423 return (-1);
424
425 bcopy ((void *) idx_base, (void *) &num_entries, sizeof (num_entries));
426 NTOHUI(num_entries); /* [RPAP - Jan 97: Endian Ordering] */
427 idx_base += sizeof (num_entries);
428
429 if (c > 0)
430 idx_base += num_entries * (sizeof (num_cases) + sizeof (block) +
431 sizeof (blk_index) + sizeof (offset));
432 else
433 break;
434
435 idx_res++;
436 }
437
438 for (k = 0; k < num_entries; k++)
439 {
440 //unsigned copy, suff;
441 unsigned long invfp;
442 /**
443 * GRB: Read next stem index pos, getting the details of the next word
444 * which is an extension of this stem (a wordform of this stem)
445 */
446 StemIdx_ReadPosEntry(&idx_base, &num_cases, (unsigned int *) &block, &blk_index, &offset);
447
448
449 /**
450 * GRB: Now look up the given word form, as per FindWord above.
451 * Firstly, we need to get to the right block; we read this
452 * above [rather than using Word_BlockIndex as in FindWord()]
453 * So we just need to make sure that it is the active block
454 */
455
456 /* [RPAP - Jan 97: Endian Ordering] */
457 if (sd->active != sd->pos[block])
458 {
459 StemBlock_read(sd, block, &first_word, &last_invf_len, &num_words,
460 &index, &num_indexes);
461 }
462 else
463 {
464 first_word = (unsigned long *) (sd->buffer);
465 last_invf_len = (unsigned long *) (first_word + 1);
466 num_words = (unsigned short *) (last_invf_len + 1);
467 index = num_words + 1;
468 num_indexes = ((*num_words - 1) / sd->sdh.lookback) + 1;
469 }
470 base = (u_char *) (index + num_indexes);
471
472 res = blk_index * sd->sdh.lookback;
473 base += index[blk_index];
474
475 /**
476 * GRB: We know how many words to offset by, so let's scramble
477 * past them
478 */
479 for (j = 0; j < offset; j++)
480 {
481 StemBlock_ReadWordString(&base, prev);
482
483 base += sizeof (unsigned long); /* skip doc_count */
484 base += sizeof (unsigned long); /* skip count */
485 base += sizeof (unsigned long); /* skip invf_ptr */
486 res++;
487 }
488
489 /**
490 * GRB: Now take this wordform case by case
491 */
492 for (j = 0; j < num_cases; j++)
493 {
494 TermEntry te;
495
496 if (res >= *num_words)
497 return (-1);
498 StemBlock_ReadWordString(&base, prev);
499
500 te.Word = copy_string (prev);
501 if (!te.Word)
502 FatalError (1, "Could NOT create memory to add term");
503 te.Stem = copy_string (prev);
504 if (!te.Stem)
505 FatalError (1, "Could NOT create memory to add term");
506 stemmer (2, sd->sdh.stemmer_num, te.Stem);
507
508 te.Count = 1;
509 te.WE.word_num = *first_word + res;
510 bcopy ((void *) base, (void *) &te.WE.doc_count, sizeof (te.WE.doc_count));
511 NTOHUL(te.WE.doc_count); /* [RPAP - Jan 97: Endian Ordering] */
512 te.WE.max_doc_count = te.WE.doc_count;
513 base += sizeof (te.WE.doc_count);
514
515 bcopy ((void *) base, (void *) &te.WE.count, sizeof (te.WE.count));
516 NTOHUL(te.WE.count);
517 base += sizeof (te.WE.count);
518
519 bcopy ((void *) base, (void *) &invfp, sizeof (te.WE.invf_ptr));
520 NTOHUL(invfp); /* [RPAP - Jan 97: Endian Ordering] */
521 te.WE.invf_ptr = invfp;
522 base += sizeof (te.WE.invf_ptr);
523
524 /* If the current word is the last word of the block the get the
525 length from last_invf_len */
526 if (res == *num_words - 1)
527 te.WE.invf_len = *last_invf_len;
528 else
529 {
530 u_char *oldbase = base;
531
532 /* Skip over most of the next word to get to the invf_ptr */
533 te.WE.invf_len = Word_SkipToNextInvfPtr(&base, invfp);
534 base = oldbase;
535 }
536
537 /* Add term entry to term list */
538 /* te.query_mask = NULL;*/
539 TermList_AddTermEntry (*tl, &te);
540
541 /**
542 * GRB: If this is the final case of this wordform in this block,
543 * and we have one or more remaining blocks containing the wordform,
544 * then read in next block
545 */
546 if (res == *num_words - 1 && j + 1 < num_cases)
547 {
548 /* Read in next block */
549 block++;
550 StemBlock_read(sd, block, &first_word, &last_invf_len, &num_words,
551 &index, &num_indexes);
552 base = (u_char *) (index + num_indexes);
553 base += index[0];
554 res = 0;
555 blk_index = 0;
556 }
557 else
558 res++;
559 } /* end for num_cases */
560 } /* end for num_entries */
561 return (*tl)->num;
562}
563
564
565void
566FreeStemDict (stemmed_dict * sd)
567{
568 /* [RPAP - Jan 97: Stem Index Change] */
569 if (sd->stem1)
570 FreeStemIdx (sd->stem1);
571 if (sd->stem2)
572 FreeStemIdx (sd->stem2);
573 if (sd->stem3)
574 FreeStemIdx (sd->stem3);
575
576 delete (sd->index[0]);
577 delete (sd->index);
578 delete (sd->buffer);
579 delete (sd->pos);
580 delete sd;
581}
582
583/* [RPAP - Jan 97: Stem Index Change] */
584void
585FreeStemIdx (stemmed_idx * si)
586{
587 delete si->index[0];
588 delete si->index;
589 delete si->buffer;
590 delete si->pos;
591 delete si;
592}
Note: See TracBrowser for help on using the repository browser.