Context Navigation

source: trunk/gsdl/src/mgpp/text/stem_search.cpp@ 711

Last change on this file since 711 was 711, checked in by cs025, 25 years ago
Changes to eradicate Xmalloc
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 15.7 KB

Line
1	/**************************************************************************
2	*
3	* stem_search.c -- Functions for searching the blocked stemmed dictionary
4	* Copyright (C) 1994 Neil Sharman
5	*
6	* This program is free software; you can redistribute it and/or modify
7	* it under the terms of the GNU General Public License as published by
8	* the Free Software Foundation; either version 2 of the License, or
9	* (at your option) any later version.
10	*
11	* This program is distributed in the hope that it will be useful,
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	* GNU General Public License for more details.
15	*
16	* You should have received a copy of the GNU General Public License
17	* along with this program; if not, write to the Free Software
18	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19	*
20	* $Id: stem_search.cpp 711 1999-10-17 23:43:31Z cs025 $
21	*
22	**************************************************************************/
23
24	#include "sysfuncs.h"
25
26	#include "memlib.h"
27	#include "messages.h"
28	#include "filestats.h"
29	#include "timing.h"
30	#include "local_strings.h"
31	#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
32
33	#include "mg.h"
34	#include "invf.h"
35	#include "text.h"
36	#include "lists.h"
37	#include "backend.h"
38	#include "words.h"
39	#include "locallib.h"
40	#include "stem_search.h"
41	#include "StemBlock.h"
42	#include "StemIdx.h"
43	#include "mg_errors.h"
44	#include "term_lists.h"
45	#include "stemmer.h"
46
47
48	/*
49	$Log$
50	Revision 1.2 1999/10/17 23:43:29 cs025
51	Changes to eradicate Xmalloc
52
53	Revision 1.1 1999/10/11 02:58:32 cs025
54	Base install of MG-PP
55
56	Revision 1.1 1999/08/10 21:18:22 sjboddie
57	renamed mg-1.3d directory mg
58
59	Revision 1.3 1999/07/02 00:18:55 rjmcnab
60	Changed so FindWords could be used in new ways.
61
62	Revision 1.2 1998/11/25 07:55:51 rjmcnab
63
64	Modified mg to that you can specify the stemmer you want
65	to use via a command line option. You specify it to
66	mg_passes during the build process. The number of the
67	stemmer that you used is stored within the inverted
68	dictionary header and the stemmed dictionary header so
69	the correct stemmer is used in later stages of building
70	and querying.
71
72	Revision 1.1 1998/11/17 09:35:39 rjmcnab
73	* empty log message *
74
75	* Revision 1.3 1994/10/20 03:57:04 tes
76	* I have rewritten the boolean query optimiser and abstracted out the
77	* components of the boolean query.
78	*
79	* Revision 1.2 1994/09/20 04:42:08 tes
80	* For version 1.1
81	*
82	*/
83
84	static char *RCSID = "$Id: stem_search.cpp 711 1999-10-17 23:43:31Z cs025 $";
85
86
87	size_t stemAbstractReader(void ptr, size_t size, size_t nitems, void file)
88	{ return Fread(ptr, size, nitems, (File *) file);
89	}
90
91	char stemAbstractGetc(void *file)
92	{ return Getc(((File *) file));
93	}
94
95	int stemAbstractSeeker(void *file, long offset, int ptrname)
96	{ return Fseek((File *) file, offset, ptrname);
97	}
98
99	/* [RPAP - Jan 97: Stem Index Change] */
100	stemmed_idx *
101	ReadStemIdxBlk (File * stem_idx_file)
102	{
103	unsigned long i;
104	stemmed_idx *si;
105	u_char *buffer;
106
107	si = StemIdx_header_read((void *) stem_idx_file, 1, stemAbstractReader, stemAbstractGetc);
108	if (si == NULL)
109	{
110	mg_errno = MG_NOMEM;
111	return (NULL);
112	}
113	mg_errno = MG_NOERROR;
114
115	/* fprintf (stderr, "mem for stem idx = %i\n", si->MemForStemIdx); */
116
117	return si;
118	}
119
120	static int
121	Word_BaseIndexNo(int num_indexes, u_char base, unsigned short index, u_char *word)
122	{
123	register int lo, hi, mid, c;
124
125	lo = 0;
126	hi = num_indexes - 1;
127	while (lo <= hi)
128	{
129	mid = (lo + hi) / 2;
130	c = casecompare (word, base + index[mid] + 1); /* [RPAP - Jan 97: Stem Index Change] */
131	if (c < 0)
132	hi = mid - 1;
133	else if (c > 0)
134	lo = mid + 1;
135	else
136	{
137	hi = mid;
138	break;
139	}
140	}
141	if (hi < 0)
142	hi = 0;
143
144	return hi;
145	}
146
147
148	/* [RPAP - Jan 97: Stem Index Change] */
149	/* word should be appropriately stemed */
150	static int
151	Word_IdxBlockNo (stemmed_idx * si, u_char * word)
152	{
153	register int lo = 0, hi = si->sih.num_blocks - 1;
154	register int mid = 0, c = 0;
155
156	while (lo <= hi)
157	{
158	mid = (lo + hi) / 2;
159	c = casecompare (word, si->index[mid]);
160	if (c < 0)
161	hi = mid - 1;
162	else if (c > 0)
163	lo = mid + 1;
164	else
165	return mid;
166	}
167	return hi < 0 ? 0 : (c < 0 ? mid - 1 : mid);
168	}
169
170
171	static int
172	Word_BlockNo (stemmed_dict * sd, u_char * word)
173	{
174	register int lo = 0, hi = sd->sdh.num_blocks - 1;
175	register int mid = 0, c = 0;
176	while (lo <= hi)
177	{
178	mid = (lo + hi) / 2;
179	c = casecompare (word, sd->index[mid]); /* [RPAP - Jan 97: Stem Index Change] */
180	if (c < 0)
181	hi = mid - 1;
182	else if (c > 0)
183	lo = mid + 1;
184	else
185	return mid;
186	}
187	return hi < 0 ? 0 : (c < 0 ? mid - 1 : mid);
188	}
189
190	unsigned long Word_SkipToNextInvfPtr(u_char **base, unsigned long invfp)
191	{
192	unsigned suff;
193	unsigned long next_invfp;
194
195	base = base + 1;
196	suff = **base;
197	base = base + 1;
198	base += suff + sizeof (unsigned long) 2;
199	bcopy ((void ) base, (void *) &next_invfp, sizeof (next_invfp));
200	NTOHUL(next_invfp); /* [RPAP - Jan 97: Endian Ordering] */
201	return next_invfp - invfp;
202	}
203
204	/*
205	* This function looks up a word in the stemmed dictionary, it returns -1
206	* if the word cound not be found, and 0 if it successfully finds the word.
207	* If count is non-null the ulong it is pointing to is set to the number of
208	* occurances of the stemmed word in the collection. i.e wcnt.
209	* If doc_count is non-null the ulong it is pointing to is set to the number
210	* of documents that the word occurs in. i.e fcnt
211	* If invf_ptr is non-null the ulong it is pointing to is set to the position
212	* of the inverted file where the entry for this word start.
213	*/
214	int
215	FindWord (stemmed_dict * sd, u_char * Word, unsigned long *count,
216	unsigned long doc_count, unsigned long invf_ptr,
217	unsigned long *invf_len)
218	{
219	register int lo, hi, mid, c;
220	register unsigned int res;
221	int block, num_indexes;
222	unsigned long first_word, last_invf_len;
223	unsigned short *num_words;
224	u_char *base;
225	unsigned short *index;
226	u_char prev[MAXSTEMLEN + 1];
227
228	/**
229	* GRB : Get the index for the block of words in the stemmed dictionary;
230	* If the block required is not the current cached (active) block,
231	* then load that block in now.
232	*/
233
234	block = Word_BlockNo (sd, Word);
235	/* [RPAP - Jan 97: Endian Ordering] */
236	if (sd->active != sd->pos[block])
237	{
238	StemBlock_read(sd, block, &first_word, &last_invf_len, &num_words,
239	&index, &num_indexes);
240	}
241	else
242	{
243	first_word = (unsigned long *) (sd->buffer);
244	last_invf_len = (unsigned long *) (first_word + 1);
245	num_words = (unsigned short *) (last_invf_len + 1);
246	index = num_words + 1;
247	num_indexes = ((*num_words - 1) / sd->sdh.lookback) + 1;
248	}
249	base = (u_char *) (index + num_indexes);
250
251	/* --
252	-- GRB: search current block for the given word using binary chop
253	--
254	*/
255	hi = Word_BaseIndexNo(num_indexes, base, index, Word);
256
257	res = hi * sd->sdh.lookback;
258	base += index[hi];
259
260	/**
261	*
262	* GRB: stream over the words which come from the same stem, and
263	* check for matches amongst them
264	*/
265	while (res < *num_words)
266	{
267	unsigned long invfp;
268	StemBlock_ReadWordString(&base, prev);
269
270	c = casecompare (Word, prev); /* [RPAP - Jan 97: Stem Index Change] */
271	if (c < 0)
272	return (-1);
273
274	if (c == 0 && doc_count)
275	{
276	bcopy ((void ) base, (void ) doc_count, sizeof (*doc_count));
277	NTOHUL(doc_count); / [RPAP - Jan 97: Endian Ordering] */
278	}
279	base += sizeof (*doc_count);
280
281	if (c == 0 && count)
282	{
283	bcopy ((void ) base, (void ) count, sizeof (*count));
284	NTOHUL(count); / [RPAP - Jan 97: Endian Ordering] */
285	}
286	base += sizeof (*count);
287
288	if (c == 0 && invf_ptr)
289	{
290	bcopy ((void ) base, (void ) &invfp, sizeof (invf_ptr));
291	NTOHUL(invfp); /* [RPAP - Jan 97: Endian Ordering] */
292	*invf_ptr = invfp;
293	}
294	base += sizeof (*invf_ptr);
295
296	if (c == 0)
297	{
298	/* Calculate invf_len is necessary */
299	if (!invf_len)
300	return (*first_word + res);
301
302	/* If the current word is the last word of the block then get the
303	length from last_invf_len */
304	if (res == *num_words - 1)
305	{
306	invf_len = last_invf_len;
307	return (*first_word + res);
308	}
309
310	/* Skip over most of the next word to get to the invf_ptr */
311	*invf_len = Word_SkipToNextInvfPtr(&base, invfp);
312	return (*first_word + res);
313	}
314	res++;
315	}
316	/* -- GRB return a negative response; word was not actually found after all */
317	return (-1);
318	}
319
320
321	/* [RPAP - Jan 97: Stem Index Change] */
322	int
323	FindWords (stemmed_dict * sd, u_char * sWord, int stem_method, TermList ** tl)
324	{
325	register unsigned int res;
326	unsigned int idx_res;
327	unsigned copy, suff;
328	int j, k;
329
330	int block, num_indexes;
331	unsigned long first_word, last_invf_len;
332	unsigned short *num_words;
333	u_char *base;
334	unsigned short *index;
335	u_char prev[MAXSTEMLEN + 1];
336
337	int idx_block, idx_num_indexes;
338	unsigned long *idx_first_word;
339	unsigned short *idx_num_words;
340	u_char *idx_base;
341	unsigned short *idx_index;
342	u_char idx_prev[MAXSTEMLEN + 1];
343
344	unsigned int num_entries, num_cases;
345	unsigned short blk_index, offset;
346	stemmed_idx * si = NULL;
347
348	/* handle stem_method 0 seperately */
349	if (stem_method == 0) {
350	TermEntry te;
351	if ((te.WE.word_num = FindWord (sd, sWord, &te.WE.count, &te.WE.doc_count,
352	&te.WE.invf_ptr, &te.WE.invf_len)) != -1) {
353	te.WE.max_doc_count = te.WE.doc_count;
354
355	te.Count = 1;
356	te.Word = copy_string (sWord);
357	if (!te.Word)
358	FatalError (1, "Could NOT create memory to add term");
359	te.Stem = copy_string (sWord);
360	if (!te.Stem)
361	FatalError (1, "Could NOT create memory to add term");
362	/* te.query_mask = NULL;*/
363
364	TermList_AddTermEntry (*tl, &te);
365	return (*tl)->num;
366
367	} else {
368	/* didn't match */
369	return 0;
370	}
371	}
372
373	if (stem_method == 1)
374	si = sd->stem1;
375	else if (stem_method == 2)
376	si = sd->stem2;
377	else
378	si = sd->stem3;
379
380	/**
381	* Locate block: If this block is not the active one, then make it active.
382	*/
383	idx_block = Word_IdxBlockNo (si, sWord);
384
385	/* [RPAP - Jan 97: Endian Ordering] */
386	if (si->active != si->pos[idx_block])
387	{
388	StemIdx_block_readNext((void *) si->stem_idx_file, si, idx_block, &idx_first_word, &idx_num_words,
389	&idx_index, &idx_num_indexes, stemAbstractSeeker, stemAbstractReader);
390	}
391	else
392	{
393	idx_first_word = (unsigned long *) (si->buffer);
394	idx_num_words = (unsigned short *) (idx_first_word + 1);
395	idx_index = idx_num_words + 1;
396	idx_num_indexes = ((*idx_num_words - 1) / si->sih.lookback) + 1;
397	}
398	idx_base = (u_char *) (idx_index + idx_num_indexes);
399
400	/**
401	* Search in this stem index for the word using binary chop
402	*/
403	{
404	/* Locate 3-in-4 block */
405	//register int lo, hi, mid, c;
406	int hi;
407
408	hi = Word_BaseIndexNo(idx_num_indexes, idx_base, idx_index, sWord);
409	idx_res = hi * si->sih.lookback;
410	idx_base += idx_index[hi];
411	}
412
413	/* Locate word entry by streaming forwards through the index */
414	for (;;)
415	{
416	int c;
417	if (idx_res >= *idx_num_words)
418	return (-1);
419	StemIdx_ReadWordString(&idx_base, idx_prev);
420
421	c = casecompare (sWord, idx_prev);
422	if (c < 0)
423	return (-1);
424
425	bcopy ((void ) idx_base, (void ) &num_entries, sizeof (num_entries));
426	NTOHUI(num_entries); /* [RPAP - Jan 97: Endian Ordering] */
427	idx_base += sizeof (num_entries);
428
429	if (c > 0)
430	idx_base += num_entries * (sizeof (num_cases) + sizeof (block) +
431	sizeof (blk_index) + sizeof (offset));
432	else
433	break;
434
435	idx_res++;
436	}
437
438	for (k = 0; k < num_entries; k++)
439	{
440	//unsigned copy, suff;
441	unsigned long invfp;
442	/**
443	* GRB: Read next stem index pos, getting the details of the next word
444	* which is an extension of this stem (a wordform of this stem)
445	*/
446	StemIdx_ReadPosEntry(&idx_base, &num_cases, (unsigned int *) &block, &blk_index, &offset);
447
448
449	/**
450	* GRB: Now look up the given word form, as per FindWord above.
451	* Firstly, we need to get to the right block; we read this
452	* above [rather than using Word_BlockIndex as in FindWord()]
453	* So we just need to make sure that it is the active block
454	*/
455
456	/* [RPAP - Jan 97: Endian Ordering] */
457	if (sd->active != sd->pos[block])
458	{
459	StemBlock_read(sd, block, &first_word, &last_invf_len, &num_words,
460	&index, &num_indexes);
461	}
462	else
463	{
464	first_word = (unsigned long *) (sd->buffer);
465	last_invf_len = (unsigned long *) (first_word + 1);
466	num_words = (unsigned short *) (last_invf_len + 1);
467	index = num_words + 1;
468	num_indexes = ((*num_words - 1) / sd->sdh.lookback) + 1;
469	}
470	base = (u_char *) (index + num_indexes);
471
472	res = blk_index * sd->sdh.lookback;
473	base += index[blk_index];
474
475	/**
476	* GRB: We know how many words to offset by, so let's scramble
477	* past them
478	*/
479	for (j = 0; j < offset; j++)
480	{
481	StemBlock_ReadWordString(&base, prev);
482
483	base += sizeof (unsigned long); /* skip doc_count */
484	base += sizeof (unsigned long); /* skip count */
485	base += sizeof (unsigned long); /* skip invf_ptr */
486	res++;
487	}
488
489	/**
490	* GRB: Now take this wordform case by case
491	*/
492	for (j = 0; j < num_cases; j++)
493	{
494	TermEntry te;
495
496	if (res >= *num_words)
497	return (-1);
498	StemBlock_ReadWordString(&base, prev);
499
500	te.Word = copy_string (prev);
501	if (!te.Word)
502	FatalError (1, "Could NOT create memory to add term");
503	te.Stem = copy_string (prev);
504	if (!te.Stem)
505	FatalError (1, "Could NOT create memory to add term");
506	stemmer (2, sd->sdh.stemmer_num, te.Stem);
507
508	te.Count = 1;
509	te.WE.word_num = *first_word + res;
510	bcopy ((void ) base, (void ) &te.WE.doc_count, sizeof (te.WE.doc_count));
511	NTOHUL(te.WE.doc_count); /* [RPAP - Jan 97: Endian Ordering] */
512	te.WE.max_doc_count = te.WE.doc_count;
513	base += sizeof (te.WE.doc_count);
514
515	bcopy ((void ) base, (void ) &te.WE.count, sizeof (te.WE.count));
516	NTOHUL(te.WE.count);
517	base += sizeof (te.WE.count);
518
519	bcopy ((void ) base, (void ) &invfp, sizeof (te.WE.invf_ptr));
520	NTOHUL(invfp); /* [RPAP - Jan 97: Endian Ordering] */
521	te.WE.invf_ptr = invfp;
522	base += sizeof (te.WE.invf_ptr);
523
524	/* If the current word is the last word of the block the get the
525	length from last_invf_len */
526	if (res == *num_words - 1)
527	te.WE.invf_len = *last_invf_len;
528	else
529	{
530	u_char *oldbase = base;
531
532	/* Skip over most of the next word to get to the invf_ptr */
533	te.WE.invf_len = Word_SkipToNextInvfPtr(&base, invfp);
534	base = oldbase;
535	}
536
537	/* Add term entry to term list */
538	/* te.query_mask = NULL;*/
539	TermList_AddTermEntry (*tl, &te);
540
541	/**
542	* GRB: If this is the final case of this wordform in this block,
543	* and we have one or more remaining blocks containing the wordform,
544	* then read in next block
545	*/
546	if (res == *num_words - 1 && j + 1 < num_cases)
547	{
548	/* Read in next block */
549	block++;
550	StemBlock_read(sd, block, &first_word, &last_invf_len, &num_words,
551	&index, &num_indexes);
552	base = (u_char *) (index + num_indexes);
553	base += index[0];
554	res = 0;
555	blk_index = 0;
556	}
557	else
558	res++;
559	} /* end for num_cases */
560	} /* end for num_entries */
561	return (*tl)->num;
562	}
563
564
565	void
566	FreeStemDict (stemmed_dict * sd)
567	{
568	/* [RPAP - Jan 97: Stem Index Change] */
569	if (sd->stem1)
570	FreeStemIdx (sd->stem1);
571	if (sd->stem2)
572	FreeStemIdx (sd->stem2);
573	if (sd->stem3)
574	FreeStemIdx (sd->stem3);
575
576	delete (sd->index[0]);
577	delete (sd->index);
578	delete (sd->buffer);
579	delete (sd->pos);
580	delete sd;
581	}
582
583	/* [RPAP - Jan 97: Stem Index Change] */
584	void
585	FreeStemIdx (stemmed_idx * si)
586	{
587	delete si->index[0];
588	delete si->index;
589	delete si->buffer;
590	delete si->pos;
591	delete si;
592	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: