source: trunk/gsdl/src/mgpp/text/query.ranked.cpp @ 711

Last change on this file since 711 was 711, checked in by cs025, 25 years ago

Changes to eradicate Xmalloc

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 25.8 KB
1/**************************************************************************
2 *
3 * query.ranked.c -- Ranked query evaluation
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: query.ranked.cpp 711 1999-10-17 23:43:31Z cs025 $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25
26#include "memlib.h"
27#include "filestats.h"
28#include "messages.h"
29#include "timing.h"
30#include "sptree.h"
31#include "unitool.h"
32
33#include "mg.h"
34#include "invf.h"
35#include "text.h"
36#include "lists.h"
37#include "backend.h"
38#include "DocEntry.h"
39#include "stem_search.h"
40#include "weights.h"
41#include "text_get.h"
42#include "invf_get.h"
43#include "words.h"
44#include "stemmer.h"
45#include "locallib.h"
46#include "environment.h"
47#include "term_lists.h"
48#include "local_strings.h"
49#include "query_term_list.h" /* [RPAP - Feb 97: Term Frequency] */
50
51/*
52 $Log$
53 Revision 1.3 1999/10/17 23:43:29 cs025
54 Changes to eradicate Xmalloc
55
56 Revision 1.2 1999/10/12 04:13:52 cs025
57 Calls to GetEnv eradicated; interface to parsing of query text
58 cleaned up.
59
60 Revision 1.1 1999/10/11 02:58:14 cs025
61 Base install of MG-PP
62
63 Revision 1.1 1999/08/10 21:18:20 sjboddie
64 renamed mg-1.3d directory mg
65
66 Revision 1.2 1998/11/25 07:55:50 rjmcnab
67
 68 Modified mg so that you can specify the stemmer you want
69 to use via a command line option. You specify it to
70 mg_passes during the build process. The number of the
71 stemmer that you used is stored within the inverted
72 dictionary header and the stemmed dictionary header so
73 the correct stemmer is used in later stages of building
74 and querying.
75
76 Revision 1.1 1998/11/17 09:35:34 rjmcnab
77 *** empty log message ***
78
79 * Revision 1.4 1994/11/25 03:47:46 tes
80 * Committing files before adding the merge stuff.
81 *
82 * Revision 1.3 1994/10/20 03:57:03 tes
83 * I have rewritten the boolean query optimiser and abstracted out the
84 * components of the boolean query.
85 *
86 * Revision 1.2 1994/09/20 04:42:04 tes
87 * For version 1.1
88 *
89 */
90
91static char *RCSID = "$Id: query.ranked.cpp 711 1999-10-17 23:43:31Z cs025 $";
92
93/*************************************************************************/
94
95class HeapEntry : public loadableDocEntry
96 {
97 public:
98 float *OrgWeight;
99
100 HeapEntry()
101 {
102 // do nothing
103 }
104
105 inline void chain(int docno, float *weightPtr)
106 {
107 *OrgWeight = this->Weight;
108 DocNum = docno;
109 Weight = *weightPtr;
110 OrgWeight = weightPtr;
111 *OrgWeight = 0;
112 }
113
114 inline void setValues(int docno, float _weight, u_long filepos, u_long _len, float *weightPtr)
115 {
116 DocNum = docno;
117 OrgWeight = weightPtr;
118 Weight = _weight;
119 SeekPos = filepos;
120 Len = _len;
121 *weightPtr = 0;
122 }
123 }
124;
125
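/*
 * Aside: the OrgWeight pointer above is the subtle part of HeapEntry.
 * Each entry remembers the accumulator slot its document came from;
 * setValues() zeroes that slot when the document enters the heap, and
 * chain() writes the displaced entry's weight back into its old slot
 * before adopting a new document, so accumulator slots are zero exactly
 * for documents currently held in the heap.  A minimal, self-contained
 * sketch of the idea (simplified types, not the mgpp classes):
 */
#include <stdio.h>

struct MiniEntry
{
  int    DocNum;
  float  Weight;
  float *OrgWeight;            /* points at this document's accumulator slot */

  /* Document enters the heap: remember its slot and zero it. */
  void setValues (int docno, float w, float *slot)
  { DocNum = docno; Weight = w; OrgWeight = slot; *slot = 0; }

  /* A better document displaces this entry: restore the old document's
     weight to its slot, then take over the new document's slot. */
  void chain (int docno, float *slot)
  { *OrgWeight = Weight; DocNum = docno; Weight = *slot; OrgWeight = slot; *slot = 0; }
};

int main ()
{
  float accum[4] = { 0.9f, 0.4f, 0.7f, 0.8f };   /* per-document sums */
  MiniEntry e;
  e.setValues (1, accum[0], &accum[0]);          /* doc 1 held, accum[0] zeroed */
  e.chain (4, &accum[3]);                        /* doc 4 displaces doc 1       */
  /* accum[0] holds doc 1's weight (0.9) again; accum[3] is zero (doc 4 is held) */
  for (int i = 0; i < 4; i++)
    printf ("accum[%d] = %.1f\n", i, accum[i]);
  return 0;
}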
126typedef int (*HeapComp) (HeapEntry *, HeapEntry *);
127
128typedef struct Heap
129 {
130 int NumItems;
131 int MaxSize;
132 HeapComp HC;
133 HeapEntry *HE;
134
135 Heap(int size)
136 {
137 HE = new HeapEntry[size];
138 }
139
140 ~Heap()
141 {
142 if (HE != NULL)
143 delete [] HE; /* array form: HE was allocated with new HeapEntry[size] */
144 }
145 }
146Heap;
147
148
149Heap *
150Heap_Make (int size, HeapComp hc)
151{
152 Heap *H;
153 H = new Heap(size);
154 if (!H)
155 return NULL;
156 H->NumItems = 0;
157 H->MaxSize = size;
158 H->HC = hc;
159 return H;
160}
161
162int
163Heap_Size (Heap * H)
164{
165 return sizeof (Heap) + H->MaxSize * sizeof (HeapEntry);
166}
167
168HeapEntry *
169Heap_GetHead (Heap * H)
170{
171 if (H && H->NumItems)
172 return &H->HE[0];
173 else
174 return NULL;
175}
176
177
178
179void
180Heap_Heapify (Heap * H, int i)
181{
182 register int curr, child;
183 curr = i;
184 child = curr * 2;
185 while (child <= H->NumItems)
186 {
187 if (child < H->NumItems && H->HC (&H->HE[child], &H->HE[child - 1]) > 0)
188 child++;
189 if (H->HC (&H->HE[curr - 1], &H->HE[child - 1]) < 0)
190 {
191 HeapEntry temp = H->HE[child - 1];
192 H->HE[child - 1] = H->HE[curr - 1];
193 H->HE[curr - 1] = temp;
194 curr = child;
195 child = 2 * child;
196 }
197 else
198 break;
199 }
200}
201
202
203void
204Heap_Build (Heap * H)
205{
206 register int i;
207 for (i = H->NumItems / 2; i > 0; i--)
208 Heap_Heapify (H, i);
209}
210
211
212void
213Heap_Sort (Heap * H)
214{
215 register int i;
216 for (i = H->NumItems; i > 1; i--)
217 {
218 HeapEntry temp = H->HE[0];
219 H->HE[0] = H->HE[i - 1];
220 H->HE[i - 1] = temp;
221 H->NumItems--;
222 Heap_Heapify (H, 1);
223 }
224}
225
226
227void
228Heap_DeleteHead (Heap * H)
229{
230 H->HE[0] = H->HE[--H->NumItems];
231 Heap_Heapify (H, 1);
232}
233
234int
235Heap_Lesser (HeapEntry * a, HeapEntry * b)
236{
237 return (a->Weight > b->Weight ? -1 :
238 (a->Weight == b->Weight ? 0 : 1));
239}
240
241
242int
243Heap_Greater (HeapEntry * a, HeapEntry * b)
244{
245 return (a->Weight > b->Weight ? 1 :
246 (a->Weight == b->Weight ? 0 : -1));
247}
248
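/*
 * The two comparators serve different phases: Heap_Lesser keeps the current
 * k best candidates as a min-heap whose root is the weakest kept entry (so a
 * challenger only has to beat HE[0]), and once selection is finished the
 * comparator is switched to Heap_Greater so the best entry sits at the root
 * for extraction.  A self-contained sketch of that top-k pattern on plain
 * floats (hypothetical helper names, the same 1-origin index arithmetic as
 * Heap_Heapify above, not the mgpp structures):
 */
#include <stdio.h>

typedef int (*FloatComp) (const float *, const float *);

static int flt_lesser (const float *a, const float *b)
{ return *a > *b ? -1 : (*a == *b ? 0 : 1); }

static int flt_greater (const float *a, const float *b)
{ return *a > *b ? 1 : (*a == *b ? 0 : -1); }

/* Same 1-origin sift-down as Heap_Heapify, on a plain float array. */
static void heapify (float *he, int n, int i, FloatComp hc)
{
  int curr = i, child = 2 * i;
  while (child <= n)
    {
      if (child < n && hc (&he[child], &he[child - 1]) > 0)
        child++;
      if (hc (&he[curr - 1], &he[child - 1]) < 0)
        {
          float t = he[child - 1]; he[child - 1] = he[curr - 1]; he[curr - 1] = t;
          curr = child; child = 2 * child;
        }
      else
        break;
    }
}

static void build (float *he, int n, FloatComp hc)
{ for (int i = n / 2; i > 0; i--) heapify (he, n, i, hc); }

int main ()
{
  const float cand[] = { 0.2f, 0.9f, 0.4f, 0.7f, 0.1f, 0.8f, 0.6f };
  const int ncand = (int) (sizeof (cand) / sizeof (cand[0]));
  const int k = 3;
  float top[k];

  /* Phase 1: hold the k best seen so far in a min-heap, so a new
     candidate only has to beat the weakest kept entry, top[0]. */
  for (int i = 0; i < k; i++) top[i] = cand[i];
  build (top, k, flt_lesser);
  for (int i = k; i < ncand; i++)
    if (cand[i] > top[0]) { top[0] = cand[i]; heapify (top, k, 1, flt_lesser); }

  /* Phase 2: switch the comparator, rebuild as a max-heap and pop the best
     first -- the same move the ranking code below makes with Heap_Greater
     and Heap_DeleteHead before emitting its DocList. */
  build (top, k, flt_greater);
  for (int n = k; n > 0; n--)
    {
      printf ("%.1f ", top[0]);            /* prints 0.9 0.8 0.7 */
      top[0] = top[n - 1];                 /* Heap_DeleteHead    */
      heapify (top, n - 1, 1, flt_greater);
    }
  printf ("\n");
  return 0;
}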
249int
250Make_Exact_Root (query_data * qd, Heap * H)
251{
252 int num = 0;
253 HeapEntry *he = H->HE;
254 while (he->SeekPos == 0)
255 {
256 he->Weight = he->Weight *
257 GetLowerApproxDocWeight (qd->awd, he->docNum() - 1) /
258 he->FetchStart (qd);
259 Heap_Heapify (H, 1);
260 num++;
261 }
262 return num;
263}
264
265void
266Heap_Dump (Heap * H, int num)
267{
268 int i, l, lines, r;
269 if (num > H->NumItems)
270 num = H->NumItems;
271 lines = (num + 3) / 4;
272 for (l = 0; l < lines; l++)
273 for (r = 0; r < 4; r++)
274 {
275 i = lines * r + l;
276 if (i < num)
277 fprintf (stderr, "[%2d] %7.4f", i, H->HE[i].Weight);
278 fprintf (stderr, r == 3 ? "\n" : " ");
279 }
280 fprintf (stderr, "\n");
281}
282
283int Heap_chainWeight(Heap *H, HeapEntry *he, query_data *qd, int DocNum, float *SumPtr)
284{
285 *SumPtr = (*SumPtr) / GetLowerApproxDocWeight (qd->awd, DocNum - 1);
286 qd->num_of_accum ++;
287 if (*SumPtr <= he->docWeight())
288 return 0;
289
290 he->chain(DocNum, SumPtr);
291
292 Heap_Heapify (H, 1);
293 return 1;
294}
295
296int Heap_plainWeight(Heap *H, HeapEntry *he, query_data *qd, int DocNum, float *SumPtr)
297{
298 unsigned long SeekPos, Len;
299 float Weight;
300
301 if (*SumPtr == 0)
302 return 0;
303 if (*SumPtr <= he->docWeight())
304 return 0;
305
306 Weight = (*SumPtr) *
307 GetLowerApproxDocWeight (qd->awd, DocNum-1) /
308 FetchDocStart (qd, DocNum, &SeekPos, &Len);
309
310 if (Weight <= he->docWeight())
311 return 0;
312
313 he->setValues(DocNum, Weight, SeekPos, Len, SumPtr);
314 Heap_Heapify (H, 1);
315
316 return 1;
317}
318
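/*
 * Note on the three helpers above (Make_Exact_Root, Heap_chainWeight and
 * Heap_plainWeight).  Reading the code, the accumulator slot for document d
 * holds the sum S_d contributed by the query terms; GetLowerApproxDocWeight
 * returns a cheap lower approximation ~W_d of the document weight, and
 * FetchDocStart / FetchStart returns, along with the seek position, what
 * appears to be the exact weight W_d.  In LaTeX notation the two ranking
 * values being juggled are
 *
 *     approx(d) = \frac{S_d}{\tilde W_d}
 *
 *     exact(d)  = \frac{S_d}{W_d} = approx(d) \cdot \frac{\tilde W_d}{W_d}
 *
 * which is the rescaling performed by Make_Exact_Root and Heap_plainWeight,
 * while Heap_chainWeight works purely with approx(d).
 */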
319/*************************************************************************/
320
321
322
323
324
325
326int
327doc_count_comp (const void *A, const void *B)
328{
329 const TermEntry *a = (TermEntry *) A;
330 const TermEntry *b = (TermEntry *) B;
331 return (a->WE.doc_count - b->WE.doc_count);
332}
333
334/* =========================================================================
335 * Function: ParseRankedQuery
336 * Description:
337 * Takes a string query line and extracts the terms in the query
338 * which exist in the stemmed dictionary.
340 * Optionally sorts the terms into ascending order by document count (doc_count).
340 * Input:
341 * stemmed dictionary, query line, sort-flag
342 * Output:
343 * A list of terms.
344 * ========================================================================= */
345
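/*
 * The parser below handles words as length-prefixed u_char strings: the
 * first byte is the length, which is why MAXSTEMLEN + 1 byte buffers are
 * used and why a word is copied with bcopy (Word, sWord, *Word + 1).
 * A minimal sketch of that representation (the helpers are hypothetical,
 * not the mgpp parsing macros):
 */
#include <string.h>
#include <stdio.h>

typedef unsigned char u_char;

/* Build a length-prefixed word from a C string (truncated to 255 bytes). */
static void make_lp (u_char *dst, const char *src)
{
  size_t n = strlen (src);
  if (n > 255) n = 255;
  dst[0] = (u_char) n;
  memcpy (dst + 1, src, n);
}

/* Copy a length-prefixed word: the length byte plus that many data bytes,
   i.e. *src + 1 bytes in all -- the same count the bcopy call uses. */
static void copy_lp (u_char *dst, const u_char *src)
{ memcpy (dst, src, *src + 1); }

int main ()
{
  u_char word[256], copy[256];
  make_lp (word, "ranking");
  copy_lp (copy, word);
  printf ("len = %d, text = %.*s\n", copy[0], copy[0], (const char *) copy + 1);
  return 0;
}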
346/* [RPAP - Jan 97: Stem Index Change] */
347static TermList *
348ParseRankedQuery (query_data *qd, char *QueryLine,
349 RankedQueryInfo *rqi,
350 QueryTermList **query_term_list) /* [RPAP - Feb 97: Term Frequency] */
351{
352 u_char Word[MAXSTEMLEN + 1];
353 u_char sWord[MAXSTEMLEN + 1];
354 u_char *end, *s_in;
355 int default_stem_method = 0;
356 TermList *Terms = TermList_create(0);
357
358 s_in = (u_char *) QueryLine;
359 end = s_in + strlen ((char *) s_in) - 1;
360 *query_term_list = QueryTermList_create(0); /* [RPAP - Feb 97: Term Frequency] */
361
362 /* [RPAP - Jan 97: Stem Index Change] */
363 if (qd->sd->sdh.indexed)
364 default_stem_method = rqi->getWordSpecial();
365 //BooleanEnv (GetEnv ("casefold"), 0) | (BooleanEnv (GetEnv ("stem"), 0) << 1);
366 else
367 default_stem_method = qd->sd->sdh.stem_method;
368
369 while (s_in <= end)
370 {
371 int j;
372 long num_entries, word_num;
373 unsigned long count, doc_count, invf_ptr, invf_len;
374 int weight_to_apply, stem_to_apply;
375 int method_using = -1;
376
377 /* 0=optional, 1=mustmatch */
378 int require_match = 0; /* [RJM 07/97: Ranked Required Terms] */
379
380 /* Skip over the non word separator taking note of any parameters */
381 PARSE_RANKED_NON_STEM_WORD (require_match, s_in, end); /* [RJM 07/97: Ranked Required Terms] */
382 if (s_in > end) break;
383
384 /* Get a word and stem it */
385 PARSE_STEM_WORD (Word, s_in, end);
386
387 /* Extract any parameters */
388 weight_to_apply = 1;
389 stem_to_apply = default_stem_method;
390 while (s_in <= end)
391 {
392 int stem_param, weight_param, param_type;
393 char param[MAXPARAMLEN + 1];
394
395 param_type = 0;
396 PARSE_OPT_TERM_PARAM (param, param_type, s_in, end);
397 if (!param_type)
398 break;
399
400 switch (param_type)
401 {
402 case (WEIGHTPARAM):
403 weight_param = atoi (param);
404 if (errno != ERANGE && weight_param > 0)
405 weight_to_apply = weight_param;
406 break;
407
408 case (STEMPARAM):
409 stem_param = atoi (param);
410 if (errno != ERANGE && qd->sd->sdh.indexed && stem_param >= 0 && stem_param <= 3)
411 method_using = stem_to_apply = stem_param;
412 break;
413 }
414 }
415
416 bcopy ((void *) Word, (void *) sWord, *Word + 1);
417 stemmer (stem_to_apply, qd->sd->sdh.stemmer_num, sWord);
418
419 if (!qd->sd->sdh.indexed || stem_to_apply == 0)
420 {
421 /* Look for the word in the already identified terms */
422 for (j = 0; j < Terms->size(); j++)
423 if (compare (Terms->TE[j].Word, Word) == 0)
424 break;
425
426 /* Increment the weight if the word is in the list */
427 /* Update the require match attribute */
428 if (j < Terms->size())
429 {
430 Terms->TE[j].Count = ((Terms->TE[j].Count + weight_to_apply > INT_MAX) ?
431 INT_MAX : (Terms->TE[j].Count + weight_to_apply));
432 Terms->TE[j].require_match = require_match; /* [RJM 07/97: Ranked Require match] */
433 QueryTermList_AddQueryTerm (query_term_list, Word, Terms->TE[j].WE.count, method_using); /* [RPAP - Feb 97: Term Frequency] */
434 }
435 else
436 {
437 /* Look for it in the stemmed dictionary */
438 if ((word_num = FindWord (qd->sd, sWord, &count, &doc_count,
439 &invf_ptr, &invf_len)) != -1)
440 {
441 /* Search the list for the word */
442 for (j = 0; j < Terms->size(); j++)
443 if (Terms->TE[j].WE.word_num == word_num)
444 break;
445
446 /* Increment the weight if the word is in the list */
447 if (j < Terms->size())
448 {
449 Terms->TE[j].Count = ((Terms->TE[j].Count + weight_to_apply > INT_MAX) ?
450 INT_MAX : (Terms->TE[j].Count + weight_to_apply));
451 Terms->TE[j].require_match = require_match; /* [RJM 07/97: Ranked Require match] */
452 QueryTermList_AddQueryTerm (query_term_list, Word, Terms->TE[j].WE.count, method_using); /* [RPAP - Feb 97: Term Frequency] */
453 }
454 else
455 {
456 /* Create a new entry in the list for the new word */
457 TermEntry te;
458
459 te.WE.word_num = word_num;
460 te.WE.count = count;
461 te.WE.doc_count = doc_count;
462 te.WE.max_doc_count = doc_count;
463 te.WE.invf_ptr = invf_ptr;
464 te.WE.invf_len = invf_len;
465 te.Count = weight_to_apply;
466 te.Word = copy_string (Word);
467 if (!te.Word)
468 FatalError (1, "Could NOT create memory to add term");
469 te.Stem = NULL;
470 te.require_match = require_match;
471
472 TermList_AddTermEntry (Terms, &te);
473
474 /* [RPAP - Feb 97: Term Frequency] */
475 QueryTermList_AddQueryTerm (query_term_list, Word, count, method_using);
476 }
477 }
478 /* [RPAP - Feb 97: Term Frequency] */
479 else
480 QueryTermList_AddQueryTerm (query_term_list, Word, 0, method_using);
481 }
482 }
483 else
484 {
485 int total_count = 0; /* [RPAP - Feb 97: Term Frequency] */
486 TermList *tempList = TermList_create (0);
487
488 if ((num_entries = FindWords (qd->sd, sWord, stem_to_apply, &tempList)) > 0)
489 {
490 int i;
491 u_long max_doc_count = 0;
492
493 /* get the maximum doc count */
494 for (i = 0; i < tempList->size(); i++)
495 {
496 if (tempList->TE[i].WE.doc_count > max_doc_count)
497 max_doc_count = tempList->TE[i].WE.doc_count;
498 total_count += tempList->TE[i].WE.count; /* [RPAP - Feb 97: Term Frequency] */
499 }
500
501 for (i = 0; i < tempList->size(); i++)
502 {
503 /* Look for the word(s) in the already identified terms */
504 for (j = 0; j < Terms->size(); j++)
505 {
506 if (compare (Terms->TE[j].Word, tempList->TE[i].Word) == 0)
507 {
508 /* found the word */
509 /* Modify weight */
510 Terms->TE[j].Count = ((Terms->TE[j].Count + weight_to_apply > INT_MAX) ?
511 INT_MAX : (Terms->TE[j].Count + weight_to_apply));
512 if (Terms->TE[j].WE.max_doc_count < max_doc_count)
513 Terms->TE[j].WE.max_doc_count = max_doc_count;
514 break;
515 }
516 }
517
518 if (j == Terms->size())
519 {
520 /* word was not found */
521 tempList->TE[i].WE.max_doc_count = max_doc_count;
522 tempList->TE[i].Count = weight_to_apply;
523
524 /* We cannot require a term to match if it is expanded */
525 /* into multiple terms :-( */
526 tempList->TE[i].require_match = 0; /* [RJM 07/97: Ranked Required Terms] */
527
528 TermList_AddTermEntry (Terms, &tempList->TE[i]);
529 }
530 }
531 }
532
533 /* [RPAP - Feb 97: Term Frequency] */
534 QueryTermList_AddQueryTerm (query_term_list, Word, total_count, method_using);
535
536 if (tempList != NULL) Xfree(tempList); /* [RJM 07/98: Memory Leak] */
537 } /* end indexed */
538 } /* end while */
539
540 if (rqi->Sort)
541 /* Sort the terms in ascending order by doc_count */
542 qsort (Terms->TE, Terms->size(), sizeof (TermEntry), doc_count_comp);
543 return (Terms);
544}
545
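/*
 * On the stem methods accepted above: stem_param is range-checked to 0..3,
 * and the commented-out line near the top of ParseRankedQuery builds the
 * default as casefold | (stem << 1).  That suggests the value is a two-bit
 * mask -- bit 0 = case-folding, bit 1 = stemming -- though that reading is
 * an inference from this file, not something it states.  A hypothetical
 * helper illustrating that interpretation:
 */
#include <stdio.h>

enum { MG_CASEFOLD = 1, MG_STEM = 2 };   /* assumed meaning of the two bits */

static const char *describe_stem_method (int method)
{
  switch (method & (MG_CASEFOLD | MG_STEM))
    {
    case 0:           return "no case-folding, no stemming";
    case MG_CASEFOLD: return "case-fold only";
    case MG_STEM:     return "stem only";
    default:          return "case-fold and stem";
    }
}

int main ()
{
  for (int m = 0; m <= 3; m++)
    printf ("stem method %d: %s\n", m, describe_stem_method (m));
  return 0;
}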
546
547/*
548 * This function is given a list of term numbers and it returns a list
549 * of document numbers based on the cosine document weighting system.
550 * The entries are accumulated in an array (or another accumulator
551 * structure, depending on rqi->AccumMethod) as the inverted file is decoded.
552 * If MaxDocs == -1 then it means all.
553 */
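/*
 * In the usual MG formulation the quantity being ranked is (up to the query
 * norm, which is constant across documents and so can be ignored)
 *
 *     rank(q, d) = \frac{1}{W_d} \sum_{t \in q} w_{q,t} \, w_{d,t}
 *
 * The accumulator structure chosen below builds up the sum S_d for each
 * candidate document; division by the document weight W_d is done first with
 * the cheap lower approximation and, if rqi->Exact is set, redone with the
 * exact weight.  (This is a presumed reading of the code, not a statement
 * taken from it.)
 */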
554static DocList *
555CosineGet (query_data * qd, TermList * Terms, RankedQueryInfo * rqi) {
556 DocList *Docs;
557 float *AccumulatedWeights = NULL;
558 Splay_Tree *ST = NULL;
559 Splay_Tree *Para_ST = NULL;
560 Hash_Table *HT = NULL;
561 List_Table *LT = NULL;
562 Heap *H;
563 HeapEntry *he;
564 register float *fptr = NULL;
565 register Invf_Doc_Entry *ide = NULL;
566 register Invf_Doc_EntryH *ideh = NULL;
567 int BackEnd, NumExact, MaxExact, NumParas;
568 int MaxDocs = 0, MaxParas = 0;
569 int i;
570 Invf_Doc_Entry_Pool ide_pool;
571 ide_pool.pool = NULL;
572
573 qd->hops_taken = qd->num_of_ptrs = qd->num_of_accum = 0;
574
575 switch (rqi->AccumMethod)
576 {
577 case 'S':
578 ST = CosineDecodeSplay (qd, Terms, rqi, &ide_pool);
579 if (!ST)
580 return NULL;
581 break;
582 case 'A':
583 AccumulatedWeights = CosineDecode (qd, Terms, rqi);
584 if (!AccumulatedWeights)
585 return NULL;
586 break;
587 case 'H':
588 HT = CosineDecodeHash (qd, Terms, rqi);
589 if (!HT)
590 return NULL;
591 break;
592 case 'L':
593 LT = CosineDecodeList (qd, Terms, rqi);
594 if (!LT)
595 return NULL;
596 break;
597 }
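  /* The AccumMethod codes select how the per-document sums are accumulated
     while the inverted lists are decoded: 'S' keeps them in a splay tree,
     'A' in a flat array with one float per document, 'H' in a hash table
     and 'L' in a list.  The rest of this function only differs in how it
     walks whichever structure was chosen. */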
598
599#if 0
600 if (rqi->UseSplayTree)
601 {
602
603 AccumulatedWeights = CosineDecode (qd, Terms, rqi);
604 fptr = AccumulatedWeights;
605 ide = SP_get_first (ST);
606 for (i = 0; i < qd->sd->sdh.num_of_docs; i++)
607 {
608 if (AccumulatedWeights[i] != 0)
609 {
610 if (i != ide->DocNum)
611 fprintf (stderr, "Sum mismatch for %d %f %d %f\n", i + 1,
612 AccumulatedWeights[i], ide->docNum() + 1, ide->docWeight());
613 ide = SP_get_next (ST);
614 }
615 }
616 }
617#endif
618
619 switch (rqi->AccumMethod)
620 {
621 case 'S':
622 MaxParas = ST->no_of_items;
623 break;
624 case 'A':
625 { /* count the number of non-zero document weights */
626 register int i = qd->sd->sdh.num_of_docs;
627 register float *d;
628 MaxParas = 0;
629 for (d = AccumulatedWeights; i; i--, d++)
630 {
631 if (*d)
632 {
633 MaxParas++;
634 }
635 }
636 }
637 break;
638 case 'H':
639 MaxParas = HT->num + HT->Suplimentary_Num;
640 break;
641 case 'L':
642 MaxParas = LT->num;
643 break;
644 }
645
646 if (rqi->MaxParasToRetrieve != -1 && MaxParas > rqi->MaxParasToRetrieve)
647 MaxParas = rqi->MaxParasToRetrieve;
648 MaxDocs = MaxParas;
649
650 /* Allocate memory for the heap */
651 Docs = DocList_create (MaxDocs);
652 ChangeMemInUse (qd, sizeof (DocEntry) * MaxDocs);
653
654 H = Heap_Make (MaxDocs, Heap_Lesser);
655
656
657 /* Get the sums from the accumulator, divide each sum by the
658 document weight (retrieved from the ".idx.wgt" file) and put
659 the resulting data into a heap */
660 he = H->HE;
661 H->NumItems = MaxDocs;
662 switch (rqi->AccumMethod)
663 {
664 case 'S':
665 {
666 ide = (Invf_Doc_Entry *) SP_get_first (ST);
667 for (i = 0; i < H->NumItems; i++, ide = (Invf_Doc_Entry *) SP_get_next (ST), he++)
668 {
669 he->DocNum = ide->docNum() + 1;
670 he->OrgWeight = &ide->Weight;
671 qd->num_of_accum++;
672 }
673 }
674 break;
675 case 'A':
676 {
677 fptr = AccumulatedWeights;
678 for (i = 0; i < H->NumItems; i++, fptr++, he++)
679 {
680 he->DocNum = i + 1;
681 he->OrgWeight = fptr;
682 if (*fptr)
683 qd->num_of_accum++;
684 }
685 }
686 break;
687 case 'H':
688 {
689 ideh = HT->Head;
690 for (i = 0; i < H->NumItems; i++, ideh = ideh->next, he++)
691 {
692 he->DocNum = ideh->DocNum + 1;
693 he->OrgWeight = &ideh->Weight;
694 qd->num_of_accum++;
695 }
696 }
697 break;
698 case 'L':
699 {
700 ide = LT->IDE;
701 for (i = 0; i < H->NumItems; i++, ide++, he++)
702 {
703 he->DocNum = ide->DocNum + 1;
704 he->OrgWeight = &ide->Weight;
705 qd->num_of_accum++;
706 }
707 }
708 break;
709 }
710
711 he = H->HE;
712 for (i = 0; i < H->NumItems; i++, he++)
713 {
714 *he->OrgWeight /= GetLowerApproxDocWeight (qd->awd, he->DocNum - 1);
715 he->Weight = *he->OrgWeight;
716 *he->OrgWeight = 0;
717 he->SeekPos = he->Len = 0;
718 }
719
720 Heap_Build (H);
721
722 he = H->HE;
723 switch (rqi->AccumMethod)
724 {
725 case 'S':
726 {
727 for (i = MaxDocs; i < ST->no_of_items; i++, ide = (Invf_Doc_Entry *) SP_get_next (ST))
728 {
729 /*
730 ide->Sum /= GetLowerApproxDocWeight (qd->awd, ide->DocNum);
731 qd->num_of_accum++;
732 if (ide->Sum <= he->Weight)
733 continue;
734 */
735 Heap_chainWeight(H, he, qd, ide->DocNum + 1, &ide->Weight);
736 /*
737 *he->OrgWeight = he->Weight;
738 he->DocNum = ide->DocNum + 1;
739 he->Weight = ide->Sum;
740 he->OrgWeight = &ide->Sum;
741 *he->OrgWeight = 0;
742 Heap_Heapify (H, 1);
743 */
744 }
745 }
746 break;
747 case 'A':
748 {
749 for (i = MaxDocs; i < qd->sd->sdh.num_of_docs; i++, fptr++)
750 {
751 if (!*fptr)
752 continue;
753 /*
754 *fptr /= GetLowerApproxDocWeight (qd->awd, i);
755 qd->num_of_accum++;
756 if (*fptr <= he->Weight)
757 continue;
758 */
759 Heap_chainWeight(H, he, qd, i+1, fptr);
760 /*
761 *he->OrgWeight = he->Weight;
762 he->DocNum = i + 1;
763 he->Weight = *fptr;
764 he->OrgWeight = fptr;
765 *he->OrgWeight = 0;
766 Heap_Heapify (H, 1);
767 */
768 }
769 }
770 break;
771 case 'H':
772 {
773 for (; ideh != NULL; ideh = ideh->next)
774 {
775 /*
776 ideh->Sum /=
777 GetLowerApproxDocWeight (qd->awd, ideh->DocNum);
778 qd->num_of_accum++;
779 if (ideh->Sum <= he->Weight)
780 continue;
781 */
782 Heap_chainWeight(H, he, qd, ideh->DocNum + 1, &ideh->Weight);
783 /*
784 *he->OrgWeight = he->Weight;
785 he->DocNum = ideh->DocNum + 1;
786 he->Weight = ideh->Sum;
787 he->OrgWeight = &ideh->Sum;
788 *he->OrgWeight = 0;
789 Heap_Heapify (H, 1);
790 */
791 }
792 }
793 break;
794 case 'L':
795 {
796 for (i = MaxDocs; i < LT->num; i++, ide++)
797 {
798 /*
799 ide->Sum /=
800 GetLowerApproxDocWeight (qd->awd, ide->DocNum);
801 qd->num_of_accum++;
802 if (ide->Sum <= he->Weight)
803 continue;
804 */
805 Heap_chainWeight(H, he, qd, ide->DocNum + 1, &ide->Weight);
806 /*
807 *he->OrgWeight = he->Weight;
808 he->DocNum = ide->DocNum + 1;
809 he->Weight = ide->Sum;
810 he->OrgWeight = &ide->Sum;
811 *he->OrgWeight = 0;
812 Heap_Heapify (H, 1);
813 */
814 }
815 }
816 break;
817 }
818
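  /* Exact re-ranking pass: when rqi->Exact is set (and the index is not a
     paragraph-level one), each heap entry's approximate weight is rescaled
     to its exact value -- multiply back the lower approximation, divide by
     the exact weight fetched along with the document start -- the heap is
     rebuilt, and every accumulator entry that never made it into the heap
     gets one more chance to enter via Heap_plainWeight. */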
819 if (rqi->Exact && qd->id->ifh.InvfLevel != 3)
820 {
821 HeapEntry *he = H->HE;
822
823 for (i = 0; i < H->NumItems; i++, he++)
824 {
825 he->Weight = he->docWeight() *
826 GetLowerApproxDocWeight (qd->awd, he->docNum() - 1) /
827 he->FetchStart (qd);
828 }
829
830 Heap_Build (H);
831
832 he = H->HE;
833
834 switch (rqi->AccumMethod)
835 {
836 case 'S':
837 {
838 ide = (Invf_Doc_Entry *) SP_get_first (ST);
839 for (i = 0; i < ST->no_of_items; i++, ide = (Invf_Doc_Entry *) SP_get_next (ST))
840 {
841 /*
842 u_long SeekPos, Len;
843 float Weight;
844 if (!ide->Sum)
845 continue;
846 if (ide->Sum <= he->Weight)
847 continue;
848 Weight = ide->Sum *
849 GetLowerApproxDocWeight (qd->awd, ide->DocNum) /
850 FetchDocStart (qd, ide->DocNum + 1, &SeekPos, &Len);
851 if (Weight <= he->Weight)
852 continue;
853 */
854 if (Heap_plainWeight(H, he, qd, ide->DocNum + 1, &ide->Weight) == 0)
855 continue;
856 /*
857 he->DocNum = ide->DocNum + 1;
858 he->OrgWeight = &ide->Sum;
859 he->Weight = Weight;
860 he->SeekPos = SeekPos;
861 he->Len = Len;
862 ide->Sum = 0;
863 Heap_Heapify (H, 1);
864 */
865 }
866 }
867 break;
868
869 /* up to here */
870
871 case 'A':
872 {
873 fptr = AccumulatedWeights;
874 for (i = 0; i < qd->sd->sdh.num_of_docs; i++, fptr++)
875 {
876 /*
877 u_long SeekPos, Len;
878 float Weight;
879 if (!*fptr)
880 continue;
881 if (*fptr <= he->Weight)
882 continue;
883 Weight = *fptr *
884 GetLowerApproxDocWeight (qd->awd, i) /
885 FetchDocStart (qd, i + 1, &SeekPos, &Len);
886 if (Weight <= he->Weight)
887 continue;
888 */
889 if (Heap_plainWeight(H, he, qd, i+1, fptr) == 0)
890 continue;
891 /*
892 he->DocNum = i + 1;
893 he->OrgWeight = fptr;
894 he->Weight = Weight;
895 he->SeekPos = SeekPos;
896 he->Len = Len;
897 *fptr = 0;
898 Heap_Heapify (H, 1);
899 */
900 }
901 }
902 break;
903 case 'H':
904 {
905 ideh = HT->Head;
906 for (ideh = HT->Head; ideh; ideh = ideh->next)
907 {
908 /*
909 u_long SeekPos, Len;
910 float Weight;
911 if (!ideh->Sum)
912 continue;
913 if (ideh->Sum <= he->Weight)
914 continue;
915
916 Weight = ideh->Sum *
917 GetLowerApproxDocWeight (qd->awd, ideh->DocNum) /
918 FetchDocStart (qd, ideh->DocNum + 1, &SeekPos, &Len);
919 if (Weight <= he->Weight)
920 continue;
921 */
922 if (Heap_plainWeight(H, he, qd, ideh->docNum() + 1, &ideh->Weight) == 0)
923 continue;
924 /*
925 he->DocNum = ideh->DocNum + 1;
926 he->OrgWeight = &ideh->Sum;
927 he->Weight = Weight;
928 he->SeekPos = SeekPos;
929 he->Len = Len;
930 ideh->Sum = 0;
931 Heap_Heapify (H, 1);
932 */
933 }
934 }
935 break;
936 case 'L':
937 {
938 ide = LT->IDE;
939 for (i = 0; i < LT->num; i++, ide++)
940 {
941 /*u_long SeekPos, Len;
942 float Weight;
943 if (!ide->Sum)
944 continue;
945 if (ide->Sum <= he->Weight)
946 continue;
947
948 Weight = ide->Sum *
949 GetLowerApproxDocWeight (qd->awd, ide->DocNum) /
950 FetchDocStart (qd, ide->DocNum + 1, &SeekPos, &Len);
951 if (Weight <= he->Weight)
952 continue;
953 */
954 if (Heap_plainWeight(H, he, qd, ide->DocNum+1, &ide->Weight) == 0)
955 continue;
956 /*
957 he->DocNum = ide->DocNum + 1;
958 he->OrgWeight = &ide->Sum;
959 he->Weight = Weight;
960 he->SeekPos = SeekPos;
961 he->Len = Len;
962 ide->Sum = 0;
963 Heap_Heapify (H, 1);
964 */
965 }
966 }
967 break;
968 }
969 }
970
971 H->HC = Heap_Greater;
972 Heap_Build (H);
973
974
975 MaxDocs = H->NumItems;
976 if (rqi->MaxDocsToRetrieve != -1 && MaxDocs > rqi->MaxDocsToRetrieve)
977 MaxDocs = rqi->MaxDocsToRetrieve;
978
979 /* Alarm */
980
981 he = H->HE;
982 BackEnd = H->NumItems - 1;
983 NumExact = 0;
984 MaxExact = H->NumItems;
985 NumParas = 0;
986 Para_ST = SP_createset (DocEntry_compare);
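  /* Output loop: repeatedly take the best remaining heap entry.  With a
     paragraph-level index several entries can resolve to the same document,
     so Para_ST is used to spot repeats; a repeat is stored from the back of
     Docs (BackEnd) and chained onto the already-retrieved DocEntry through
     its Next field instead of counting as a new document.  When rqi->Exact
     is set, Make_Exact_Root keeps converting the root to its exact weight
     until the root is an entry whose document start has been fetched. */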
987 while (H->NumItems && Docs->size() < MaxDocs)
988 {
989 DocEntry DocEnt;
990 DocEntry *mem;
991
992 if (rqi->Exact)
993 {
994 if (H->HE[0].SeekPos == 0)
995 NumExact += Make_Exact_Root (qd, H);
996 }
997 else
998 he->FetchStart (qd);
999
1000 NumParas++;
1001
1002 DocEnt.DocNum = he->DocNum;
1003 DocEnt.Weight = he->Weight;
1004 DocEnt.Len = he->Len;
1005 DocEnt.SeekPos = he->SeekPos;
1006 DocEnt.CompTextBuffer = NULL;
1007 DocEnt.Next = NULL;
1008
1009 Heap_DeleteHead (H);
1010
1011 if (!(mem = (DocEntry *) SP_member (&DocEnt, Para_ST)))
1012 {
1013 Docs->addMember(&DocEnt);
1014 SP_insert (Docs->lastMember(), Para_ST);
1015 }
1016 else
1017 {
1018 DocEnt.Next = DocEntry_getNext(mem);
1019 Docs->setMember(BackEnd, &DocEnt);
1020 DocEntry_setNext(mem, Docs->member(BackEnd--));
1021 }
1022 }
1023 SP_freeset (Para_ST);
1024
1025 if (qd->id->ifh.InvfLevel == 3)
1026 {
1027 Message ("%d Paragraphs were required to get %d documents",
1028 NumParas, Docs->size());
1029 if (NumExact == MaxExact)
1030 {
1031 Message ("The exact weights of all %d paragraphs had to be calculated", MaxExact);
1032 Message ("to obtain %d paragraphs. This may mean that the documents", NumParas);
1033 Message ("returned do not necessarily represent an exact cosine ranking.");
1034 Message ("This problem may be corrected by increasing \'maxparas\'.");
1035 }
1036 }
1037#if 0
1038 {
1039 int i;
1040 FILE *f = fopen ("top.paras", "w");
1041 fprintf (f, "=========================\nTop Paragraphs\n");
1042 for (i = 0; i < Docs->size(); i++)
1043 {
1044 DocEntry *d;
1045 fprintf (f, "<%d(%f)> ", Heap[i].DocNum, Heap[i].Weight);
1046 for (d = Heap[i].Next; d; d = d->Next)
1047 fprintf (f, "%d(%f) ", d->docNum(), d->docWeight());
1048 fprintf (f, "\n");
1049 }
1050 fprintf (f, "=========================\n");
1051 fclose (f);
1052 }
1053#endif
1054
1055 if (AccumulatedWeights)
1056 {
1057 Xfree (AccumulatedWeights);
1058 ChangeMemInUse (qd, -sizeof (float) * qd->sd->sdh.num_of_docs);
1059 }
1060 if (ST)
1061 {
1062 int mem = ST->mem_in_use;
1063 SP_freeset (ST);
1064 ChangeMemInUse (qd, -mem);
1065 free_ide_pool (qd, &ide_pool);
1066 }
1067 if (HT)
1068 HT_free (qd, HT);
1069
1070 if (LT)
1071 LT_free (qd, LT);
1072
1073 if (H) delete H; /* [RJM 07/98: Memory Leak] */
1074
1075 return (Docs);
1076}
1077
1078
1079
1080
1081/* if MaxDocs == -1 it means all */
1082void
1083RankedQuery (query_data *qd, char *Query, RankedQueryInfo *rqi)
1084{
1085 DocList *dl;
1086
1087 if (qd->TL)
1088 TermList_destroy (&(qd->TL));
1089
1090 /* [RPAP - Feb 97: Term Frequency] */
1091 if (qd->QTL)
1092 QueryTermList_free (&(qd->QTL));
1093
1094 qd->TL = ParseRankedQuery (qd, Query, rqi, &(qd->QTL)); /* [RPAP - Feb 97: Term Frequency] */
1095
1096 /* TermList_print (qd->TL, stderr); */
1097
1098 dl = CosineGet (qd, qd->TL, rqi);
1099
1100 if (!dl)
1101 FatalError (1, "Out of memory\n");
1102
1103 QueryData_FreeQueryDocs (qd);
1104
1105 qd->DL = dl;
1106}
1107
1108
1109
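/*
 * A toy, self-contained illustration of the data flow RankedQuery drives:
 * query terms are turned into per-document sums, which are then normalised
 * by document weight and reported best-first.  It uses a plain std::sort in
 * place of the heap machinery above, made-up postings, and none of the mgpp
 * types -- a sketch of the shape of the computation only.
 */
#include <stdio.h>
#include <algorithm>

struct Posting { int doc; float wdt; };        /* document number, term weight */
struct Ranked  { int doc; float weight; };

static bool better (const Ranked &a, const Ranked &b) { return a.weight > b.weight; }

int main ()
{
  /* Made-up postings for two query terms, and per-document weights (1-based). */
  const Posting term1[] = { {1, 0.5f}, {3, 0.8f}, {4, 0.2f} };
  const Posting term2[] = { {2, 0.6f}, {3, 0.4f} };
  const float wqt1 = 1.0f, wqt2 = 2.0f;        /* query-term weights (Count)   */
  const float W[5] = { 0, 1.2f, 0.9f, 1.5f, 0.7f };

  /* Accumulate S_d -- the 'A' (array) accumulator method in miniature. */
  float S[5] = { 0 };
  for (int i = 0; i < 3; i++) S[term1[i].doc] += wqt1 * term1[i].wdt;
  for (int i = 0; i < 2; i++) S[term2[i].doc] += wqt2 * term2[i].wdt;

  /* Normalise by document weight and report best-first. */
  Ranked r[4];
  int n = 0;
  for (int d = 1; d <= 4; d++)
    if (S[d] > 0) { r[n].doc = d; r[n].weight = S[d] / W[d]; n++; }
  std::sort (r, r + n, better);
  for (int i = 0; i < n; i++)
    printf ("doc %d  weight %.3f\n", r[i].doc, r[i].weight);
  return 0;
}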