source: gsdl/trunk/trunk/mg/src/text/query.ranked.c@ 16583

Last change on this file since 16583 was 16583, checked in by davidb, 16 years ago

Undoing change committed in r16582

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 23.4 KB
Line 
1/**************************************************************************
2 *
3 * query.ranked.c -- Ranked query evaluation
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: query.ranked.c 16583 2008-07-29 10:20:36Z davidb $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25
26#include "memlib.h"
27#include "filestats.h"
28#include "messages.h"
29#include "timing.h"
30#include "sptree.h"
31
32#include "mg.h"
33#include "invf.h"
34#include "text.h"
35#include "lists.h"
36#include "backend.h"
37#include "stem_search.h"
38#include "weights.h"
39#include "text_get.h"
40#include "invf_get.h"
41#include "words.h"
42#include "stemmer.h"
43#include "locallib.h"
44#include "environment.h"
45#include "term_lists.h"
46#include "local_strings.h"
47#include "query_term_list.h" /* [RPAP - Feb 97: Term Frequency] */
48
49/*
50 $Log$
51 Revision 1.1 2003/02/20 21:18:24 mdewsnip
52 Addition of MG package for search and retrieval
53
54 Revision 1.1 1999/08/10 21:18:20 sjboddie
55 renamed mg-1.3d directory mg
56
57 Revision 1.2 1998/11/25 07:55:50 rjmcnab
58
59 Modified mg to that you can specify the stemmer you want
60 to use via a command line option. You specify it to
61 mg_passes during the build process. The number of the
62 stemmer that you used is stored within the inverted
63 dictionary header and the stemmed dictionary header so
64 the correct stemmer is used in later stages of building
65 and querying.
66
67 Revision 1.1 1998/11/17 09:35:34 rjmcnab
68 *** empty log message ***
69
70 * Revision 1.4 1994/11/25 03:47:46 tes
71 * Committing files before adding the merge stuff.
72 *
73 * Revision 1.3 1994/10/20 03:57:03 tes
74 * I have rewritten the boolean query optimiser and abstracted out the
75 * components of the boolean query.
76 *
77 * Revision 1.2 1994/09/20 04:42:04 tes
78 * For version 1.1
79 *
80 */
81
/* RCS identification string; kept so ident(1) can find the revision in the binary. */
static char *RCSID = "$Id: query.ranked.c 16583 2008-07-29 10:20:36Z davidb $";
83
84/*************************************************************************/
85
/* One candidate document in the ranking heap. */
typedef struct HeapEntry
  {
    float Weight;		/* current ranking weight (approximate until made exact) */
    float *OrgWeight;		/* pointer back to the accumulator this weight came from */
    int DocNum;			/* document number, 1-origin */
    u_long SeekPos;		/* position in the text file in bytes */
    u_long Len;			/* length of the document in bytes */
  }
HeapEntry;

/* Heap ordering predicate: returns <0, 0 or >0 like strcmp. */
typedef int (*HeapComp) (HeapEntry *, HeapEntry *);

typedef struct Heap
  {
    int NumItems;		/* number of live entries in HE[] */
    int MaxSize;		/* allocated capacity of HE[] */
    HeapComp HC;		/* comparison function defining the heap order */
    HeapEntry HE[1];		/* struct hack: actually MaxSize entries (see Heap_Make) */
  }
Heap;
106
107
108Heap *
109Heap_Make (int size, HeapComp hc)
110{
111 Heap *H;
112 H = Xmalloc (sizeof (Heap) + (size - 1) * sizeof (HeapEntry));
113 if (!H)
114 return NULL;
115 H->NumItems = 0;
116 H->MaxSize = size;
117 H->HC = hc;
118 return H;
119}
120
121int
122Heap_Size (Heap * H)
123{
124 return sizeof (Heap) + H->MaxSize * sizeof (HeapEntry);
125}
126
127HeapEntry *
128Heap_GetHead (Heap * H)
129{
130 if (H && H->NumItems)
131 return &H->HE[0];
132 else
133 return NULL;
134}
135
136
137
/* Sift the entry at 1-origin position i down the heap until the heap
   property (parent ordered before children under HC) is restored.
   Positions are 1-origin while the HE[] array is 0-origin, hence the
   pervasive "- 1" adjustments. */
void
Heap_Heapify (Heap * H, int i)
{
  register int curr, child;
  curr = i;
  child = curr * 2;		/* left child of curr, 1-origin */
  while (child <= H->NumItems)
    {
      /* Pick whichever of the two children orders first under HC. */
      if (child < H->NumItems && H->HC (&H->HE[child], &H->HE[child - 1]) > 0)
	child++;
      if (H->HC (&H->HE[curr - 1], &H->HE[child - 1]) < 0)
	{
	  /* Child orders before parent: swap and continue downwards. */
	  HeapEntry temp = H->HE[child - 1];
	  H->HE[child - 1] = H->HE[curr - 1];
	  H->HE[curr - 1] = temp;
	  curr = child;
	  child = 2 * child;
	}
      else
	break;			/* heap property holds; done */
    }
}
160
161
162void
163Heap_Build (Heap * H)
164{
165 register int i;
166 for (i = H->NumItems / 2; i > 0; i--)
167 Heap_Heapify (H, i);
168}
169
170
171void
172Heap_Sort (Heap * H)
173{
174 register int i;
175 for (i = H->NumItems; i > 1; i--)
176 {
177 HeapEntry temp = H->HE[0];
178 H->HE[0] = H->HE[i - 1];
179 H->HE[i - 1] = temp;
180 H->NumItems--;
181 Heap_Heapify (H, 1);
182 }
183}
184
185
186void
187Heap_DeleteHead (Heap * H)
188{
189 H->HE[0] = H->HE[--H->NumItems];
190 Heap_Heapify (H, 1);
191}
192
193int
194Heap_Lesser (HeapEntry * a, HeapEntry * b)
195{
196 return (a->Weight > b->Weight ? -1 :
197 (a->Weight == b->Weight ? 0 : 1));
198}
199
200
201int
202Heap_Greater (HeapEntry * a, HeapEntry * b)
203{
204 return (a->Weight > b->Weight ? 1 :
205 (a->Weight == b->Weight ? 0 : -1));
206}
207
/* Turn the heap root's approximate weight into an exact one.  While the
   root entry has not yet had its document start fetched (SeekPos == 0),
   rescale its weight by the ratio of the lower-approximation document
   weight to the value returned by FetchDocStart (which also fills in
   SeekPos/Len as a side effect), then re-heapify so a different entry
   may surface.  Returns the number of entries made exact. */
int
Make_Exact_Root (query_data * qd, Heap * H)
{
  int num = 0;
  HeapEntry *he = H->HE;
  while (he->SeekPos == 0)
    {
      he->Weight = he->Weight *
	GetLowerApproxDocWeight (qd->awd, he->DocNum - 1) /
	FetchDocStart (qd, he->DocNum, &he->SeekPos, &he->Len);
      Heap_Heapify (H, 1);
      num++;
    }
  return num;
}
223
224void
225Heap_Dump (Heap * H, int num)
226{
227 int i, l, lines, r;
228 if (num > H->NumItems)
229 num = H->NumItems;
230 lines = (num + 3) / 4;
231 for (l = 0; l < lines; l++)
232 for (r = 0; r < 4; r++)
233 {
234 i = lines * r + l;
235 if (i < num)
236 fprintf (stderr, "[%2d] %7.4f", i, H->HE[i].Weight);
237 fprintf (stderr, r == 3 ? "\n" : " ");
238 }
239 fprintf (stderr, "\n");
240}
241
242/*************************************************************************/
243
244
245
246
247
248
249int
250doc_count_comp (const void *A, const void *B)
251{
252 const TermEntry *a = A;
253 const TermEntry *b = B;
254 return (a->WE.doc_count - b->WE.doc_count);
255}
256
257/* =========================================================================
258 * Function: ParseRankedQuery
259 * Description:
260 * Takes a string query line and extracts the terms in the query
261 * which exist in the stemmed dictionary.
262 * Optionally sorts the terms into ascending order by doc. number.
263 * Input:
264 * stemmed dictionary, query line, sort-flag
265 * Output:
266 * A list of terms.
267 * ========================================================================= */
268
269/* [RPAP - Jan 97: Stem Index Change] */
/* [RPAP - Jan 97: Stem Index Change] */
static TermList *
ParseRankedQuery (stemmed_dict * sd, char *QueryLine, int Sort, int indexed,
		  QueryTermList **query_term_list)	/* [RPAP - Feb 97: Term Frequency] */
{
  u_char Word[MAXSTEMLEN + 1];	/* raw query word */
  u_char sWord[MAXSTEMLEN + 1];	/* stemmed copy of Word */
  u_char *end, *s_in;		/* scan pointer and last byte of the query line */
  int default_stem_method = 0;
  TermList *Terms = MakeTermList(0);

  s_in = (u_char *) QueryLine;
  end = s_in + strlen ((char *) s_in) - 1;
  *query_term_list = MakeQueryTermList(0);	/* [RPAP - Feb 97: Term Frequency] */

  /* For an indexed (partially stemmed) dictionary the default stem
     method is built from the "casefold" and "stem" environment flags;
     otherwise the dictionary header dictates it. */
  if (indexed)
    default_stem_method = BooleanEnv (GetEnv ("casefold"), 0) | (BooleanEnv (GetEnv ("stem"), 0) << 1);
  else
    default_stem_method = sd->sdh.stem_method;

  while (s_in <= end)
    {
      int j;
      long num_entries, word_num;
      unsigned long count, doc_count, invf_ptr, invf_len;
      int weight_to_apply, stem_to_apply;
      int method_using = -1;

      /* 0=optional, 1=mustmatch */
      int require_match = 0;	/* [RJM 07/97: Ranked Required Terms] */

      /* Skip over the non word separator taking note of any parameters */
      PARSE_RANKED_NON_STEM_WORD (require_match, s_in, end);	/* [RJM 07/97: Ranked Required Terms] */
      if (s_in > end) break;

      /* Get a word and stem it */
      PARSE_STEM_WORD (Word, s_in, end);

      /* Extract any parameters (per-term weight and stem method) */
      weight_to_apply = 1;
      stem_to_apply = default_stem_method;
      while (s_in <= end)
	{
	  int stem_param, weight_param, param_type;
	  char param[MAXPARAMLEN + 1];

	  param_type = 0;
	  PARSE_OPT_TERM_PARAM (param, param_type, s_in, end);
	  if (!param_type)
	    break;

	  switch (param_type)
	    {
	    case (WEIGHTPARAM):
	      /* NOTE(review): atoi() never sets errno; this ERANGE test
	         presumably relies on errno being clean from earlier
	         calls — verify, or switch to strtol. */
	      weight_param = atoi (param);
	      if (errno != ERANGE && weight_param > 0)
		weight_to_apply = weight_param;
	      break;

	    case (STEMPARAM):
	      /* Per-term stem override only honoured for indexed dicts. */
	      stem_param = atoi (param);
	      if (errno != ERANGE && indexed && stem_param >= 0 && stem_param <= 3)
		method_using = stem_to_apply = stem_param;
	      break;
	    }
	}

      /* Word appears to be length-prefixed (*Word holds the length, so
         *Word + 1 bytes are copied) — keep a pristine copy and stem it. */
      bcopy ((char *) Word, (char *) sWord, *Word + 1);
      stemmer (stem_to_apply, sd->sdh.stemmer_num, sWord);

      if (!indexed || stem_to_apply == 0)
	{
	  /* Look for the word in the already identified terms */
	  for (j = 0; j < Terms->num; j++)
	    if (compare (Terms->TE[j].Word, Word) == 0)
	      break;

	  /* Increment the weight if the word is in the list */
	  /* Update the require match attribute */
	  if (j < Terms->num)
	    {
	      /* Saturate the accumulated weight at INT_MAX. */
	      Terms->TE[j].Count = ((Terms->TE[j].Count + weight_to_apply > INT_MAX) ?
				    INT_MAX : (Terms->TE[j].Count + weight_to_apply));
	      Terms->TE[j].require_match = require_match;	/* [RJM 07/97: Ranked Require match] */
	      AddQueryTerm (query_term_list, Word, Terms->TE[j].WE.count, method_using);	/* [RPAP - Feb 97: Term Frequency] */
	    }
	  else
	    {
	      /* Look for it in the stemmed dictionary */
	      if ((word_num = FindWord (sd, sWord, &count, &doc_count,
					&invf_ptr, &invf_len)) != -1)
		{
		  /* Search the list for the word */
		  for (j = 0; j < Terms->num; j++)
		    if (Terms->TE[j].WE.word_num == word_num)
		      break;

		  /* Increment the weight if the word is in the list */
		  if (j < Terms->num)
		    {
		      Terms->TE[j].Count = ((Terms->TE[j].Count + weight_to_apply > INT_MAX) ?
					    INT_MAX : (Terms->TE[j].Count + weight_to_apply));
		      Terms->TE[j].require_match = require_match;	/* [RJM 07/97: Ranked Require match] */
		      AddQueryTerm (query_term_list, Word, Terms->TE[j].WE.count, method_using);	/* [RPAP - Feb 97: Term Frequency] */
		    }
		  else
		    {
		      /* Create a new entry in the list for the new word */
		      TermEntry te;

		      te.WE.word_num = word_num;
		      te.WE.count = count;
		      te.WE.doc_count = doc_count;
		      te.WE.max_doc_count = doc_count;
		      te.WE.invf_ptr = invf_ptr;
		      te.WE.invf_len = invf_len;
		      te.Count = weight_to_apply;
		      te.Word = copy_string (Word);
		      if (!te.Word)
			FatalError (1, "Could NOT create memory to add term");
		      te.Stem = NULL;
		      te.require_match = require_match;

		      AddTermEntry (&Terms, &te);

		      /* [RPAP - Feb 97: Term Frequency] */
		      AddQueryTerm (query_term_list, Word, count, method_using);
		    }
		}
	      /* [RPAP - Feb 97: Term Frequency] */
	      else
		/* Not in the dictionary at all: record it with zero frequency. */
		AddQueryTerm (query_term_list, Word, 0, method_using);
	    }
	}
      else
	{
	  /* Indexed dictionary with a non-zero stem method: one query
	     word may expand into several dictionary terms. */
	  int total_count = 0;	/* [RPAP - Feb 97: Term Frequency] */
	  TermList *tempList = MakeTermList (0);
	  if ((num_entries = FindWords (sd, sWord, stem_to_apply, &tempList)) > 0)
	    {
	      int i;
	      u_long max_doc_count = 0;

	      /* get the maximum doc count */
	      for (i = 0; i < tempList->num; i++)
		{
		  if (tempList->TE[i].WE.doc_count > max_doc_count)
		    max_doc_count = tempList->TE[i].WE.doc_count;
		  total_count += tempList->TE[i].WE.count;	/* [RPAP - Feb 97: Term Frequency] */
		}

	      for (i = 0; i < tempList->num; i++)
		{
		  /* Look for the word(s) in the already identified terms */
		  for (j = 0; j < Terms->num; j++)
		    {
		      if (compare (Terms->TE[j].Word, tempList->TE[i].Word) == 0)
			{
			  /* found the word */
			  /* Modify weight */
			  Terms->TE[j].Count = ((Terms->TE[j].Count + weight_to_apply > INT_MAX) ?
						INT_MAX : (Terms->TE[j].Count + weight_to_apply));
			  if (Terms->TE[j].WE.max_doc_count < max_doc_count)
			    Terms->TE[j].WE.max_doc_count = max_doc_count;
			  break;
			}
		    }

		  if (j == Terms->num)
		    {
		      /* word was not found */
		      tempList->TE[i].WE.max_doc_count = max_doc_count;
		      tempList->TE[i].Count = weight_to_apply;

		      /* We cannot require a term to match if it is expanded */
		      /* into multiple terms :-( */
		      tempList->TE[i].require_match = 0;	/* [RJM 07/97: Ranked Required Terms] */

		      AddTermEntry (&Terms, &tempList->TE[i]);
		    }
		}
	    }
	  /* [RPAP - Feb 97: Term Frequency] */
	  AddQueryTerm (query_term_list, Word, total_count, method_using);

	  if (tempList != NULL) Xfree(tempList);	/* [RJM 07/98: Memory Leak] */
	}			/* end indexed */
    }				/* end while */

  if (Sort)
    /* Sort the terms in ascending order by doc_count */
    qsort (Terms->TE, Terms->num, sizeof (TermEntry), doc_count_comp);
  return (Terms);
}
463
464
465
466
467static int
468DE_comp (void *a, void *b)
469{
470 return ((DocEntry *) a)->SeekPos - ((DocEntry *) b)->SeekPos;
471}
472
473
474
475
476
477/*
478 * This function is given a list of term numbers and it returns a list
479 * of document numbers based on the cosine document weighting system.
480 * This puts the entries in an array.
481 * inverted file.
482 * If MaxDocs == -1 then it means all
483 */
/*
 * This function is given a list of term numbers and it returns a list
 * of document numbers based on the cosine document weighting system.
 * This puts the entries in an array.
 * inverted file.
 * If MaxDocs == -1 then it means all
 *
 * Overall shape: (1) decode the inverted lists into per-document
 * accumulators using one of four accumulator structures selected by
 * rqi->AccumMethod ('S' splay tree, 'A' flat array, 'H' hash table,
 * 'L' list table); (2) load the first MaxParas accumulators into a
 * min-heap and let the remaining accumulators compete against the heap
 * root; (3) optionally refine approximate weights into exact ones;
 * (4) pop the heap into the result DocList, folding duplicate seek
 * positions (paragraphs of the same document) together.
 */
static DocList *
CosineGet (query_data * qd, TermList * Terms, RankedQueryInfo * rqi) {
  DocList *Docs;
  float *AccumulatedWeights = NULL;	/* 'A' method accumulators */
  Splay_Tree *ST = NULL;		/* 'S' method accumulators */
  Splay_Tree *Para_ST = NULL;		/* dedup set keyed on SeekPos */
  Hash_Table *HT = NULL;		/* 'H' method accumulators */
  List_Table *LT = NULL;		/* 'L' method accumulators */
  Heap *H;
  HeapEntry *he;
  register float *fptr = NULL;
  register Invf_Doc_Entry *ide = NULL;
  register Invf_Doc_EntryH *ideh = NULL;
  int BackEnd, NumExact, MaxExact, NumParas;
  int MaxDocs = 0, MaxParas = 0;
  int i;
  Invf_Doc_Entry_Pool ide_pool;
  ide_pool.pool = NULL;

  qd->hops_taken = qd->num_of_ptrs = qd->num_of_accum = 0;

  /* Phase 1: decode the inverted file into the chosen accumulator
     structure.  A NULL result is propagated to the caller. */
  switch (rqi->AccumMethod)
    {
    case 'S':
      ST = CosineDecodeSplay (qd, Terms, rqi, &ide_pool);
      if (!ST)
	return NULL;
      break;
    case 'A':
      AccumulatedWeights = CosineDecode (qd, Terms, rqi);
      if (!AccumulatedWeights)
	return NULL;
      break;
    case 'H':
      HT = CosineDecodeHash (qd, Terms, rqi);
      if (!HT)
	return NULL;
      break;
    case 'L':
      LT = CosineDecodeList (qd, Terms, rqi);
      if (!LT)
	return NULL;
      break;
    }

#if 0
  if (rqi->UseSplayTree)
    {

      AccumulatedWeights = CosineDecode (qd, Terms, rqi);
      fptr = AccumulatedWeights;
      ide = SP_get_first (ST);
      for (i = 0; i < qd->sd->sdh.num_of_docs; i++)
	{
	  if (AccumulatedWeights[i] != 0)
	    {
	      if (i != ide->DocNum)
		fprintf (stderr, "Sum mismatch for %d %f %d %f\n", i + 1,
			 AccumulatedWeights[i], ide->DocNum + 1, ide->Sum);
	      ide = SP_get_next (ST);
	    }
	}
    }
#endif

  /* Determine how many paragraphs have non-zero accumulators. */
  switch (rqi->AccumMethod)
    {
    case 'S':
      MaxParas = ST->no_of_items;
      break;
    case 'A':
      {				/* count the number of non-zero document weights */
	register int i = qd->sd->sdh.num_of_docs;
	register float *d;
	MaxParas = 0;
	for (d = AccumulatedWeights; i; i--, d++)
	  if (*d)
	    MaxParas++;
      }
      break;
    case 'H':
      MaxParas = HT->num + HT->Suplimentary_Num;
      break;
    case 'L':
      MaxParas = LT->num;
      break;
    }

  /* Cap the heap size at the configured limit (-1 means unlimited). */
  if (rqi->MaxParasToRetrieve != -1 && MaxParas > rqi->MaxParasToRetrieve)
    MaxParas = rqi->MaxParasToRetrieve;
  MaxDocs = MaxParas;

  /* Allocate memory for the heap */
  Docs = MakeDocList (MaxDocs);
  ChangeMemInUse (qd, sizeof (DocEntry) * MaxDocs);

  H = Heap_Make (MaxDocs, Heap_Lesser);


  /* Get the sums from the array divide the sums by the
     document weights which we retrieve from the ".idx.wgt" file and put
     the resulting data into a heap */


  /* Phase 2a: seed the heap with the first MaxDocs accumulators. */
  he = H->HE;
  H->NumItems = MaxDocs;
  switch (rqi->AccumMethod)
    {
    case 'S':
      {
	ide = SP_get_first (ST);
	for (i = 0; i < H->NumItems; i++, ide = SP_get_next (ST), he++)
	  {
	    he->DocNum = ide->DocNum + 1;
	    he->OrgWeight = &ide->Sum;
	    qd->num_of_accum++;
	  }
      }
      break;
    case 'A':
      {
	fptr = AccumulatedWeights;
	for (i = 0; i < H->NumItems; i++, fptr++, he++)
	  {
	    he->DocNum = i + 1;
	    he->OrgWeight = fptr;
	    if (*fptr)
	      qd->num_of_accum++;
	  }
      }
      break;
    case 'H':
      {
	ideh = HT->Head;
	for (i = 0; i < H->NumItems; i++, ideh = ideh->next, he++)
	  {
	    he->DocNum = ideh->IDE.DocNum + 1;
	    he->OrgWeight = &ideh->IDE.Sum;
	    qd->num_of_accum++;
	  }
      }
      break;
    case 'L':
      {
	ide = LT->IDE;
	for (i = 0; i < H->NumItems; i++, ide++, he++)
	  {
	    he->DocNum = ide->DocNum + 1;
	    he->OrgWeight = &ide->Sum;
	    qd->num_of_accum++;
	  }
      }
      break;
    }

  /* Normalise seeded weights by the approximate document weight and
     zero the accumulator (a zeroed accumulator marks "already taken"). */
  he = H->HE;
  for (i = 0; i < H->NumItems; i++, he++)
    {
      *he->OrgWeight /= GetLowerApproxDocWeight (qd->awd, he->DocNum - 1);
      he->Weight = *he->OrgWeight;
      *he->OrgWeight = 0;
      he->SeekPos = he->Len = 0;
    }

  Heap_Build (H);

  /* Phase 2b: let every remaining accumulator compete with the heap
     root; anything heavier displaces the current minimum. */
  he = H->HE;
  switch (rqi->AccumMethod)
    {
    case 'S':
      {
	for (i = MaxDocs; i < ST->no_of_items; i++, ide = SP_get_next (ST))
	  {
	    ide->Sum /= GetLowerApproxDocWeight (qd->awd, ide->DocNum);
	    qd->num_of_accum++;
	    if (ide->Sum <= he->Weight)
	      continue;
	    *he->OrgWeight = he->Weight;
	    he->DocNum = ide->DocNum + 1;
	    he->Weight = ide->Sum;
	    he->OrgWeight = &ide->Sum;
	    *he->OrgWeight = 0;
	    Heap_Heapify (H, 1);
	  }
      }
      break;
    case 'A':
      {
	for (i = MaxDocs; i < qd->sd->sdh.num_of_docs; i++, fptr++)
	  {
	    if (!*fptr)
	      continue;
	    qd->num_of_accum++;
	    *fptr /= GetLowerApproxDocWeight (qd->awd, i);
	    if (*fptr <= he->Weight)
	      continue;
	    *he->OrgWeight = he->Weight;
	    he->DocNum = i + 1;
	    he->Weight = *fptr;
	    he->OrgWeight = fptr;
	    *he->OrgWeight = 0;
	    Heap_Heapify (H, 1);
	  }
      }
      break;
    case 'H':
      {
	for (; ideh; ideh = ideh->next)
	  {
	    qd->num_of_accum++;
	    ideh->IDE.Sum /=
	      GetLowerApproxDocWeight (qd->awd, ideh->IDE.DocNum);
	    if (ideh->IDE.Sum <= he->Weight)
	      continue;
	    *he->OrgWeight = he->Weight;
	    he->DocNum = ideh->IDE.DocNum + 1;
	    he->Weight = ideh->IDE.Sum;
	    he->OrgWeight = &ideh->IDE.Sum;
	    *he->OrgWeight = 0;
	    Heap_Heapify (H, 1);
	  }
      }
      break;
    case 'L':
      {
	for (i = MaxDocs; i < LT->num; i++, ide++)
	  {
	    qd->num_of_accum++;
	    ide->Sum /=
	      GetLowerApproxDocWeight (qd->awd, ide->DocNum);
	    if (ide->Sum <= he->Weight)
	      continue;
	    *he->OrgWeight = he->Weight;
	    he->DocNum = ide->DocNum + 1;
	    he->Weight = ide->Sum;
	    he->OrgWeight = &ide->Sum;
	    *he->OrgWeight = 0;
	    Heap_Heapify (H, 1);
	  }
      }
      break;
    }


  /* Phase 3: exact re-ranking.  Replace approximate weights by exact
     ones (via FetchDocStart) for every heap entry, then give skipped
     accumulators a second chance against the exact root. */
  if (rqi->Exact && qd->id->ifh.InvfLevel != 3)
    {
      HeapEntry *he = H->HE;

      for (i = 0; i < H->NumItems; i++, he++)
	{
	  he->Weight = he->Weight *
	    GetLowerApproxDocWeight (qd->awd, he->DocNum - 1) /
	    FetchDocStart (qd, he->DocNum, &he->SeekPos, &he->Len);
	}

      Heap_Build (H);

      he = H->HE;

      switch (rqi->AccumMethod)
	{
	case 'S':
	  {
	    ide = SP_get_first (ST);
	    for (i = 0; i < ST->no_of_items; i++, ide = SP_get_next (ST))
	      {
		u_long SeekPos, Len;
		float Weight;
		if (!ide->Sum)
		  continue;
		if (ide->Sum <= he->Weight)
		  continue;
		Weight = ide->Sum *
		  GetLowerApproxDocWeight (qd->awd, ide->DocNum) /
		  FetchDocStart (qd, ide->DocNum + 1, &SeekPos, &Len);
		if (Weight <= he->Weight)
		  continue;
		he->DocNum = ide->DocNum + 1;
		he->OrgWeight = &ide->Sum;
		he->Weight = Weight;
		he->SeekPos = SeekPos;
		he->Len = Len;
		ide->Sum = 0;
		Heap_Heapify (H, 1);
	      }
	  }
	  break;

	  /* up to here */

	case 'A':
	  {
	    fptr = AccumulatedWeights;
	    for (i = 0; i < qd->sd->sdh.num_of_docs; i++, fptr++)
	      {
		u_long SeekPos, Len;
		float Weight;
		if (!*fptr)
		  continue;
		if (*fptr <= he->Weight)
		  continue;
		Weight = *fptr *
		  GetLowerApproxDocWeight (qd->awd, i) /
		  FetchDocStart (qd, i + 1, &SeekPos, &Len);
		if (Weight <= he->Weight)
		  continue;
		he->DocNum = i + 1;
		he->OrgWeight = fptr;
		he->Weight = Weight;
		he->SeekPos = SeekPos;
		he->Len = Len;
		*fptr = 0;
		Heap_Heapify (H, 1);
	      }
	  }
	  break;
	case 'H':
	  {
	    ideh = HT->Head;
	    for (ideh = HT->Head; ideh; ideh = ideh->next)
	      {
		u_long SeekPos, Len;
		float Weight;
		if (!ideh->IDE.Sum)
		  continue;
		if (ideh->IDE.Sum <= he->Weight)
		  continue;
		Weight = ideh->IDE.Sum *
		  GetLowerApproxDocWeight (qd->awd, ideh->IDE.DocNum) /
		  FetchDocStart (qd, ideh->IDE.DocNum + 1, &SeekPos, &Len);
		if (Weight <= he->Weight)
		  continue;
		he->DocNum = ideh->IDE.DocNum + 1;
		he->OrgWeight = &ideh->IDE.Sum;
		he->Weight = Weight;
		he->SeekPos = SeekPos;
		he->Len = Len;
		ideh->IDE.Sum = 0;
		Heap_Heapify (H, 1);
	      }
	  }
	  break;
	case 'L':
	  {
	    ide = LT->IDE;
	    for (i = 0; i < LT->num; i++, ide++)
	      {
		u_long SeekPos, Len;
		float Weight;
		if (!ide->Sum)
		  continue;
		if (ide->Sum <= he->Weight)
		  continue;
		Weight = ide->Sum *
		  GetLowerApproxDocWeight (qd->awd, ide->DocNum) /
		  FetchDocStart (qd, ide->DocNum + 1, &SeekPos, &Len);
		if (Weight <= he->Weight)
		  continue;
		he->DocNum = ide->DocNum + 1;
		he->OrgWeight = &ide->Sum;
		he->Weight = Weight;
		he->SeekPos = SeekPos;
		he->Len = Len;
		ide->Sum = 0;
		Heap_Heapify (H, 1);
	      }
	  }
	  break;
	}
    }


  /* Phase 4: flip the comparator to a max-heap and pop the winners
     into the result list in descending weight order. */

  H->HC = Heap_Greater;
  Heap_Build (H);


  MaxDocs = H->NumItems;
  if (rqi->MaxDocsToRetrieve != -1 && MaxDocs > rqi->MaxDocsToRetrieve)
    MaxDocs = rqi->MaxDocsToRetrieve;

  /* Alarm */

  /* Para_ST dedups on SeekPos: paragraphs sharing a document start are
     chained off the first occurrence and stored at the tail (BackEnd)
     of the DE[] array instead of counting as new documents. */
  he = H->HE;
  BackEnd = H->NumItems - 1;
  NumExact = 0;
  MaxExact = H->NumItems;
  NumParas = 0;
  Para_ST = SP_createset (DE_comp);
  while (H->NumItems && Docs->num < MaxDocs)
    {
      DocEntry DocEnt;
      DocEntry *mem;

      if (rqi->Exact)
	{
	  if (H->HE[0].SeekPos == 0)
	    NumExact += Make_Exact_Root (qd, H);
	}
      else
	FetchDocStart (qd, he->DocNum, &he->SeekPos, &he->Len);

      NumParas++;

      DocEnt.DocNum = he->DocNum;
      DocEnt.Weight = he->Weight;
      DocEnt.Len = he->Len;
      DocEnt.SeekPos = he->SeekPos;
      DocEnt.CompTextBuffer = NULL;
      DocEnt.Next = NULL;

      Heap_DeleteHead (H);

      if (!(mem = SP_member (&DocEnt, Para_ST)))
	{
	  Docs->DE[Docs->num] = DocEnt;
	  SP_insert (&Docs->DE[Docs->num], Para_ST);
	  Docs->num++;
	}
      else
	{
	  DocEnt.Next = mem->Next;
	  Docs->DE[BackEnd] = DocEnt;
	  mem->Next = &Docs->DE[BackEnd--];
	}
    }
  SP_freeset (Para_ST);

  if (qd->id->ifh.InvfLevel == 3)
    {
      Message ("%d Paragraphs were required to get %d documents",
	       NumParas, Docs->num);
      if (NumExact == MaxExact)
	{
	  Message ("The exact weights of all %d paragraphs had to be calculated", MaxExact);
	  Message ("to obtain %d paragraphs. This may mean that the the documents", NumParas);
	  Message ("returned do not necessarly represent an exact cosine ranking.");
	  Message ("This problem may be corrected by increasing \'maxparas\'.");
	}
    }
#if 0
  {
    int i;
    FILE *f = fopen ("top.paras", "w");
    fprintf (f, "=========================\nTop Paragraphs\n");
    for (i = 0; i < Docs->num; i++)
      {
	DocEntry *d;
	fprintf (f, "<%d(%f)> ", Heap[i].DocNum, Heap[i].Weight);
	for (d = Heap[i].Next; d; d = d->Next)
	  fprintf (f, "%d(%f) ", d->DocNum, d->Weight);
	fprintf (f, "\n");
      }
    fprintf (f, "=========================\n");
    fclose (f);
  }
#endif

  /* Release whichever accumulator structure was used. */
  if (AccumulatedWeights)
    {
      Xfree (AccumulatedWeights);
      ChangeMemInUse (qd, -sizeof (float) * qd->sd->sdh.num_of_docs);
    }
  if (ST)
    {
      int mem = ST->mem_in_use;
      SP_freeset (ST);
      ChangeMemInUse (qd, -mem);
      free_ide_pool (qd, &ide_pool);
    }
  if (HT)
    HT_free (qd, HT);

  if (LT)
    LT_free (qd, LT);

  if (H) Xfree (H);		/* [RJM 07/98: Memory Leak] */

  return (Docs);
}
964
965
966
967
968/* if MaxDocs == -1 it means all */
/* Top-level entry point for ranked querying: parse Query into a term
   list (qd->TL) and a term-frequency list (qd->QTL), run the cosine
   evaluation, and store the resulting document list in qd->DL.  Any
   previous term/query-term/document lists held in qd are freed first.
   if MaxDocs == -1 it means all */
void
RankedQuery (query_data *qd, char *Query, RankedQueryInfo *rqi)
{
  DocList *dl;

  if (qd->TL)
    FreeTermList (&(qd->TL));

  /* [RPAP - Feb 97: Term Frequency] */
  if (qd->QTL)
    FreeQueryTermList (&(qd->QTL));

  qd->TL = ParseRankedQuery (qd->sd, Query, rqi->Sort, qd->sd->sdh.indexed,	/* [RPAP - Jan 97: Stem Index Change] */
			     &(qd->QTL));	/* [RPAP - Feb 97: Term Frequency] */

  /* PrintTermList (qd->TL, stderr); */

  dl = CosineGet (qd, qd->TL, rqi);

  /* CosineGet returns NULL only when accumulator allocation failed. */
  if (!dl)
    FatalError (1, "Out of memory\n");

  /* Release previously retrieved documents before installing the new list. */
  FreeQueryDocs (qd);

  qd->DL = dl;
}
Note: See TracBrowser for help on using the repository browser.