Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: trunk/gsdl/packages/mg/src/text/query.ranked.c@ 1014

Last change on this file since 1014 was 439, checked in by sjboddie, 25 years ago
renamed mg-1.3d directory mg
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 23.4 KB

Line
1	/**************************************************************************
2	*
3	* query.ranked.c -- Ranked query evaluation
4	* Copyright (C) 1994 Neil Sharman
5	*
6	* This program is free software; you can redistribute it and/or modify
7	* it under the terms of the GNU General Public License as published by
8	* the Free Software Foundation; either version 2 of the License, or
9	* (at your option) any later version.
10	*
11	* This program is distributed in the hope that it will be useful,
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	* GNU General Public License for more details.
15	*
16	* You should have received a copy of the GNU General Public License
17	* along with this program; if not, write to the Free Software
18	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19	*
20	* $Id: query.ranked.c 439 1999-08-10 21:23:37Z sjboddie $
21	*
22	**************************************************************************/
23
24	#include "sysfuncs.h"
25
26	#include "memlib.h"
27	#include "filestats.h"
28	#include "messages.h"
29	#include "timing.h"
30	#include "sptree.h"
31
32	#include "mg.h"
33	#include "invf.h"
34	#include "text.h"
35	#include "lists.h"
36	#include "backend.h"
37	#include "stem_search.h"
38	#include "weights.h"
39	#include "text_get.h"
40	#include "invf_get.h"
41	#include "words.h"
42	#include "stemmer.h"
43	#include "locallib.h"
44	#include "environment.h"
45	#include "term_lists.h"
46	#include "local_strings.h"
47	#include "query_term_list.h" /* [RPAP - Feb 97: Term Frequency] */
48
49	/*
50	$Log$
51	Revision 1.1 1999/08/10 21:18:20 sjboddie
52	renamed mg-1.3d directory mg
53
54	Revision 1.2 1998/11/25 07:55:50 rjmcnab
55
56	Modified mg to that you can specify the stemmer you want
57	to use via a command line option. You specify it to
58	mg_passes during the build process. The number of the
59	stemmer that you used is stored within the inverted
60	dictionary header and the stemmed dictionary header so
61	the correct stemmer is used in later stages of building
62	and querying.
63
64	Revision 1.1 1998/11/17 09:35:34 rjmcnab
65	* empty log message *
66
67	* Revision 1.4 1994/11/25 03:47:46 tes
68	* Committing files before adding the merge stuff.
69	*
70	* Revision 1.3 1994/10/20 03:57:03 tes
71	* I have rewritten the boolean query optimiser and abstracted out the
72	* components of the boolean query.
73	*
74	* Revision 1.2 1994/09/20 04:42:04 tes
75	* For version 1.1
76	*
77	*/
78
79	static char *RCSID = "$Id: query.ranked.c 439 1999-08-10 21:23:37Z sjboddie $";
80
81	/*************************************************************************/
82
83	typedef struct HeapEntry
84	{
85	float Weight;
86	float *OrgWeight;
87	int DocNum;
88	u_long SeekPos; /* position in the text file in bytes */
89	u_long Len; /* length of the document in bytes */
90	}
91	HeapEntry;
92
93	typedef int (HeapComp) (HeapEntry , HeapEntry *);
94
95	typedef struct Heap
96	{
97	int NumItems;
98	int MaxSize;
99	HeapComp HC;
100	HeapEntry HE[1];
101	}
102	Heap;
103
104
105	Heap *
106	Heap_Make (int size, HeapComp hc)
107	{
108	Heap *H;
109	H = Xmalloc (sizeof (Heap) + (size - 1) * sizeof (HeapEntry));
110	if (!H)
111	return NULL;
112	H->NumItems = 0;
113	H->MaxSize = size;
114	H->HC = hc;
115	return H;
116	}
117
118	int
119	Heap_Size (Heap * H)
120	{
121	return sizeof (Heap) + H->MaxSize * sizeof (HeapEntry);
122	}
123
124	HeapEntry *
125	Heap_GetHead (Heap * H)
126	{
127	if (H && H->NumItems)
128	return &H->HE[0];
129	else
130	return NULL;
131	}
132
133
134
135	void
136	Heap_Heapify (Heap * H, int i)
137	{
138	register int curr, child;
139	curr = i;
140	child = curr * 2;
141	while (child <= H->NumItems)
142	{
143	if (child < H->NumItems && H->HC (&H->HE[child], &H->HE[child - 1]) > 0)
144	child++;
145	if (H->HC (&H->HE[curr - 1], &H->HE[child - 1]) < 0)
146	{
147	HeapEntry temp = H->HE[child - 1];
148	H->HE[child - 1] = H->HE[curr - 1];
149	H->HE[curr - 1] = temp;
150	curr = child;
151	child = 2 * child;
152	}
153	else
154	break;
155	}
156	}
157
158
159	void
160	Heap_Build (Heap * H)
161	{
162	register int i;
163	for (i = H->NumItems / 2; i > 0; i--)
164	Heap_Heapify (H, i);
165	}
166
167
168	void
169	Heap_Sort (Heap * H)
170	{
171	register int i;
172	for (i = H->NumItems; i > 1; i--)
173	{
174	HeapEntry temp = H->HE[0];
175	H->HE[0] = H->HE[i - 1];
176	H->HE[i - 1] = temp;
177	H->NumItems--;
178	Heap_Heapify (H, 1);
179	}
180	}
181
182
183	void
184	Heap_DeleteHead (Heap * H)
185	{
186	H->HE[0] = H->HE[--H->NumItems];
187	Heap_Heapify (H, 1);
188	}
189
190	int
191	Heap_Lesser (HeapEntry * a, HeapEntry * b)
192	{
193	return (a->Weight > b->Weight ? -1 :
194	(a->Weight == b->Weight ? 0 : 1));
195	}
196
197
198	int
199	Heap_Greater (HeapEntry * a, HeapEntry * b)
200	{
201	return (a->Weight > b->Weight ? 1 :
202	(a->Weight == b->Weight ? 0 : -1));
203	}
204
205	int
206	Make_Exact_Root (query_data * qd, Heap * H)
207	{
208	int num = 0;
209	HeapEntry *he = H->HE;
210	while (he->SeekPos == 0)
211	{
212	he->Weight = he->Weight *
213	GetLowerApproxDocWeight (qd->awd, he->DocNum - 1) /
214	FetchDocStart (qd, he->DocNum, &he->SeekPos, &he->Len);
215	Heap_Heapify (H, 1);
216	num++;
217	}
218	return num;
219	}
220
221	void
222	Heap_Dump (Heap * H, int num)
223	{
224	int i, l, lines, r;
225	if (num > H->NumItems)
226	num = H->NumItems;
227	lines = (num + 3) / 4;
228	for (l = 0; l < lines; l++)
229	for (r = 0; r < 4; r++)
230	{
231	i = lines * r + l;
232	if (i < num)
233	fprintf (stderr, "[%2d] %7.4f", i, H->HE[i].Weight);
234	fprintf (stderr, r == 3 ? "\n" : " ");
235	}
236	fprintf (stderr, "\n");
237	}
238
239	/*************************************************************************/
240
241
242
243
244
245
246	int
247	doc_count_comp (const void A, const void B)
248	{
249	const TermEntry *a = A;
250	const TermEntry *b = B;
251	return (a->WE.doc_count - b->WE.doc_count);
252	}
253
254	/* =========================================================================
255	* Function: ParseRankedQuery
256	* Description:
257	* Takes a string query line and extracts the terms in the query
258	* which exist in the stemmed dictionary.
259	* Optionally sorts the terms into ascending order by doc. number.
260	* Input:
261	* stemmed dictionary, query line, sort-flag
262	* Output:
263	* A list of terms.
264	* ========================================================================= */
265
266	/* [RPAP - Jan 97: Stem Index Change] */
267	static TermList *
268	ParseRankedQuery (stemmed_dict * sd, char *QueryLine, int Sort, int indexed,
269	QueryTermList *query_term_list) / [RPAP - Feb 97: Term Frequency] */
270	{
271	u_char Word[MAXSTEMLEN + 1];
272	u_char sWord[MAXSTEMLEN + 1];
273	u_char end, s_in;
274	int default_stem_method = 0;
275	TermList *Terms = MakeTermList(0);
276
277	s_in = (u_char *) QueryLine;
278	end = s_in + strlen ((char *) s_in) - 1;
279	query_term_list = MakeQueryTermList(0); / [RPAP - Feb 97: Term Frequency] */
280
281	if (indexed)
282	default_stem_method = BooleanEnv (GetEnv ("casefold"), 0) \| (BooleanEnv (GetEnv ("stem"), 0) << 1);
283	else
284	default_stem_method = sd->sdh.stem_method;
285
286	while (s_in <= end)
287	{
288	int j;
289	long num_entries, word_num;
290	unsigned long count, doc_count, invf_ptr, invf_len;
291	int weight_to_apply, stem_to_apply;
292	int method_using = -1;
293
294	/* 0=optional, 1=mustmatch */
295	int require_match = 0; /* [RJM 07/97: Ranked Required Terms] */
296
297	/* Skip over the non word separator taking note of any parameters */
298	PARSE_RANKED_NON_STEM_WORD (require_match, s_in, end); /* [RJM 07/97: Ranked Required Terms] */
299	if (s_in > end) break;
300
301	/* Get a word and stem it */
302	PARSE_STEM_WORD (Word, s_in, end);
303
304	/* Extract any parameters */
305	weight_to_apply = 1;
306	stem_to_apply = default_stem_method;
307	while (s_in <= end)
308	{
309	int stem_param, weight_param, param_type;
310	char param[MAXPARAMLEN + 1];
311
312	param_type = 0;
313	PARSE_OPT_TERM_PARAM (param, param_type, s_in, end);
314	if (!param_type)
315	break;
316
317	switch (param_type)
318	{
319	case (WEIGHTPARAM):
320	weight_param = atoi (param);
321	if (errno != ERANGE && weight_param > 0)
322	weight_to_apply = weight_param;
323	break;
324
325	case (STEMPARAM):
326	stem_param = atoi (param);
327	if (errno != ERANGE && indexed && stem_param >= 0 && stem_param <= 3)
328	method_using = stem_to_apply = stem_param;
329	break;
330	}
331	}
332
333	bcopy ((char ) Word, (char ) sWord, *Word + 1);
334	stemmer (stem_to_apply, sd->sdh.stemmer_num, sWord);
335
336	if (!indexed \|\| stem_to_apply == 0)
337	{
338	/* Look for the word in the already identified terms */
339	for (j = 0; j < Terms->num; j++)
340	if (compare (Terms->TE[j].Word, Word) == 0)
341	break;
342
343	/* Increment the weight if the word is in the list */
344	/* Update the require match attribute */
345	if (j < Terms->num)
346	{
347	Terms->TE[j].Count = ((Terms->TE[j].Count + weight_to_apply > INT_MAX) ?
348	INT_MAX : (Terms->TE[j].Count + weight_to_apply));
349	Terms->TE[j].require_match = require_match; /* [RJM 07/97: Ranked Require match] */
350	AddQueryTerm (query_term_list, Word, Terms->TE[j].WE.count, method_using); /* [RPAP - Feb 97: Term Frequency] */
351	}
352	else
353	{
354	/* Look for it in the stemmed dictionary */
355	if ((word_num = FindWord (sd, sWord, &count, &doc_count,
356	&invf_ptr, &invf_len)) != -1)
357	{
358	/* Search the list for the word */
359	for (j = 0; j < Terms->num; j++)
360	if (Terms->TE[j].WE.word_num == word_num)
361	break;
362
363	/* Increment the weight if the word is in the list */
364	if (j < Terms->num)
365	{
366	Terms->TE[j].Count = ((Terms->TE[j].Count + weight_to_apply > INT_MAX) ?
367	INT_MAX : (Terms->TE[j].Count + weight_to_apply));
368	Terms->TE[j].require_match = require_match; /* [RJM 07/97: Ranked Require match] */
369	AddQueryTerm (query_term_list, Word, Terms->TE[j].WE.count, method_using); /* [RPAP - Feb 97: Term Frequency] */
370	}
371	else
372	{
373	/* Create a new entry in the list for the new word */
374	TermEntry te;
375
376	te.WE.word_num = word_num;
377	te.WE.count = count;
378	te.WE.doc_count = doc_count;
379	te.WE.max_doc_count = doc_count;
380	te.WE.invf_ptr = invf_ptr;
381	te.WE.invf_len = invf_len;
382	te.Count = weight_to_apply;
383	te.Word = copy_string (Word);
384	if (!te.Word)
385	FatalError (1, "Could NOT create memory to add term");
386	te.Stem = NULL;
387	te.require_match = require_match;
388
389	AddTermEntry (&Terms, &te);
390
391	/* [RPAP - Feb 97: Term Frequency] */
392	AddQueryTerm (query_term_list, Word, count, method_using);
393	}
394	}
395	/* [RPAP - Feb 97: Term Frequency] */
396	else
397	AddQueryTerm (query_term_list, Word, 0, method_using);
398	}
399	}
400	else
401	{
402	int total_count = 0; /* [RPAP - Feb 97: Term Frequency] */
403	TermList *tempList = MakeTermList (0);
404	if ((num_entries = FindWords (sd, sWord, stem_to_apply, &tempList)) > 0)
405	{
406	int i;
407	u_long max_doc_count = 0;
408
409	/* get the maximum doc count */
410	for (i = 0; i < tempList->num; i++)
411	{
412	if (tempList->TE[i].WE.doc_count > max_doc_count)
413	max_doc_count = tempList->TE[i].WE.doc_count;
414	total_count += tempList->TE[i].WE.count; /* [RPAP - Feb 97: Term Frequency] */
415	}
416
417	for (i = 0; i < tempList->num; i++)
418	{
419	/* Look for the word(s) in the already identified terms */
420	for (j = 0; j < Terms->num; j++)
421	{
422	if (compare (Terms->TE[j].Word, tempList->TE[i].Word) == 0)
423	{
424	/* found the word */
425	/* Modify weight */
426	Terms->TE[j].Count = ((Terms->TE[j].Count + weight_to_apply > INT_MAX) ?
427	INT_MAX : (Terms->TE[j].Count + weight_to_apply));
428	if (Terms->TE[j].WE.max_doc_count < max_doc_count)
429	Terms->TE[j].WE.max_doc_count = max_doc_count;
430	break;
431	}
432	}
433
434	if (j == Terms->num)
435	{
436	/* word was not found */
437	tempList->TE[i].WE.max_doc_count = max_doc_count;
438	tempList->TE[i].Count = weight_to_apply;
439
440	/* We cannot require a term to match if it is expanded */
441	/* into multiple terms :-( */
442	tempList->TE[i].require_match = 0; /* [RJM 07/97: Ranked Required Terms] */
443
444	AddTermEntry (&Terms, &tempList->TE[i]);
445	}
446	}
447	}
448	/* [RPAP - Feb 97: Term Frequency] */
449	AddQueryTerm (query_term_list, Word, total_count, method_using);
450
451	if (tempList != NULL) Xfree(tempList); /* [RJM 07/98: Memory Leak] */
452	} /* end indexed */
453	} /* end while */
454
455	if (Sort)
456	/* Sort the terms in ascending order by doc_count */
457	qsort (Terms->TE, Terms->num, sizeof (TermEntry), doc_count_comp);
458	return (Terms);
459	}
460
461
462
463
464	static int
465	DE_comp (void a, void b)
466	{
467	return ((DocEntry ) a)->SeekPos - ((DocEntry ) b)->SeekPos;
468	}
469
470
471
472
473
474	/*
475	* This function is given a list of term numbers and it returns a list
476	* of document numbers based on the cosine document weighting system.
477	* This puts the entries in an array.
478	* inverted file.
479	* If MaxDocs == -1 then it means all
480	*/
481	static DocList *
482	CosineGet (query_data * qd, TermList * Terms, RankedQueryInfo * rqi) {
483	DocList *Docs;
484	float *AccumulatedWeights = NULL;
485	Splay_Tree *ST = NULL;
486	Splay_Tree *Para_ST = NULL;
487	Hash_Table *HT = NULL;
488	List_Table *LT = NULL;
489	Heap *H;
490	HeapEntry *he;
491	register float *fptr = NULL;
492	register Invf_Doc_Entry *ide = NULL;
493	register Invf_Doc_EntryH *ideh = NULL;
494	int BackEnd, NumExact, MaxExact, NumParas;
495	int MaxDocs = 0, MaxParas = 0;
496	int i;
497	Invf_Doc_Entry_Pool ide_pool;
498	ide_pool.pool = NULL;
499
500	qd->hops_taken = qd->num_of_ptrs = qd->num_of_accum = 0;
501
502	switch (rqi->AccumMethod)
503	{
504	case 'S':
505	ST = CosineDecodeSplay (qd, Terms, rqi, &ide_pool);
506	if (!ST)
507	return NULL;
508	break;
509	case 'A':
510	AccumulatedWeights = CosineDecode (qd, Terms, rqi);
511	if (!AccumulatedWeights)
512	return NULL;
513	break;
514	case 'H':
515	HT = CosineDecodeHash (qd, Terms, rqi);
516	if (!HT)
517	return NULL;
518	break;
519	case 'L':
520	LT = CosineDecodeList (qd, Terms, rqi);
521	if (!LT)
522	return NULL;
523	break;
524	}
525
526	#if 0
527	if (rqi->UseSplayTree)
528	{
529
530	AccumulatedWeights = CosineDecode (qd, Terms, rqi);
531	fptr = AccumulatedWeights;
532	ide = SP_get_first (ST);
533	for (i = 0; i < qd->sd->sdh.num_of_docs; i++)
534	{
535	if (AccumulatedWeights[i] != 0)
536	{
537	if (i != ide->DocNum)
538	fprintf (stderr, "Sum mismatch for %d %f %d %f\n", i + 1,
539	AccumulatedWeights[i], ide->DocNum + 1, ide->Sum);
540	ide = SP_get_next (ST);
541	}
542	}
543	}
544	#endif
545
546	switch (rqi->AccumMethod)
547	{
548	case 'S':
549	MaxParas = ST->no_of_items;
550	break;
551	case 'A':
552	{ /* count the number of non-zero document weights */
553	register int i = qd->sd->sdh.num_of_docs;
554	register float *d;
555	MaxParas = 0;
556	for (d = AccumulatedWeights; i; i--, d++)
557	if (*d)
558	MaxParas++;
559	}
560	break;
561	case 'H':
562	MaxParas = HT->num + HT->Suplimentary_Num;
563	break;
564	case 'L':
565	MaxParas = LT->num;
566	break;
567	}
568
569	if (rqi->MaxParasToRetrieve != -1 && MaxParas > rqi->MaxParasToRetrieve)
570	MaxParas = rqi->MaxParasToRetrieve;
571	MaxDocs = MaxParas;
572
573	/* Allocate memory for the heap */
574	Docs = MakeDocList (MaxDocs);
575	ChangeMemInUse (qd, sizeof (DocEntry) * MaxDocs);
576
577	H = Heap_Make (MaxDocs, Heap_Lesser);
578
579
580	/* Get the sums from the array divide the sums by the
581	document weights which we retrieve from the ".idx.wgt" file and put
582	the resulting data into a heap */
583
584
585	he = H->HE;
586	H->NumItems = MaxDocs;
587	switch (rqi->AccumMethod)
588	{
589	case 'S':
590	{
591	ide = SP_get_first (ST);
592	for (i = 0; i < H->NumItems; i++, ide = SP_get_next (ST), he++)
593	{
594	he->DocNum = ide->DocNum + 1;
595	he->OrgWeight = &ide->Sum;
596	qd->num_of_accum++;
597	}
598	}
599	break;
600	case 'A':
601	{
602	fptr = AccumulatedWeights;
603	for (i = 0; i < H->NumItems; i++, fptr++, he++)
604	{
605	he->DocNum = i + 1;
606	he->OrgWeight = fptr;
607	if (*fptr)
608	qd->num_of_accum++;
609	}
610	}
611	break;
612	case 'H':
613	{
614	ideh = HT->Head;
615	for (i = 0; i < H->NumItems; i++, ideh = ideh->next, he++)
616	{
617	he->DocNum = ideh->IDE.DocNum + 1;
618	he->OrgWeight = &ideh->IDE.Sum;
619	qd->num_of_accum++;
620	}
621	}
622	break;
623	case 'L':
624	{
625	ide = LT->IDE;
626	for (i = 0; i < H->NumItems; i++, ide++, he++)
627	{
628	he->DocNum = ide->DocNum + 1;
629	he->OrgWeight = &ide->Sum;
630	qd->num_of_accum++;
631	}
632	}
633	break;
634	}
635
636	he = H->HE;
637	for (i = 0; i < H->NumItems; i++, he++)
638	{
639	*he->OrgWeight /= GetLowerApproxDocWeight (qd->awd, he->DocNum - 1);
640	he->Weight = *he->OrgWeight;
641	*he->OrgWeight = 0;
642	he->SeekPos = he->Len = 0;
643	}
644
645	Heap_Build (H);
646
647	he = H->HE;
648	switch (rqi->AccumMethod)
649	{
650	case 'S':
651	{
652	for (i = MaxDocs; i < ST->no_of_items; i++, ide = SP_get_next (ST))
653	{
654	ide->Sum /= GetLowerApproxDocWeight (qd->awd, ide->DocNum);
655	qd->num_of_accum++;
656	if (ide->Sum <= he->Weight)
657	continue;
658	*he->OrgWeight = he->Weight;
659	he->DocNum = ide->DocNum + 1;
660	he->Weight = ide->Sum;
661	he->OrgWeight = &ide->Sum;
662	*he->OrgWeight = 0;
663	Heap_Heapify (H, 1);
664	}
665	}
666	break;
667	case 'A':
668	{
669	for (i = MaxDocs; i < qd->sd->sdh.num_of_docs; i++, fptr++)
670	{
671	if (!*fptr)
672	continue;
673	qd->num_of_accum++;
674	*fptr /= GetLowerApproxDocWeight (qd->awd, i);
675	if (*fptr <= he->Weight)
676	continue;
677	*he->OrgWeight = he->Weight;
678	he->DocNum = i + 1;
679	he->Weight = *fptr;
680	he->OrgWeight = fptr;
681	*he->OrgWeight = 0;
682	Heap_Heapify (H, 1);
683	}
684	}
685	break;
686	case 'H':
687	{
688	for (; ideh; ideh = ideh->next)
689	{
690	qd->num_of_accum++;
691	ideh->IDE.Sum /=
692	GetLowerApproxDocWeight (qd->awd, ideh->IDE.DocNum);
693	if (ideh->IDE.Sum <= he->Weight)
694	continue;
695	*he->OrgWeight = he->Weight;
696	he->DocNum = ideh->IDE.DocNum + 1;
697	he->Weight = ideh->IDE.Sum;
698	he->OrgWeight = &ideh->IDE.Sum;
699	*he->OrgWeight = 0;
700	Heap_Heapify (H, 1);
701	}
702	}
703	break;
704	case 'L':
705	{
706	for (i = MaxDocs; i < LT->num; i++, ide++)
707	{
708	qd->num_of_accum++;
709	ide->Sum /=
710	GetLowerApproxDocWeight (qd->awd, ide->DocNum);
711	if (ide->Sum <= he->Weight)
712	continue;
713	*he->OrgWeight = he->Weight;
714	he->DocNum = ide->DocNum + 1;
715	he->Weight = ide->Sum;
716	he->OrgWeight = &ide->Sum;
717	*he->OrgWeight = 0;
718	Heap_Heapify (H, 1);
719	}
720	}
721	break;
722	}
723
724
725	if (rqi->Exact && qd->id->ifh.InvfLevel != 3)
726	{
727	HeapEntry *he = H->HE;
728
729	for (i = 0; i < H->NumItems; i++, he++)
730	{
731	he->Weight = he->Weight *
732	GetLowerApproxDocWeight (qd->awd, he->DocNum - 1) /
733	FetchDocStart (qd, he->DocNum, &he->SeekPos, &he->Len);
734	}
735
736	Heap_Build (H);
737
738	he = H->HE;
739
740	switch (rqi->AccumMethod)
741	{
742	case 'S':
743	{
744	ide = SP_get_first (ST);
745	for (i = 0; i < ST->no_of_items; i++, ide = SP_get_next (ST))
746	{
747	u_long SeekPos, Len;
748	float Weight;
749	if (!ide->Sum)
750	continue;
751	if (ide->Sum <= he->Weight)
752	continue;
753	Weight = ide->Sum *
754	GetLowerApproxDocWeight (qd->awd, ide->DocNum) /
755	FetchDocStart (qd, ide->DocNum + 1, &SeekPos, &Len);
756	if (Weight <= he->Weight)
757	continue;
758	he->DocNum = ide->DocNum + 1;
759	he->OrgWeight = &ide->Sum;
760	he->Weight = Weight;
761	he->SeekPos = SeekPos;
762	he->Len = Len;
763	ide->Sum = 0;
764	Heap_Heapify (H, 1);
765	}
766	}
767	break;
768
769	/* up to here */
770
771	case 'A':
772	{
773	fptr = AccumulatedWeights;
774	for (i = 0; i < qd->sd->sdh.num_of_docs; i++, fptr++)
775	{
776	u_long SeekPos, Len;
777	float Weight;
778	if (!*fptr)
779	continue;
780	if (*fptr <= he->Weight)
781	continue;
782	Weight = fptr
783	GetLowerApproxDocWeight (qd->awd, i) /
784	FetchDocStart (qd, i + 1, &SeekPos, &Len);
785	if (Weight <= he->Weight)
786	continue;
787	he->DocNum = i + 1;
788	he->OrgWeight = fptr;
789	he->Weight = Weight;
790	he->SeekPos = SeekPos;
791	he->Len = Len;
792	*fptr = 0;
793	Heap_Heapify (H, 1);
794	}
795	}
796	break;
797	case 'H':
798	{
799	ideh = HT->Head;
800	for (ideh = HT->Head; ideh; ideh = ideh->next)
801	{
802	u_long SeekPos, Len;
803	float Weight;
804	if (!ideh->IDE.Sum)
805	continue;
806	if (ideh->IDE.Sum <= he->Weight)
807	continue;
808	Weight = ideh->IDE.Sum *
809	GetLowerApproxDocWeight (qd->awd, ideh->IDE.DocNum) /
810	FetchDocStart (qd, ideh->IDE.DocNum + 1, &SeekPos, &Len);
811	if (Weight <= he->Weight)
812	continue;
813	he->DocNum = ideh->IDE.DocNum + 1;
814	he->OrgWeight = &ideh->IDE.Sum;
815	he->Weight = Weight;
816	he->SeekPos = SeekPos;
817	he->Len = Len;
818	ideh->IDE.Sum = 0;
819	Heap_Heapify (H, 1);
820	}
821	}
822	break;
823	case 'L':
824	{
825	ide = LT->IDE;
826	for (i = 0; i < LT->num; i++, ide++)
827	{
828	u_long SeekPos, Len;
829	float Weight;
830	if (!ide->Sum)
831	continue;
832	if (ide->Sum <= he->Weight)
833	continue;
834	Weight = ide->Sum *
835	GetLowerApproxDocWeight (qd->awd, ide->DocNum) /
836	FetchDocStart (qd, ide->DocNum + 1, &SeekPos, &Len);
837	if (Weight <= he->Weight)
838	continue;
839	he->DocNum = ide->DocNum + 1;
840	he->OrgWeight = &ide->Sum;
841	he->Weight = Weight;
842	he->SeekPos = SeekPos;
843	he->Len = Len;
844	ide->Sum = 0;
845	Heap_Heapify (H, 1);
846	}
847	}
848	break;
849	}
850	}
851
852
853
854	H->HC = Heap_Greater;
855	Heap_Build (H);
856
857
858	MaxDocs = H->NumItems;
859	if (rqi->MaxDocsToRetrieve != -1 && MaxDocs > rqi->MaxDocsToRetrieve)
860	MaxDocs = rqi->MaxDocsToRetrieve;
861
862	/* Alarm */
863
864	he = H->HE;
865	BackEnd = H->NumItems - 1;
866	NumExact = 0;
867	MaxExact = H->NumItems;
868	NumParas = 0;
869	Para_ST = SP_createset (DE_comp);
870	while (H->NumItems && Docs->num < MaxDocs)
871	{
872	DocEntry DocEnt;
873	DocEntry *mem;
874
875	if (rqi->Exact)
876	{
877	if (H->HE[0].SeekPos == 0)
878	NumExact += Make_Exact_Root (qd, H);
879	}
880	else
881	FetchDocStart (qd, he->DocNum, &he->SeekPos, &he->Len);
882
883	NumParas++;
884
885	DocEnt.DocNum = he->DocNum;
886	DocEnt.Weight = he->Weight;
887	DocEnt.Len = he->Len;
888	DocEnt.SeekPos = he->SeekPos;
889	DocEnt.CompTextBuffer = NULL;
890	DocEnt.Next = NULL;
891
892	Heap_DeleteHead (H);
893
894	if (!(mem = SP_member (&DocEnt, Para_ST)))
895	{
896	Docs->DE[Docs->num] = DocEnt;
897	SP_insert (&Docs->DE[Docs->num], Para_ST);
898	Docs->num++;
899	}
900	else
901	{
902	DocEnt.Next = mem->Next;
903	Docs->DE[BackEnd] = DocEnt;
904	mem->Next = &Docs->DE[BackEnd--];
905	}
906	}
907	SP_freeset (Para_ST);
908
909	if (qd->id->ifh.InvfLevel == 3)
910	{
911	Message ("%d Paragraphs were required to get %d documents",
912	NumParas, Docs->num);
913	if (NumExact == MaxExact)
914	{
915	Message ("The exact weights of all %d paragraphs had to be calculated", MaxExact);
916	Message ("to obtain %d paragraphs. This may mean that the the documents", NumParas);
917	Message ("returned do not necessarly represent an exact cosine ranking.");
918	Message ("This problem may be corrected by increasing \'maxparas\'.");
919	}
920	}
921	#if 0
922	{
923	int i;
924	FILE *f = fopen ("top.paras", "w");
925	fprintf (f, "=========================\nTop Paragraphs\n");
926	for (i = 0; i < Docs->num; i++)
927	{
928	DocEntry *d;
929	fprintf (f, "<%d(%f)> ", Heap[i].DocNum, Heap[i].Weight);
930	for (d = Heap[i].Next; d; d = d->Next)
931	fprintf (f, "%d(%f) ", d->DocNum, d->Weight);
932	fprintf (f, "\n");
933	}
934	fprintf (f, "=========================\n");
935	fclose (f);
936	}
937	#endif
938
939	if (AccumulatedWeights)
940	{
941	Xfree (AccumulatedWeights);
942	ChangeMemInUse (qd, -sizeof (float) * qd->sd->sdh.num_of_docs);
943	}
944	if (ST)
945	{
946	int mem = ST->mem_in_use;
947	SP_freeset (ST);
948	ChangeMemInUse (qd, -mem);
949	free_ide_pool (qd, &ide_pool);
950	}
951	if (HT)
952	HT_free (qd, HT);
953
954	if (LT)
955	LT_free (qd, LT);
956
957	if (H) Xfree (H); /* [RJM 07/98: Memory Leak] */
958
959	return (Docs);
960	}
961
962
963
964
965	/* if MaxDocs == -1 it means all */
966	void
967	RankedQuery (query_data qd, char Query, RankedQueryInfo *rqi)
968	{
969	DocList *dl;
970
971	if (qd->TL)
972	FreeTermList (&(qd->TL));
973
974	/* [RPAP - Feb 97: Term Frequency] */
975	if (qd->QTL)
976	FreeQueryTermList (&(qd->QTL));
977
978	qd->TL = ParseRankedQuery (qd->sd, Query, rqi->Sort, qd->sd->sdh.indexed, /* [RPAP - Jan 97: Stem Index Change] */
979	&(qd->QTL)); /* [RPAP - Feb 97: Term Frequency] */
980
981	/* PrintTermList (qd->TL, stderr); */
982
983	dl = CosineGet (qd, qd->TL, rqi);
984
985	if (!dl)
986	FatalError (1, "Out of memory\n");
987
988	FreeQueryDocs (qd);
989
990	qd->DL = dl;
991	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: