source: trunk/gsdl/src/mgpp/text/query.ranked.cpp @ 711

Last change on this file since 711 was 711, checked in by cs025, 25 years ago

Changes to eradicate Xmalloc

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 25.8 KB
1/**************************************************************************
2 *
3 * query.ranked.c -- Ranked query evaluation
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: query.ranked.cpp 711 1999-10-17 23:43:31Z cs025 $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25
26#include "memlib.h"
27#include "filestats.h"
28#include "messages.h"
29#include "timing.h"
30#include "sptree.h"
31#include "unitool.h"
32
33#include "mg.h"
34#include "invf.h"
35#include "text.h"
36#include "lists.h"
37#include "backend.h"
38#include "DocEntry.h"
39#include "stem_search.h"
40#include "weights.h"
41#include "text_get.h"
42#include "invf_get.h"
43#include "words.h"
44#include "stemmer.h"
45#include "locallib.h"
46#include "environment.h"
47#include "term_lists.h"
48#include "local_strings.h"
49#include "query_term_list.h" /* [RPAP - Feb 97: Term Frequency] */
50
51/*
52 $Log$
53 Revision 1.3 1999/10/17 23:43:29 cs025
54 Changes to eradicate Xmalloc
55
56 Revision 1.2 1999/10/12 04:13:52 cs025
57 Calls to GetEnv eradicated; interface to parsing of query text
58 cleaned up.
59
60 Revision 1.1 1999/10/11 02:58:14 cs025
61 Base install of MG-PP
62
63 Revision 1.1 1999/08/10 21:18:20 sjboddie
64 renamed mg-1.3d directory mg
65
66 Revision 1.2 1998/11/25 07:55:50 rjmcnab
67
 68 Modified mg so that you can specify the stemmer you want
69 to use via a command line option. You specify it to
70 mg_passes during the build process. The number of the
71 stemmer that you used is stored within the inverted
72 dictionary header and the stemmed dictionary header so
73 the correct stemmer is used in later stages of building
74 and querying.
75
76 Revision 1.1 1998/11/17 09:35:34 rjmcnab
77 *** empty log message ***
78
79 * Revision 1.4 1994/11/25 03:47:46 tes
80 * Committing files before adding the merge stuff.
81 *
82 * Revision 1.3 1994/10/20 03:57:03 tes
83 * I have rewritten the boolean query optimiser and abstracted out the
84 * components of the boolean query.
85 *
86 * Revision 1.2 1994/09/20 04:42:04 tes
87 * For version 1.1
88 *
89 */
90
91static char *RCSID = "$Id: query.ranked.cpp 711 1999-10-17 23:43:31Z cs025 $";
92
93/*************************************************************************/
94
95class HeapEntry : public loadableDocEntry
96 {
97 public:
98 float *OrgWeight;
99
100 HeapEntry()
101 {
102 // do nothing
103 }
104
105 inline void chain(int docno, float *weightPtr)
106 {
107 *OrgWeight = this->Weight;
108 DocNum = docno;
109 Weight = *weightPtr;
110 OrgWeight = weightPtr;
111 *OrgWeight = 0;
112 }
113
114 inline void setValues(int docno, float _weight, u_long filepos, u_long _len, float *weightPtr)
115 {
116 DocNum = docno;
117 OrgWeight = weightPtr;
118 Weight = _weight;
119 SeekPos = filepos;
120 Len = _len;
121 *weightPtr = 0;
122 }
123 }
124;
125
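/*
 * Aside: the OrgWeight pointer above is the subtle part of HeapEntry.
 * Each entry remembers the accumulator slot its document came from;
 * setValues() zeroes that slot when the document enters the heap, and
 * chain() writes the displaced entry's weight back into its old slot
 * before adopting a new document, so accumulator slots are zero exactly
 * for documents currently held in the heap.  A minimal, self-contained
 * sketch of the idea (simplified types, not the mgpp classes):
 */
#include <stdio.h>

struct MiniEntry
{
  int    DocNum;
  float  Weight;
  float *OrgWeight;            /* points at this document's accumulator slot */

  /* Document enters the heap: remember its slot and zero it. */
  void setValues (int docno, float w, float *slot)
  { DocNum = docno; Weight = w; OrgWeight = slot; *slot = 0; }

  /* A better document displaces this entry: restore the old document's
     weight to its slot, then take over the new document's slot. */
  void chain (int docno, float *slot)
  { *OrgWeight = Weight; DocNum = docno; Weight = *slot; OrgWeight = slot; *slot = 0; }
};

int main ()
{
  float accum[4] = { 0.9f, 0.4f, 0.7f, 0.8f };   /* per-document sums */
  MiniEntry e;
  e.setValues (1, accum[0], &accum[0]);          /* doc 1 held, accum[0] zeroed */
  e.chain (4, &accum[3]);                        /* doc 4 displaces doc 1       */
  /* accum[0] holds doc 1's weight (0.9) again; accum[3] is zero (doc 4 is held) */
  for (int i = 0; i < 4; i++)
    printf ("accum[%d] = %.1f\n", i, accum[i]);
  return 0;
}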
126typedef int (*HeapComp) (HeapEntry *, HeapEntry *);
127
128typedef struct Heap
129 {
130 int NumItems;
131 int MaxSize;
132 HeapComp HC;
133 HeapEntry *HE;
134
135 Heap(int size)
136 {
137 HE = new HeapEntry[size];
138 }
139
140 ~Heap()
141 {
142 if (HE != NULL)
143 delete [] HE; /* array form: HE was allocated with new HeapEntry[size] */
144 }
145 }
146Heap;
147
148
149Heap *
150Heap_Make (int size, HeapComp hc)
151{
152 Heap *H;
153 H = new Heap(size);
154 if (!H)
155 return NULL;
156 H->NumItems = 0;
157 H->MaxSize = size;
158 H->HC = hc;
159 return H;
160}
161
162int
163Heap_Size (Heap * H)
164{
165 return sizeof (Heap) + H->MaxSize * sizeof (HeapEntry);
166}
167
168HeapEntry *
169Heap_GetHead (Heap * H)
170{
171 if (H && H->NumItems)
172 return &H->HE[0];
173 else
174 return NULL;
175}
176
177
178
179void
180Heap_Heapify (Heap * H, int i)
181{
182 register int curr, child;
183 curr = i;
184 child = curr * 2;
185 while (child <= H->NumItems)
186 {
187 if (child < H->NumItems && H->HC (&H->HE[child], &H->HE[child - 1]) > 0)
188 child++;
189 if (H->HC (&H->HE[curr - 1], &H->HE[child - 1]) < 0)
190 {
191 HeapEntry temp = H->HE[child - 1];
192 H->HE[child - 1] = H->HE[curr - 1];
193 H->HE[curr - 1] = temp;
194 curr = child;
195 child = 2 * child;
196 }
197 else
198 break;
199 }
200}
201
202
203void
204Heap_Build (Heap * H)
205{
206 register int i;
207 for (i = H->NumItems / 2; i > 0; i--)
208 Heap_Heapify (H, i);
209}
210
211
212void
213Heap_Sort (Heap * H)
214{
215 register int i;
216 for (i = H->NumItems; i > 1; i--)
217 {
218 HeapEntry temp = H->HE[0];
219 H->HE[0] = H->HE[i - 1];
220 H->HE[i - 1] = temp;
221 H->NumItems--;
222 Heap_Heapify (H, 1);
223 }
224}
225
226
227void
228Heap_DeleteHead (Heap * H)
229{
230 H->HE[0] = H->HE[--H->NumItems];
231 Heap_Heapify (H, 1);
232}
233
234int
235Heap_Lesser (HeapEntry * a, HeapEntry * b)
236{
237 return (a->Weight > b->Weight ? -1 :
238 (a->Weight == b->Weight ? 0 : 1));
239}
240
241
242int
243Heap_Greater (HeapEntry * a, HeapEntry * b)
244{
245 return (a->Weight > b->Weight ? 1 :
246 (a->Weight == b->Weight ? 0 : -1));
247}
248
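/*
 * The two comparators serve different phases: Heap_Lesser keeps the current
 * k best candidates as a min-heap whose root is the weakest kept entry (so a
 * challenger only has to beat HE[0]), and once selection is finished the
 * comparator is switched to Heap_Greater so the best entry sits at the root
 * for extraction.  A self-contained sketch of that top-k pattern on plain
 * floats (hypothetical helper names, the same 1-origin index arithmetic as
 * Heap_Heapify above, not the mgpp structures):
 */
#include <stdio.h>

typedef int (*FloatComp) (const float *, const float *);

static int flt_lesser (const float *a, const float *b)
{ return *a > *b ? -1 : (*a == *b ? 0 : 1); }

static int flt_greater (const float *a, const float *b)
{ return *a > *b ? 1 : (*a == *b ? 0 : -1); }

/* Same 1-origin sift-down as Heap_Heapify, on a plain float array. */
static void heapify (float *he, int n, int i, FloatComp hc)
{
  int curr = i, child = 2 * i;
  while (child <= n)
    {
      if (child < n && hc (&he[child], &he[child - 1]) > 0)
        child++;
      if (hc (&he[curr - 1], &he[child - 1]) < 0)
        {
          float t = he[child - 1]; he[child - 1] = he[curr - 1]; he[curr - 1] = t;
          curr = child; child = 2 * child;
        }
      else
        break;
    }
}

static void build (float *he, int n, FloatComp hc)
{ for (int i = n / 2; i > 0; i--) heapify (he, n, i, hc); }

int main ()
{
  const float cand[] = { 0.2f, 0.9f, 0.4f, 0.7f, 0.1f, 0.8f, 0.6f };
  const int ncand = (int) (sizeof (cand) / sizeof (cand[0]));
  const int k = 3;
  float top[k];

  /* Phase 1: hold the k best seen so far in a min-heap, so a new
     candidate only has to beat the weakest kept entry, top[0]. */
  for (int i = 0; i < k; i++) top[i] = cand[i];
  build (top, k, flt_lesser);
  for (int i = k; i < ncand; i++)
    if (cand[i] > top[0]) { top[0] = cand[i]; heapify (top, k, 1, flt_lesser); }

  /* Phase 2: switch the comparator, rebuild as a max-heap and pop the best
     first -- the same move the ranking code below makes with Heap_Greater
     and Heap_DeleteHead before emitting its DocList. */
  build (top, k, flt_greater);
  for (int n = k; n > 0; n--)
    {
      printf ("%.1f ", top[0]);            /* prints 0.9 0.8 0.7 */
      top[0] = top[n - 1];                 /* Heap_DeleteHead    */
      heapify (top, n - 1, 1, flt_greater);
    }
  printf ("\n");
  return 0;
}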
249int
250Make_Exact_Root (query_data * qd, Heap * H)
251{
252 int num = 0;
253 HeapEntry *he = H->HE;
254 while (he->SeekPos == 0)
255 {
256 he->Weight = he->Weight *
257 GetLowerApproxDocWeight (qd->awd, he->docNum() - 1) /
258 he->FetchStart (qd);
259 Heap_Heapify (H, 1);
260 num++;
261 }
262 return num;
263}
264
265void
266Heap_Dump (Heap * H, int num)
267{
268 int i, l, lines, r;
269 if (num > H->NumItems)
270 num = H->NumItems;
271 lines = (num + 3) / 4;
272 for (l = 0; l < lines; l++)
273 for (r = 0; r < 4; r++)
274 {
275 i = lines * r + l;
276 if (i < num)
277 fprintf (stderr, "[%2d] %7.4f", i, H->HE[i].Weight);
278 fprintf (stderr, r == 3 ? "\n" : " ");
279 }
280 fprintf (stderr, "\n");
281}
282
283int Heap_chainWeight(Heap *H, HeapEntry *he, query_data *qd, int DocNum, float *SumPtr)
284{
285 *SumPtr = (*SumPtr) / GetLowerApproxDocWeight (qd->awd, DocNum - 1);
286 qd->num_of_accum ++;
287 if (*SumPtr <= he->docWeight())
288 return 0;
289
290 he->chain(DocNum, SumPtr);
291
292 Heap_Heapify (H, 1);
293 return 1;
294}
295
296int Heap_plainWeight(Heap *H, HeapEntry *he, query_data *qd, int DocNum, float *SumPtr)
297{
298 unsigned long SeekPos, Len;
299 float Weight;
300
301 if (*SumPtr == 0)
302 return 0;
303 if (*SumPtr <= he->docWeight())
304 return 0;
305
306 Weight = (*SumPtr) *
307 GetLowerApproxDocWeight (qd->awd, DocNum-1) /
308 FetchDocStart (qd, DocNum, &SeekPos, &Len);
309
310 if (Weight <= he->docWeight())
311 return 0;
312
313 he->setValues(DocNum, Weight, SeekPos, Len, SumPtr);
314 Heap_Heapify (H, 1);
315
316 return 1;
317}
318
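/*
 * Note on the three helpers above (Make_Exact_Root, Heap_chainWeight and
 * Heap_plainWeight).  Reading the code, the accumulator slot for document d
 * holds the sum S_d contributed by the query terms; GetLowerApproxDocWeight
 * returns a cheap lower approximation ~W_d of the document weight, and
 * FetchDocStart / FetchStart returns, along with the seek position, what
 * appears to be the exact weight W_d.  In LaTeX notation the two ranking
 * values being juggled are
 *
 *     approx(d) = \frac{S_d}{\tilde W_d}
 *
 *     exact(d)  = \frac{S_d}{W_d} = approx(d) \cdot \frac{\tilde W_d}{W_d}
 *
 * which is the rescaling performed by Make_Exact_Root and Heap_plainWeight,
 * while Heap_chainWeight works purely with approx(d).
 */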
319/*************************************************************************/
320
321
322
323
324
325
326int
327doc_count_comp (const void *A, const void *B)
328{
329 const TermEntry *a = (TermEntry *) A;
330 const TermEntry *b = (TermEntry *) B;
331 return (a->WE.doc_count - b->WE.doc_count);
332}
333
334/* =========================================================================
335 * Function: ParseRankedQuery
336 * Description:
337 * Takes a string query line and extracts the terms in the query
338 * which exist in the stemmed dictionary.
340 * Optionally sorts the terms into ascending order by document count (doc_count).
340 * Input:
341 * stemmed dictionary, query line, sort-flag
342 * Output:
343 * A list of terms.
344 * ========================================================================= */
345
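/*
 * The parser below handles words as length-prefixed u_char strings: the
 * first byte is the length, which is why MAXSTEMLEN + 1 byte buffers are
 * used and why a word is copied with bcopy (Word, sWord, *Word + 1).
 * A minimal sketch of that representation (the helpers are hypothetical,
 * not the mgpp parsing macros):
 */
#include <string.h>
#include <stdio.h>

typedef unsigned char u_char;

/* Build a length-prefixed word from a C string (truncated to 255 bytes). */
static void make_lp (u_char *dst, const char *src)
{
  size_t n = strlen (src);
  if (n > 255) n = 255;
  dst[0] = (u_char) n;
  memcpy (dst + 1, src, n);
}

/* Copy a length-prefixed word: the length byte plus that many data bytes,
   i.e. *src + 1 bytes in all -- the same count the bcopy call uses. */
static void copy_lp (u_char *dst, const u_char *src)
{ memcpy (dst, src, *src + 1); }

int main ()
{
  u_char word[256], copy[256];
  make_lp (word, "ranking");
  copy_lp (copy, word);
  printf ("len = %d, text = %.*s\n", copy[0], copy[0], (const char *) copy + 1);
  return 0;
}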
346/* [RPAP - Jan 97: Stem Index Change] */
347static TermList *
348ParseRankedQuery (query_data *qd, char *QueryLine,
349 RankedQueryInfo *rqi,
350 QueryTermList **query_term_list) /* [RPAP - Feb 97: Term Frequency] */
351{
352 u_char Word[MAXSTEMLEN + 1];
353 u_char sWord[MAXSTEMLEN + 1];
354 u_char *end, *s_in;
355 int default_stem_method = 0;
356 TermList *Terms = TermList_create(0);
357
358 s_in = (u_char *) QueryLine;
359 end = s_in + strlen ((char *) s_in) - 1;
360 *query_term_list = QueryTermList_create(0); /* [RPAP - Feb 97: Term Frequency] */
361
362 /* [RPAP - Jan 97: Stem Index Change] */
363 if (qd->sd->sdh.indexed)
364 default_stem_method = rqi->getWordSpecial();
365 //BooleanEnv (GetEnv ("casefold"), 0) | (BooleanEnv (GetEnv ("stem"), 0) << 1);
366 else
367 default_stem_method = qd->sd->sdh.stem_method;
368
369 while (s_in <= end)
370 {
371 int j;
372 long num_entries, word_num;
373 unsigned long count, doc_count, invf_ptr, invf_len;
374 int weight_to_apply, stem_to_apply;
375 int method_using = -1;
376
377 /* 0=optional, 1=mustmatch */
378 int require_match = 0; /* [RJM 07/97: Ranked Required Terms] */
379
380 /* Skip over the non word separator taking note of any parameters */
381 PARSE_RANKED_NON_STEM_WORD (require_match, s_in, end); /* [RJM 07/97: Ranked Required Terms] */
382 if (s_in > end) break;
383
384 /* Get a word and stem it */
385 PARSE_STEM_WORD (Word, s_in, end);
386
387 /* Extract any parameters */
388 weight_to_apply = 1;
389 stem_to_apply = default_stem_method;
390 while (s_in <= end)
391 {
392 int stem_param, weight_param, param_type;
393 char param[MAXPARAMLEN + 1];
394
395 param_type = 0;
396 PARSE_OPT_TERM_PARAM (param, param_type, s_in, end);
397 if (!param_type)
398 break;
399
400 switch (param_type)
401 {
402 case (WEIGHTPARAM):
403 weight_param = atoi (param);
404 if (errno != ERANGE && weight_param > 0)
405 weight_to_apply = weight_param;
406 break;
407
408 case (STEMPARAM):
409 stem_param = atoi (param);
410 if (errno != ERANGE && qd->sd->sdh.indexed && stem_param >= 0 && stem_param <= 3)
411 method_using = stem_to_apply = stem_param;
412 break;
413 }
414 }
415
416 bcopy ((void *) Word, (void *) sWord, *Word + 1);
417 stemmer (stem_to_apply, qd->sd->sdh.stemmer_num, sWord);
418
419 if (!qd->sd->sdh.indexed || stem_to_apply == 0)
420 {
421 /* Look for the word in the already identified terms */
422 for (j = 0; j < Terms->size(); j++)
423 if (compare (Terms->TE[j].Word, Word) == 0)
424 break;
425
426 /* Increment the weight if the word is in the list */
427 /* Update the require match attribute */
428 if (j < Terms->size())
429 {
430 Terms->TE[j].Count = ((Terms->TE[j].Count + weight_to_apply > INT_MAX) ?
431 INT_MAX : (Terms->TE[j].Count + weight_to_apply));
432 Terms->TE[j].require_match = require_match; /* [RJM 07/97: Ranked Require match] */
433 QueryTermList_AddQueryTerm (query_term_list, Word, Terms->TE[j].WE.count, method_using); /* [RPAP - Feb 97: Term Frequency] */
434 }
435 else
436 {
437 /* Look for it in the stemmed dictionary */
438 if ((word_num = FindWord (qd->sd, sWord, &count, &doc_count,
439 &invf_ptr, &invf_len)) != -1)
440 {
441 /* Search the list for the word */
442 for (j = 0; j < Terms->size(); j++)
443 if (Terms->TE[j].WE.word_num == word_num)
444 break;
445
446 /* Increment the weight if the word is in the list */
447 if (j < Terms->size())
448 {
449 Terms->TE[j].Count = ((Terms->TE[j].Count + weight_to_apply > INT_MAX) ?
450 INT_MAX : (Terms->TE[j].Count + weight_to_apply));
451 Terms->TE[j].require_match = require_match; /* [RJM 07/97: Ranked Require match] */
452 QueryTermList_AddQueryTerm (query_term_list, Word, Terms->TE[j].WE.count, method_using); /* [RPAP - Feb 97: Term Frequency] */
453 }
454 else
455 {
456 /* Create a new entry in the list for the new word */
457 TermEntry te;
458
459 te.WE.word_num = word_num;
460 te.WE.count = count;
461 te.WE.doc_count = doc_count;
462 te.WE.max_doc_count = doc_count;
463 te.WE.invf_ptr = invf_ptr;
464 te.WE.invf_len = invf_len;
465 te.Count = weight_to_apply;
466 te.Word = copy_string (Word);
467 if (!te.Word)
468 FatalError (1, "Could NOT create memory to add term");
469 te.Stem = NULL;
470 te.require_match = require_match;
471
472 TermList_AddTermEntry (Terms, &te);
473
474 /* [RPAP - Feb 97: Term Frequency] */
475 QueryTermList_AddQueryTerm (query_term_list, Word, count, method_using);
476 }
477 }
478 /* [RPAP - Feb 97: Term Frequency] */
479 else
480 QueryTermList_AddQueryTerm (query_term_list, Word, 0, method_using);
481 }
482 }
483 else
484 {
485 int total_count = 0; /* [RPAP - Feb 97: Term Frequency] */
486 TermList *tempList = TermList_create (0);
487
488 if ((num_entries = FindWords (qd->sd, sWord, stem_to_apply, &tempList)) > 0)
489 {
490 int i;
491 u_long max_doc_count = 0;
492
493 /* get the maximum doc count */
494 for (i = 0; i < tempList->size(); i++)
495 {
496 if (tempList->TE[i].WE.doc_count > max_doc_count)
497 max_doc_count = tempList->TE[i].WE.doc_count;
498 total_count += tempList->TE[i].WE.count; /* [RPAP - Feb 97: Term Frequency] */
499 }
500
501 for (i = 0; i < tempList->size(); i++)
502 {
503 /* Look for the word(s) in the already identified terms */
504 for (j = 0; j < Terms->size(); j++)
505 {
506 if (compare (Terms->TE[j].Word, tempList->TE[i].Word) == 0)
507 {
508 /* found the word */
509 /* Modify weight */
510 Terms->TE[j].Count = ((Terms->TE[j].Count + weight_to_apply > INT_MAX) ?
511 INT_MAX : (Terms->TE[j].Count + weight_to_apply));
512 if (Terms->TE[j].WE.max_doc_count < max_doc_count)
513 Terms->TE[j].WE.max_doc_count = max_doc_count;
514 break;
515 }
516 }
517
518 if (j == Terms->size())
519 {
520 /* word was not found */
521 tempList->TE[i].WE.max_doc_count = max_doc_count;
522 tempList->TE[i].Count = weight_to_apply;
523
524 /* We cannot require a term to match if it is expanded */
525 /* into multiple terms :-( */
526 tempList->TE[i].require_match = 0; /* [RJM 07/97: Ranked Required Terms] */
527
528 TermList_AddTermEntry (Terms, &tempList->TE[i]);
529 }
530 }
531 }
532
533 /* [RPAP - Feb 97: Term Frequency] */
534 QueryTermList_AddQueryTerm (query_term_list, Word, total_count, method_using);
535
536 if (tempList != NULL) Xfree(tempList); /* [RJM 07/98: Memory Leak] */
537 } /* end indexed */
538 } /* end while */
539
540 if (rqi->Sort)
541 /* Sort the terms in ascending order by doc_count */
542 qsort (Terms->TE, Terms->size(), sizeof (TermEntry), doc_count_comp);
543 return (Terms);
544}
545
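/*
 * On the stem methods accepted above: stem_param is range-checked to 0..3,
 * and the commented-out line near the top of ParseRankedQuery builds the
 * default as casefold | (stem << 1).  That suggests the value is a two-bit
 * mask -- bit 0 = case-folding, bit 1 = stemming -- though that reading is
 * an inference from this file, not something it states.  A hypothetical
 * helper illustrating that interpretation:
 */
#include <stdio.h>

enum { MG_CASEFOLD = 1, MG_STEM = 2 };   /* assumed meaning of the two bits */

static const char *describe_stem_method (int method)
{
  switch (method & (MG_CASEFOLD | MG_STEM))
    {
    case 0:           return "no case-folding, no stemming";
    case MG_CASEFOLD: return "case-fold only";
    case MG_STEM:     return "stem only";
    default:          return "case-fold and stem";
    }
}

int main ()
{
  for (int m = 0; m <= 3; m++)
    printf ("stem method %d: %s\n", m, describe_stem_method (m));
  return 0;
}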
546
547/*
548 * This function is given a list of term numbers and it returns a list
549 * of document numbers based on the cosine document weighting system.
550 * The entries are accumulated in an array (or another accumulator
551 * structure, depending on rqi->AccumMethod) as the inverted file is decoded.
552 * If MaxDocs == -1 then it means all.
553 */
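/*
 * In the usual MG formulation the quantity being ranked is (up to the query
 * norm, which is constant across documents and so can be ignored)
 *
 *     rank(q, d) = \frac{1}{W_d} \sum_{t \in q} w_{q,t} \, w_{d,t}
 *
 * The accumulator structure chosen below builds up the sum S_d for each
 * candidate document; division by the document weight W_d is done first with
 * the cheap lower approximation and, if rqi->Exact is set, redone with the
 * exact weight.  (This is a presumed reading of the code, not a statement
 * taken from it.)
 */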
554static DocList *
555CosineGet (query_data * qd, TermList * Terms, RankedQueryInfo * rqi) {
556 DocList *Docs;
557 float *AccumulatedWeights = NULL;
558 Splay_Tree *ST = NULL;
559 Splay_Tree *Para_ST = NULL;
560 Hash_Table *HT = NULL;
561 List_Table *LT = NULL;
562 Heap *H;
563 HeapEntry *he;
564 register float *fptr = NULL;
565 register Invf_Doc_Entry *ide = NULL;
566 register Invf_Doc_EntryH *ideh = NULL;
567 int BackEnd, NumExact, MaxExact, NumParas;
568 int MaxDocs = 0, MaxParas = 0;
569 int i;
570 Invf_Doc_Entry_Pool ide_pool;
571 ide_pool.pool = NULL;
572
573 qd->hops_taken = qd->num_of_ptrs = qd->num_of_accum = 0;
574
575 switch (rqi->AccumMethod)
576 {
577 case 'S':
578 ST = CosineDecodeSplay (qd, Terms, rqi, &ide_pool);
579 if (!ST)
580 return NULL;
581 break;
582 case 'A':
583 AccumulatedWeights = CosineDecode (qd, Terms, rqi);
584 if (!AccumulatedWeights)
585 return NULL;
586 break;
587 case 'H':
588 HT = CosineDecodeHash (qd, Terms, rqi);
589 if (!HT)
590 return NULL;
591 break;
592 case 'L':
593 LT = CosineDecodeList (qd, Terms, rqi);
594 if (!LT)
595 return NULL;
596 break;
597 }
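  /* The AccumMethod codes select how the per-document sums are accumulated
     while the inverted lists are decoded: 'S' keeps them in a splay tree,
     'A' in a flat array with one float per document, 'H' in a hash table
     and 'L' in a list.  The rest of this function only differs in how it
     walks whichever structure was chosen. */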
598
599#if 0
600 if (rqi->UseSplayTree)
601 {
602
603 AccumulatedWeights = CosineDecode (qd, Terms, rqi);
604 fptr = AccumulatedWeights;
605 ide = SP_get_first (ST);
606 for (i = 0; i < qd->sd->sdh.num_of_docs; i++)
607 {
608 if (AccumulatedWeights[i] != 0)
609 {
610 if (i != ide->DocNum)
611 fprintf (stderr, "Sum mismatch for %d %f %d %f\n", i + 1,
612 AccumulatedWeights[i], ide->docNum() + 1, ide->docWeight());
613 ide = SP_get_next (ST);
614 }
615 }
616 }
617#endif
618
619 switch (rqi->AccumMethod)
620 {
621 case 'S':
622 MaxParas = ST->no_of_items;
623 break;
624 case 'A':
625 { /* count the number of non-zero document weights */
626 register int i = qd->sd->sdh.num_of_docs;
627 register float *d;
628 MaxParas = 0;
629 for (d = AccumulatedWeights; i; i--, d++)
630 {
631 if (*d)
632 {
633 MaxParas++;
634 }
635 }
636 }
637 break;
638 case 'H':
639 MaxParas = HT->num + HT->Suplimentary_Num;
640 break;
641 case 'L':
642 MaxParas = LT->num;
643 break;
644 }
645
646 if (rqi->MaxParasToRetrieve != -1 && MaxParas > rqi->MaxParasToRetrieve)
647 MaxParas = rqi->MaxParasToRetrieve;
648 MaxDocs = MaxParas;
649
650 /* Allocate memory for the heap */
651 Docs = DocList_create (MaxDocs);
652 ChangeMemInUse (qd, sizeof (DocEntry) * MaxDocs);
653
654 H = Heap_Make (MaxDocs, Heap_Lesser);
655
656
657 /* Get the sums from the accumulator, divide each sum by the
658 document weight (retrieved from the ".idx.wgt" file) and put
659 the resulting data into a heap */
660 he = H->HE;
661 H->NumItems = MaxDocs;
662 switch (rqi->AccumMethod)
663 {
664 case 'S':
665 {
666 ide = (Invf_Doc_Entry *) SP_get_first (ST);
667 for (i = 0; i < H->NumItems; i++, ide = (Invf_Doc_Entry *) SP_get_next (ST), he++)
668 {
669 he->DocNum = ide->docNum() + 1;
670 he->OrgWeight = &ide->Weight;
671 qd->num_of_accum++;
672 }
673 }
674 break;
675 case 'A':
676 {
677 fptr = AccumulatedWeights;
678 for (i = 0; i < H->NumItems; i++, fptr++, he++)
679 {
680 he->DocNum = i + 1;
681 he->OrgWeight = fptr;
682 if (*fptr)
683 qd->num_of_accum++;
684 }
685 }
686 break;
687 case 'H':
688 {
689 ideh = HT->Head;
690 for (i = 0; i < H->NumItems; i++, ideh = ideh->next, he++)
691 {
692 he->DocNum = ideh->DocNum + 1;
693 he->OrgWeight = &ideh->Weight;
694 qd->num_of_accum++;
695 }
696 }
697 break;
698 case 'L':
699 {
700 ide = LT->IDE;
701 for (i = 0; i < H->NumItems; i++, ide++, he++)
702 {
703 he->DocNum = ide->DocNum + 1;
704 he->OrgWeight = &ide->Weight;
705 qd->num_of_accum++;
706 }
707 }
708 break;
709 }
710
711 he = H->HE;
712 for (i = 0; i < H->NumItems; i++, he++)
713 {
714 *he->OrgWeight /= GetLowerApproxDocWeight (qd->awd, he->DocNum - 1);
715 he->Weight = *he->OrgWeight;
716 *he->OrgWeight = 0;
717 he->SeekPos = he->Len = 0;
718 }
719
720 Heap_Build (H);
721
722 he = H->HE;
723 switch (rqi->AccumMethod)
724 {
725 case 'S':
726 {
727 for (i = MaxDocs; i < ST->no_of_items; i++, ide = (Invf_Doc_Entry *) SP_get_next (ST))
728 {
729 /*
730 ide->Sum /= GetLowerApproxDocWeight (qd->awd, ide->DocNum);
731 qd->num_of_accum++;
732 if (ide->Sum <= he->Weight)
733 continue;
734 */
735 Heap_chainWeight(H, he, qd, ide->DocNum + 1, &ide->Weight);
736 /*
737 *he->OrgWeight = he->Weight;
738 he->DocNum = ide->DocNum + 1;
739 he->Weight = ide->Sum;
740 he->OrgWeight = &ide->Sum;
741 *he->OrgWeight = 0;
742 Heap_Heapify (H, 1);
743 */
744 }
745 }
746 break;
747 case 'A':
748 {
749 for (i = MaxDocs; i < qd->sd->sdh.num_of_docs; i++, fptr++)
750 {
751 if (!*fptr)
752 continue;
753 /*
754 *fptr /= GetLowerApproxDocWeight (qd->awd, i);
755 qd->num_of_accum++;
756 if (*fptr <= he->Weight)
757 continue;
758 */
759 Heap_chainWeight(H, he, qd, i+1, fptr);
760 /*
761 *he->OrgWeight = he->Weight;
762 he->DocNum = i + 1;
763 he->Weight = *fptr;
764 he->OrgWeight = fptr;
765 *he->OrgWeight = 0;
766 Heap_Heapify (H, 1);
767 */
768 }
769 }
770 break;
771 case 'H':
772 {
773 for (; ideh != NULL; ideh = ideh->next)
774 {
775 /*
776 ideh->Sum /=
777 GetLowerApproxDocWeight (qd->awd, ideh->DocNum);
778 qd->num_of_accum++;
779 if (ideh->Sum <= he->Weight)
780 continue;
781 */
782 Heap_chainWeight(H, he, qd, ideh->DocNum + 1, &ideh->Weight);
783 /*
784 *he->OrgWeight = he->Weight;
785 he->DocNum = ideh->DocNum + 1;
786 he->Weight = ideh->Sum;
787 he->OrgWeight = &ideh->Sum;
788 *he->OrgWeight = 0;
789 Heap_Heapify (H, 1);
790 */
791 }
792 }
793 break;
794 case 'L':
795 {
796 for (i = MaxDocs; i < LT->num; i++, ide++)
797 {
798 /*
799 ide->Sum /=
800 GetLowerApproxDocWeight (qd->awd, ide->DocNum);
801 qd->num_of_accum++;
802 if (ide->Sum <= he->Weight)
803 continue;
804 */
805 Heap_chainWeight(H, he, qd, ide->DocNum + 1, &ide->Weight);
806 /*
807 *he->OrgWeight = he->Weight;
808 he->DocNum = ide->DocNum + 1;
809 he->Weight = ide->Sum;
810 he->OrgWeight = &ide->Sum;
811 *he->OrgWeight = 0;
812 Heap_Heapify (H, 1);
813 */
814 }
815 }
816 break;
817 }
818
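  /* Exact re-ranking pass: when rqi->Exact is set (and the index is not a
     paragraph-level one), each heap entry's approximate weight is rescaled
     to its exact value -- multiply back the lower approximation, divide by
     the exact weight fetched along with the document start -- the heap is
     rebuilt, and every accumulator entry that never made it into the heap
     gets one more chance to enter via Heap_plainWeight. */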
819 if (rqi->Exact && qd->id->ifh.InvfLevel != 3)
820 {
821 HeapEntry *he = H->HE;
822
823 for (i = 0; i < H->NumItems; i++, he++)
824 {
825 he->Weight = he->docWeight() *
826 GetLowerApproxDocWeight (qd->awd, he->docNum() - 1) /
827 he->FetchStart (qd);
828 }
829
830 Heap_Build (H);
831
832 he = H->HE;
833
834 switch (rqi->AccumMethod)
835 {
836 case 'S':
837 {
838 ide = (Invf_Doc_Entry *) SP_get_first (ST);
839 for (i = 0; i < ST->no_of_items; i++, ide = (Invf_Doc_Entry *) SP_get_next (ST))
840 {
841 /*
842 u_long SeekPos, Len;
843 float Weight;
844 if (!ide->Sum)
845 continue;
846 if (ide->Sum <= he->Weight)
847 continue;
848 Weight = ide->Sum *
849 GetLowerApproxDocWeight (qd->awd, ide->DocNum) /
850 FetchDocStart (qd, ide->DocNum + 1, &SeekPos, &Len);
851 if (Weight <= he->Weight)
852 continue;
853 */
854 if (Heap_plainWeight(H, he, qd, ide->DocNum + 1, &ide->Weight) == 0)
855 continue;
856 /*
857 he->DocNum = ide->DocNum + 1;
858 he->OrgWeight = &ide->Sum;
859 he->Weight = Weight;
860 he->SeekPos = SeekPos;
861 he->Len = Len;
862 ide->Sum = 0;
863 Heap_Heapify (H, 1);
864 */
865 }
866 }
867 break;
868
869 /* up to here */
870
871 case 'A':
872 {
873 fptr = AccumulatedWeights;
874 for (i = 0; i < qd->sd->sdh.num_of_docs; i++, fptr++)
875 {
876 /*
877 u_long SeekPos, Len;
878 float Weight;
879 if (!*fptr)
880 continue;
881 if (*fptr <= he->Weight)
882 continue;
883 Weight = *fptr *
884 GetLowerApproxDocWeight (qd->awd, i) /
885 FetchDocStart (qd, i + 1, &SeekPos, &Len);
886 if (Weight <= he->Weight)
887 continue;
888 */
889 if (Heap_plainWeight(H, he, qd, i+1, fptr) == 0)
890 continue;
891 /*
892 he->DocNum = i + 1;
893 he->OrgWeight = fptr;
894 he->Weight = Weight;
895 he->SeekPos = SeekPos;
896 he->Len = Len;
897 *fptr = 0;
898 Heap_Heapify (H, 1);
899 */
900 }
901 }
902 break;
903 case 'H':
904 {
905 ideh = HT->Head;
906 for (ideh = HT->Head; ideh; ideh = ideh->next)
907 {
908 /*
909 u_long SeekPos, Len;
910 float Weight;
911 if (!ideh->Sum)
912 continue;
913 if (ideh->Sum <= he->Weight)
914 continue;
915
916 Weight = ideh->Sum *
917 GetLowerApproxDocWeight (qd->awd, ideh->DocNum) /
918 FetchDocStart (qd, ideh->DocNum + 1, &SeekPos, &Len);
919 if (Weight <= he->Weight)
920 continue;
921 */
922 if (Heap_plainWeight(H, he, qd, ideh->docNum() + 1, &ideh->Weight) == 0)
923 continue;
924 /*
925 he->DocNum = ideh->DocNum + 1;
926 he->OrgWeight = &ideh->Sum;
927 he->Weight = Weight;
928 he->SeekPos = SeekPos;
929 he->Len = Len;
930 ideh->Sum = 0;
931 Heap_Heapify (H, 1);
932 */
933 }
934 }
935 break;
936 case 'L':
937 {
938 ide = LT->IDE;
939 for (i = 0; i < LT->num; i++, ide++)
940 {
941 /*u_long SeekPos, Len;
942 float Weight;
943 if (!ide->Sum)
944 continue;
945 if (ide->Sum <= he->Weight)
946 continue;
947
948 Weight = ide->Sum *
949 GetLowerApproxDocWeight (qd->awd, ide->DocNum) /
950 FetchDocStart (qd, ide->DocNum + 1, &SeekPos, &Len);
951 if (Weight <= he->Weight)
952 continue;
953 */
954 if (Heap_plainWeight(H, he, qd, ide->DocNum+1, &ide->Weight) == 0)
955 continue;
956 /*
957 he->DocNum = ide->DocNum + 1;
958 he->OrgWeight = &ide->Sum;
959 he->Weight = Weight;
960 he->SeekPos = SeekPos;
961 he->Len = Len;
962 ide->Sum = 0;
963 Heap_Heapify (H, 1);
964 */
965 }
966 }
967 break;
968 }
969 }
970
971 H->HC = Heap_Greater;
972 Heap_Build (H);
973
974
975 MaxDocs = H->NumItems;
976 if (rqi->MaxDocsToRetrieve != -1 && MaxDocs > rqi->MaxDocsToRetrieve)
977 MaxDocs = rqi->MaxDocsToRetrieve;
978
979 /* Alarm */
980
981 he = H->HE;
982 BackEnd = H->NumItems - 1;
983 NumExact = 0;
984 MaxExact = H->NumItems;
985 NumParas = 0;
986 Para_ST = SP_createset (DocEntry_compare);
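  /* Output loop: repeatedly take the best remaining heap entry.  With a
     paragraph-level index several entries can resolve to the same document,
     so Para_ST is used to spot repeats; a repeat is stored from the back of
     Docs (BackEnd) and chained onto the already-retrieved DocEntry through
     its Next field instead of counting as a new document.  When rqi->Exact
     is set, Make_Exact_Root keeps converting the root to its exact weight
     until the root is an entry whose document start has been fetched. */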
987 while (H->NumItems && Docs->size() < MaxDocs)
988 {
989 DocEntry DocEnt;
990 DocEntry *mem;
991
992 if (rqi->Exact)
993 {
994 if (H->HE[0].SeekPos == 0)
995 NumExact += Make_Exact_Root (qd, H);
996 }
997 else
998 he->FetchStart (qd);
999
1000 NumParas++;
1001
1002 DocEnt.DocNum = he->DocNum;
1003 DocEnt.Weight = he->Weight;
1004 DocEnt.Len = he->Len;
1005 DocEnt.SeekPos = he->SeekPos;
1006 DocEnt.CompTextBuffer = NULL;
1007 DocEnt.Next = NULL;
1008
1009 Heap_DeleteHead (H);
1010
1011 if (!(mem = (DocEntry *) SP_member (&DocEnt, Para_ST)))
1012 {
1013 Docs->addMember(&DocEnt);
1014 SP_insert (Docs->lastMember(), Para_ST);
1015 }
1016 else
1017 {
1018 DocEnt.Next = DocEntry_getNext(mem);
1019 Docs->setMember(BackEnd, &DocEnt);
1020 DocEntry_setNext(mem, Docs->member(BackEnd--));
1021 }
1022 }
1023 SP_freeset (Para_ST);
1024
1025 if (qd->id->ifh.InvfLevel == 3)
1026 {
1027 Message ("%d Paragraphs were required to get %d documents",
1028 NumParas, Docs->size());
1029 if (NumExact == MaxExact)
1030 {
1031 Message ("The exact weights of all %d paragraphs had to be calculated", MaxExact);
1032 Message ("to obtain %d paragraphs. This may mean that the documents", NumParas);
1033 Message ("returned do not necessarily represent an exact cosine ranking.");
1034 Message ("This problem may be corrected by increasing \'maxparas\'.");
1035 }
1036 }
1037#if 0
1038 {
1039 int i;
1040 FILE *f = fopen ("top.paras", "w");
1041 fprintf (f, "=========================\nTop Paragraphs\n");
1042 for (i = 0; i < Docs->size(); i++)
1043 {
1044 DocEntry *d;
1045 fprintf (f, "<%d(%f)> ", Heap[i].DocNum, Heap[i].Weight);
1046 for (d = Heap[i].Next; d; d = d->Next)
1047 fprintf (f, "%d(%f) ", d->docNum(), d->docWeight());
1048 fprintf (f, "\n");
1049 }
1050 fprintf (f, "=========================\n");
1051 fclose (f);
1052 }
1053#endif
1054
1055 if (AccumulatedWeights)
1056 {
1057 Xfree (AccumulatedWeights);
1058 ChangeMemInUse (qd, -sizeof (float) * qd->sd->sdh.num_of_docs);
1059 }
1060 if (ST)
1061 {
1062 int mem = ST->mem_in_use;
1063 SP_freeset (ST);
1064 ChangeMemInUse (qd, -mem);
1065 free_ide_pool (qd, &ide_pool);
1066 }
1067 if (HT)
1068 HT_free (qd, HT);
1069
1070 if (LT)
1071 LT_free (qd, LT);
1072
1073 if (H) delete H; /* [RJM 07/98: Memory Leak] */
1074
1075 return (Docs);
1076}
1077
1078
1079
1080
1081/* if MaxDocs == -1 it means all */
1082void
1083RankedQuery (query_data *qd, char *Query, RankedQueryInfo *rqi)
1084{
1085 DocList *dl;
1086
1087 if (qd->TL)
1088 TermList_destroy (&(qd->TL));
1089
1090 /* [RPAP - Feb 97: Term Frequency] */
1091 if (qd->QTL)
1092 QueryTermList_free (&(qd->QTL));
1093
1094 qd->TL = ParseRankedQuery (qd, Query, rqi, &(qd->QTL)); /* [RPAP - Feb 97: Term Frequency] */
1095
1096 /* TermList_print (qd->TL, stderr); */
1097
1098 dl = CosineGet (qd, qd->TL, rqi);
1099
1100 if (!dl)
1101 FatalError (1, "Out of memory\n");
1102
1103 QueryData_FreeQueryDocs (qd);
1104
1105 qd->DL = dl;
1106}
1107
1108
1109
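/*
 * A toy, self-contained illustration of the data flow RankedQuery drives:
 * query terms are turned into per-document sums, which are then normalised
 * by document weight and reported best-first.  It uses a plain std::sort in
 * place of the heap machinery above, made-up postings, and none of the mgpp
 * types -- a sketch of the shape of the computation only.
 */
#include <stdio.h>
#include <algorithm>

struct Posting { int doc; float wdt; };        /* document number, term weight */
struct Ranked  { int doc; float weight; };

static bool better (const Ranked &a, const Ranked &b) { return a.weight > b.weight; }

int main ()
{
  /* Made-up postings for two query terms, and per-document weights (1-based). */
  const Posting term1[] = { {1, 0.5f}, {3, 0.8f}, {4, 0.2f} };
  const Posting term2[] = { {2, 0.6f}, {3, 0.4f} };
  const float wqt1 = 1.0f, wqt2 = 2.0f;        /* query-term weights (Count)   */
  const float W[5] = { 0, 1.2f, 0.9f, 1.5f, 0.7f };

  /* Accumulate S_d -- the 'A' (array) accumulator method in miniature. */
  float S[5] = { 0 };
  for (int i = 0; i < 3; i++) S[term1[i].doc] += wqt1 * term1[i].wdt;
  for (int i = 0; i < 2; i++) S[term2[i].doc] += wqt2 * term2[i].wdt;

  /* Normalise by document weight and report best-first. */
  Ranked r[4];
  int n = 0;
  for (int d = 1; d <= 4; d++)
    if (S[d] > 0) { r[n].doc = d; r[n].weight = S[d] / W[d]; n++; }
  std::sort (r, r + n, better);
  for (int i = 0; i < n; i++)
    printf ("doc %d  weight %.3f\n", r[i].doc, r[i].weight);
  return 0;
}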