source: gsdl/trunk/trunk/mg/src/text/query.ranked.c@ 16583

Last change on this file since 16583 was 16583, checked in by davidb, 16 years ago

Undoing change committed in r16582

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 23.4 KB
Line 
1/**************************************************************************
2 *
3 * query.ranked.c -- Ranked query evaluation
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: query.ranked.c 16583 2008-07-29 10:20:36Z davidb $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25
26#include "memlib.h"
27#include "filestats.h"
28#include "messages.h"
29#include "timing.h"
30#include "sptree.h"
31
32#include "mg.h"
33#include "invf.h"
34#include "text.h"
35#include "lists.h"
36#include "backend.h"
37#include "stem_search.h"
38#include "weights.h"
39#include "text_get.h"
40#include "invf_get.h"
41#include "words.h"
42#include "stemmer.h"
43#include "locallib.h"
44#include "environment.h"
45#include "term_lists.h"
46#include "local_strings.h"
47#include "query_term_list.h" /* [RPAP - Feb 97: Term Frequency] */
48
49/*
50 $Log$
51 Revision 1.1 2003/02/20 21:18:24 mdewsnip
52 Addition of MG package for search and retrieval
53
54 Revision 1.1 1999/08/10 21:18:20 sjboddie
55 renamed mg-1.3d directory mg
56
57 Revision 1.2 1998/11/25 07:55:50 rjmcnab
58
59 Modified mg to that you can specify the stemmer you want
60 to use via a command line option. You specify it to
61 mg_passes during the build process. The number of the
62 stemmer that you used is stored within the inverted
63 dictionary header and the stemmed dictionary header so
64 the correct stemmer is used in later stages of building
65 and querying.
66
67 Revision 1.1 1998/11/17 09:35:34 rjmcnab
68 *** empty log message ***
69
70 * Revision 1.4 1994/11/25 03:47:46 tes
71 * Committing files before adding the merge stuff.
72 *
73 * Revision 1.3 1994/10/20 03:57:03 tes
74 * I have rewritten the boolean query optimiser and abstracted out the
75 * components of the boolean query.
76 *
77 * Revision 1.2 1994/09/20 04:42:04 tes
78 * For version 1.1
79 *
80 */
81
/* RCS identification string; kept so ident(1) can find the revision in the binary. */
static char *RCSID = "$Id: query.ranked.c 16583 2008-07-29 10:20:36Z davidb $";
83
84/*************************************************************************/
85
/* One candidate document in the ranking heap. */
typedef struct HeapEntry
  {
    float Weight;		/* current ranking weight (approximate until made exact) */
    float *OrgWeight;		/* pointer back to the accumulator this weight came from */
    int DocNum;			/* document number, 1-origin */
    u_long SeekPos;		/* position in the text file in bytes */
    u_long Len;			/* length of the document in bytes */
  }
HeapEntry;

/* Heap ordering predicate: returns <0, 0 or >0 like strcmp. */
typedef int (*HeapComp) (HeapEntry *, HeapEntry *);

typedef struct Heap
  {
    int NumItems;		/* number of live entries in HE[] */
    int MaxSize;		/* allocated capacity of HE[] */
    HeapComp HC;		/* comparison function defining the heap order */
    HeapEntry HE[1];		/* struct hack: actually MaxSize entries (see Heap_Make) */
  }
Heap;
106
107
108Heap *
109Heap_Make (int size, HeapComp hc)
110{
111 Heap *H;
112 H = Xmalloc (sizeof (Heap) + (size - 1) * sizeof (HeapEntry));
113 if (!H)
114 return NULL;
115 H->NumItems = 0;
116 H->MaxSize = size;
117 H->HC = hc;
118 return H;
119}
120
121int
122Heap_Size (Heap * H)
123{
124 return sizeof (Heap) + H->MaxSize * sizeof (HeapEntry);
125}
126
127HeapEntry *
128Heap_GetHead (Heap * H)
129{
130 if (H && H->NumItems)
131 return &H->HE[0];
132 else
133 return NULL;
134}
135
136
137
/* Sift the entry at 1-origin position i down the heap until the heap
   property (parent ordered before children under HC) is restored.
   Positions are 1-origin while the HE[] array is 0-origin, hence the
   pervasive "- 1" adjustments. */
void
Heap_Heapify (Heap * H, int i)
{
  register int curr, child;
  curr = i;
  child = curr * 2;		/* left child of curr, 1-origin */
  while (child <= H->NumItems)
    {
      /* Pick whichever of the two children orders first under HC. */
      if (child < H->NumItems && H->HC (&H->HE[child], &H->HE[child - 1]) > 0)
	child++;
      if (H->HC (&H->HE[curr - 1], &H->HE[child - 1]) < 0)
	{
	  /* Child orders before parent: swap and continue downwards. */
	  HeapEntry temp = H->HE[child - 1];
	  H->HE[child - 1] = H->HE[curr - 1];
	  H->HE[curr - 1] = temp;
	  curr = child;
	  child = 2 * child;
	}
      else
	break;			/* heap property holds; done */
    }
}
160
161
162void
163Heap_Build (Heap * H)
164{
165 register int i;
166 for (i = H->NumItems / 2; i > 0; i--)
167 Heap_Heapify (H, i);
168}
169
170
171void
172Heap_Sort (Heap * H)
173{
174 register int i;
175 for (i = H->NumItems; i > 1; i--)
176 {
177 HeapEntry temp = H->HE[0];
178 H->HE[0] = H->HE[i - 1];
179 H->HE[i - 1] = temp;
180 H->NumItems--;
181 Heap_Heapify (H, 1);
182 }
183}
184
185
186void
187Heap_DeleteHead (Heap * H)
188{
189 H->HE[0] = H->HE[--H->NumItems];
190 Heap_Heapify (H, 1);
191}
192
193int
194Heap_Lesser (HeapEntry * a, HeapEntry * b)
195{
196 return (a->Weight > b->Weight ? -1 :
197 (a->Weight == b->Weight ? 0 : 1));
198}
199
200
201int
202Heap_Greater (HeapEntry * a, HeapEntry * b)
203{
204 return (a->Weight > b->Weight ? 1 :
205 (a->Weight == b->Weight ? 0 : -1));
206}
207
/* Turn the heap root's approximate weight into an exact one.  While the
   root entry has not yet had its document start fetched (SeekPos == 0),
   rescale its weight by the ratio of the lower-approximation document
   weight to the value returned by FetchDocStart (which also fills in
   SeekPos/Len as a side effect), then re-heapify so a different entry
   may surface.  Returns the number of entries made exact. */
int
Make_Exact_Root (query_data * qd, Heap * H)
{
  int num = 0;
  HeapEntry *he = H->HE;
  while (he->SeekPos == 0)
    {
      he->Weight = he->Weight *
	GetLowerApproxDocWeight (qd->awd, he->DocNum - 1) /
	FetchDocStart (qd, he->DocNum, &he->SeekPos, &he->Len);
      Heap_Heapify (H, 1);
      num++;
    }
  return num;
}
223
224void
225Heap_Dump (Heap * H, int num)
226{
227 int i, l, lines, r;
228 if (num > H->NumItems)
229 num = H->NumItems;
230 lines = (num + 3) / 4;
231 for (l = 0; l < lines; l++)
232 for (r = 0; r < 4; r++)
233 {
234 i = lines * r + l;
235 if (i < num)
236 fprintf (stderr, "[%2d] %7.4f", i, H->HE[i].Weight);
237 fprintf (stderr, r == 3 ? "\n" : " ");
238 }
239 fprintf (stderr, "\n");
240}
241
242/*************************************************************************/
243
244
245
246
247
248
249int
250doc_count_comp (const void *A, const void *B)
251{
252 const TermEntry *a = A;
253 const TermEntry *b = B;
254 return (a->WE.doc_count - b->WE.doc_count);
255}
256
257/* =========================================================================
258 * Function: ParseRankedQuery
259 * Description:
260 * Takes a string query line and extracts the terms in the query
261 * which exist in the stemmed dictionary.
262 * Optionally sorts the terms into ascending order by doc. number.
263 * Input:
264 * stemmed dictionary, query line, sort-flag
265 * Output:
266 * A list of terms.
267 * ========================================================================= */
268
269/* [RPAP - Jan 97: Stem Index Change] */
/* [RPAP - Jan 97: Stem Index Change] */
static TermList *
ParseRankedQuery (stemmed_dict * sd, char *QueryLine, int Sort, int indexed,
		  QueryTermList **query_term_list)	/* [RPAP - Feb 97: Term Frequency] */
{
  u_char Word[MAXSTEMLEN + 1];	/* raw query word */
  u_char sWord[MAXSTEMLEN + 1];	/* stemmed copy of Word */
  u_char *end, *s_in;		/* scan pointer and last byte of the query line */
  int default_stem_method = 0;
  TermList *Terms = MakeTermList(0);

  s_in = (u_char *) QueryLine;
  end = s_in + strlen ((char *) s_in) - 1;
  *query_term_list = MakeQueryTermList(0);	/* [RPAP - Feb 97: Term Frequency] */

  /* For an indexed (partially stemmed) dictionary the default stem
     method is built from the "casefold" and "stem" environment flags;
     otherwise the dictionary header dictates it. */
  if (indexed)
    default_stem_method = BooleanEnv (GetEnv ("casefold"), 0) | (BooleanEnv (GetEnv ("stem"), 0) << 1);
  else
    default_stem_method = sd->sdh.stem_method;

  while (s_in <= end)
    {
      int j;
      long num_entries, word_num;
      unsigned long count, doc_count, invf_ptr, invf_len;
      int weight_to_apply, stem_to_apply;
      int method_using = -1;

      /* 0=optional, 1=mustmatch */
      int require_match = 0;	/* [RJM 07/97: Ranked Required Terms] */

      /* Skip over the non word separator taking note of any parameters */
      PARSE_RANKED_NON_STEM_WORD (require_match, s_in, end);	/* [RJM 07/97: Ranked Required Terms] */
      if (s_in > end) break;

      /* Get a word and stem it */
      PARSE_STEM_WORD (Word, s_in, end);

      /* Extract any parameters (per-term weight and stem method) */
      weight_to_apply = 1;
      stem_to_apply = default_stem_method;
      while (s_in <= end)
	{
	  int stem_param, weight_param, param_type;
	  char param[MAXPARAMLEN + 1];

	  param_type = 0;
	  PARSE_OPT_TERM_PARAM (param, param_type, s_in, end);
	  if (!param_type)
	    break;

	  switch (param_type)
	    {
	    case (WEIGHTPARAM):
	      /* NOTE(review): atoi() never sets errno; this ERANGE test
	         presumably relies on errno being clean from earlier
	         calls — verify, or switch to strtol. */
	      weight_param = atoi (param);
	      if (errno != ERANGE && weight_param > 0)
		weight_to_apply = weight_param;
	      break;

	    case (STEMPARAM):
	      /* Per-term stem override only honoured for indexed dicts. */
	      stem_param = atoi (param);
	      if (errno != ERANGE && indexed && stem_param >= 0 && stem_param <= 3)
		method_using = stem_to_apply = stem_param;
	      break;
	    }
	}

      /* Word appears to be length-prefixed (*Word holds the length, so
         *Word + 1 bytes are copied) — keep a pristine copy and stem it. */
      bcopy ((char *) Word, (char *) sWord, *Word + 1);
      stemmer (stem_to_apply, sd->sdh.stemmer_num, sWord);

      if (!indexed || stem_to_apply == 0)
	{
	  /* Look for the word in the already identified terms */
	  for (j = 0; j < Terms->num; j++)
	    if (compare (Terms->TE[j].Word, Word) == 0)
	      break;

	  /* Increment the weight if the word is in the list */
	  /* Update the require match attribute */
	  if (j < Terms->num)
	    {
	      /* Saturate the accumulated weight at INT_MAX. */
	      Terms->TE[j].Count = ((Terms->TE[j].Count + weight_to_apply > INT_MAX) ?
				    INT_MAX : (Terms->TE[j].Count + weight_to_apply));
	      Terms->TE[j].require_match = require_match;	/* [RJM 07/97: Ranked Require match] */
	      AddQueryTerm (query_term_list, Word, Terms->TE[j].WE.count, method_using);	/* [RPAP - Feb 97: Term Frequency] */
	    }
	  else
	    {
	      /* Look for it in the stemmed dictionary */
	      if ((word_num = FindWord (sd, sWord, &count, &doc_count,
					&invf_ptr, &invf_len)) != -1)
		{
		  /* Search the list for the word */
		  for (j = 0; j < Terms->num; j++)
		    if (Terms->TE[j].WE.word_num == word_num)
		      break;

		  /* Increment the weight if the word is in the list */
		  if (j < Terms->num)
		    {
		      Terms->TE[j].Count = ((Terms->TE[j].Count + weight_to_apply > INT_MAX) ?
					    INT_MAX : (Terms->TE[j].Count + weight_to_apply));
		      Terms->TE[j].require_match = require_match;	/* [RJM 07/97: Ranked Require match] */
		      AddQueryTerm (query_term_list, Word, Terms->TE[j].WE.count, method_using);	/* [RPAP - Feb 97: Term Frequency] */
		    }
		  else
		    {
		      /* Create a new entry in the list for the new word */
		      TermEntry te;

		      te.WE.word_num = word_num;
		      te.WE.count = count;
		      te.WE.doc_count = doc_count;
		      te.WE.max_doc_count = doc_count;
		      te.WE.invf_ptr = invf_ptr;
		      te.WE.invf_len = invf_len;
		      te.Count = weight_to_apply;
		      te.Word = copy_string (Word);
		      if (!te.Word)
			FatalError (1, "Could NOT create memory to add term");
		      te.Stem = NULL;
		      te.require_match = require_match;

		      AddTermEntry (&Terms, &te);

		      /* [RPAP - Feb 97: Term Frequency] */
		      AddQueryTerm (query_term_list, Word, count, method_using);
		    }
		}
	      /* [RPAP - Feb 97: Term Frequency] */
	      else
		/* Not in the dictionary at all: record it with zero frequency. */
		AddQueryTerm (query_term_list, Word, 0, method_using);
	    }
	}
      else
	{
	  /* Indexed dictionary with a non-zero stem method: one query
	     word may expand into several dictionary terms. */
	  int total_count = 0;	/* [RPAP - Feb 97: Term Frequency] */
	  TermList *tempList = MakeTermList (0);
	  if ((num_entries = FindWords (sd, sWord, stem_to_apply, &tempList)) > 0)
	    {
	      int i;
	      u_long max_doc_count = 0;

	      /* get the maximum doc count */
	      for (i = 0; i < tempList->num; i++)
		{
		  if (tempList->TE[i].WE.doc_count > max_doc_count)
		    max_doc_count = tempList->TE[i].WE.doc_count;
		  total_count += tempList->TE[i].WE.count;	/* [RPAP - Feb 97: Term Frequency] */
		}

	      for (i = 0; i < tempList->num; i++)
		{
		  /* Look for the word(s) in the already identified terms */
		  for (j = 0; j < Terms->num; j++)
		    {
		      if (compare (Terms->TE[j].Word, tempList->TE[i].Word) == 0)
			{
			  /* found the word */
			  /* Modify weight */
			  Terms->TE[j].Count = ((Terms->TE[j].Count + weight_to_apply > INT_MAX) ?
						INT_MAX : (Terms->TE[j].Count + weight_to_apply));
			  if (Terms->TE[j].WE.max_doc_count < max_doc_count)
			    Terms->TE[j].WE.max_doc_count = max_doc_count;
			  break;
			}
		    }

		  if (j == Terms->num)
		    {
		      /* word was not found */
		      tempList->TE[i].WE.max_doc_count = max_doc_count;
		      tempList->TE[i].Count = weight_to_apply;

		      /* We cannot require a term to match if it is expanded */
		      /* into multiple terms :-( */
		      tempList->TE[i].require_match = 0;	/* [RJM 07/97: Ranked Required Terms] */

		      AddTermEntry (&Terms, &tempList->TE[i]);
		    }
		}
	    }
	  /* [RPAP - Feb 97: Term Frequency] */
	  AddQueryTerm (query_term_list, Word, total_count, method_using);

	  if (tempList != NULL) Xfree(tempList);	/* [RJM 07/98: Memory Leak] */
	}			/* end indexed */
    }				/* end while */

  if (Sort)
    /* Sort the terms in ascending order by doc_count */
    qsort (Terms->TE, Terms->num, sizeof (TermEntry), doc_count_comp);
  return (Terms);
}
463
464
465
466
467static int
468DE_comp (void *a, void *b)
469{
470 return ((DocEntry *) a)->SeekPos - ((DocEntry *) b)->SeekPos;
471}
472
473
474
475
476
477/*
478 * This function is given a list of term numbers and it returns a list
479 * of document numbers based on the cosine document weighting system.
480 * This puts the entries in an array.
481 * inverted file.
482 * If MaxDocs == -1 then it means all
483 */
/*
 * This function is given a list of term numbers and it returns a list
 * of document numbers based on the cosine document weighting system.
 * This puts the entries in an array.
 * inverted file.
 * If MaxDocs == -1 then it means all
 *
 * Overall shape: (1) decode the inverted lists into per-document
 * accumulators using one of four accumulator structures selected by
 * rqi->AccumMethod ('S' splay tree, 'A' flat array, 'H' hash table,
 * 'L' list table); (2) load the first MaxParas accumulators into a
 * min-heap and let the remaining accumulators compete against the heap
 * root; (3) optionally refine approximate weights into exact ones;
 * (4) pop the heap into the result DocList, folding duplicate seek
 * positions (paragraphs of the same document) together.
 */
static DocList *
CosineGet (query_data * qd, TermList * Terms, RankedQueryInfo * rqi) {
  DocList *Docs;
  float *AccumulatedWeights = NULL;	/* 'A' method accumulators */
  Splay_Tree *ST = NULL;		/* 'S' method accumulators */
  Splay_Tree *Para_ST = NULL;		/* dedup set keyed on SeekPos */
  Hash_Table *HT = NULL;		/* 'H' method accumulators */
  List_Table *LT = NULL;		/* 'L' method accumulators */
  Heap *H;
  HeapEntry *he;
  register float *fptr = NULL;
  register Invf_Doc_Entry *ide = NULL;
  register Invf_Doc_EntryH *ideh = NULL;
  int BackEnd, NumExact, MaxExact, NumParas;
  int MaxDocs = 0, MaxParas = 0;
  int i;
  Invf_Doc_Entry_Pool ide_pool;
  ide_pool.pool = NULL;

  qd->hops_taken = qd->num_of_ptrs = qd->num_of_accum = 0;

  /* Phase 1: decode the inverted file into the chosen accumulator
     structure.  A NULL result is propagated to the caller. */
  switch (rqi->AccumMethod)
    {
    case 'S':
      ST = CosineDecodeSplay (qd, Terms, rqi, &ide_pool);
      if (!ST)
	return NULL;
      break;
    case 'A':
      AccumulatedWeights = CosineDecode (qd, Terms, rqi);
      if (!AccumulatedWeights)
	return NULL;
      break;
    case 'H':
      HT = CosineDecodeHash (qd, Terms, rqi);
      if (!HT)
	return NULL;
      break;
    case 'L':
      LT = CosineDecodeList (qd, Terms, rqi);
      if (!LT)
	return NULL;
      break;
    }

#if 0
  if (rqi->UseSplayTree)
    {

      AccumulatedWeights = CosineDecode (qd, Terms, rqi);
      fptr = AccumulatedWeights;
      ide = SP_get_first (ST);
      for (i = 0; i < qd->sd->sdh.num_of_docs; i++)
	{
	  if (AccumulatedWeights[i] != 0)
	    {
	      if (i != ide->DocNum)
		fprintf (stderr, "Sum mismatch for %d %f %d %f\n", i + 1,
			 AccumulatedWeights[i], ide->DocNum + 1, ide->Sum);
	      ide = SP_get_next (ST);
	    }
	}
    }
#endif

  /* Determine how many paragraphs have non-zero accumulators. */
  switch (rqi->AccumMethod)
    {
    case 'S':
      MaxParas = ST->no_of_items;
      break;
    case 'A':
      {				/* count the number of non-zero document weights */
	register int i = qd->sd->sdh.num_of_docs;
	register float *d;
	MaxParas = 0;
	for (d = AccumulatedWeights; i; i--, d++)
	  if (*d)
	    MaxParas++;
      }
      break;
    case 'H':
      MaxParas = HT->num + HT->Suplimentary_Num;
      break;
    case 'L':
      MaxParas = LT->num;
      break;
    }

  /* Cap the heap size at the configured limit (-1 means unlimited). */
  if (rqi->MaxParasToRetrieve != -1 && MaxParas > rqi->MaxParasToRetrieve)
    MaxParas = rqi->MaxParasToRetrieve;
  MaxDocs = MaxParas;

  /* Allocate memory for the heap */
  Docs = MakeDocList (MaxDocs);
  ChangeMemInUse (qd, sizeof (DocEntry) * MaxDocs);

  H = Heap_Make (MaxDocs, Heap_Lesser);


  /* Get the sums from the array divide the sums by the
     document weights which we retrieve from the ".idx.wgt" file and put
     the resulting data into a heap */


  /* Phase 2a: seed the heap with the first MaxDocs accumulators. */
  he = H->HE;
  H->NumItems = MaxDocs;
  switch (rqi->AccumMethod)
    {
    case 'S':
      {
	ide = SP_get_first (ST);
	for (i = 0; i < H->NumItems; i++, ide = SP_get_next (ST), he++)
	  {
	    he->DocNum = ide->DocNum + 1;
	    he->OrgWeight = &ide->Sum;
	    qd->num_of_accum++;
	  }
      }
      break;
    case 'A':
      {
	fptr = AccumulatedWeights;
	for (i = 0; i < H->NumItems; i++, fptr++, he++)
	  {
	    he->DocNum = i + 1;
	    he->OrgWeight = fptr;
	    if (*fptr)
	      qd->num_of_accum++;
	  }
      }
      break;
    case 'H':
      {
	ideh = HT->Head;
	for (i = 0; i < H->NumItems; i++, ideh = ideh->next, he++)
	  {
	    he->DocNum = ideh->IDE.DocNum + 1;
	    he->OrgWeight = &ideh->IDE.Sum;
	    qd->num_of_accum++;
	  }
      }
      break;
    case 'L':
      {
	ide = LT->IDE;
	for (i = 0; i < H->NumItems; i++, ide++, he++)
	  {
	    he->DocNum = ide->DocNum + 1;
	    he->OrgWeight = &ide->Sum;
	    qd->num_of_accum++;
	  }
      }
      break;
    }

  /* Normalise seeded weights by the approximate document weight and
     zero the accumulator (a zeroed accumulator marks "already taken"). */
  he = H->HE;
  for (i = 0; i < H->NumItems; i++, he++)
    {
      *he->OrgWeight /= GetLowerApproxDocWeight (qd->awd, he->DocNum - 1);
      he->Weight = *he->OrgWeight;
      *he->OrgWeight = 0;
      he->SeekPos = he->Len = 0;
    }

  Heap_Build (H);

  /* Phase 2b: let every remaining accumulator compete with the heap
     root; anything heavier displaces the current minimum. */
  he = H->HE;
  switch (rqi->AccumMethod)
    {
    case 'S':
      {
	for (i = MaxDocs; i < ST->no_of_items; i++, ide = SP_get_next (ST))
	  {
	    ide->Sum /= GetLowerApproxDocWeight (qd->awd, ide->DocNum);
	    qd->num_of_accum++;
	    if (ide->Sum <= he->Weight)
	      continue;
	    *he->OrgWeight = he->Weight;
	    he->DocNum = ide->DocNum + 1;
	    he->Weight = ide->Sum;
	    he->OrgWeight = &ide->Sum;
	    *he->OrgWeight = 0;
	    Heap_Heapify (H, 1);
	  }
      }
      break;
    case 'A':
      {
	for (i = MaxDocs; i < qd->sd->sdh.num_of_docs; i++, fptr++)
	  {
	    if (!*fptr)
	      continue;
	    qd->num_of_accum++;
	    *fptr /= GetLowerApproxDocWeight (qd->awd, i);
	    if (*fptr <= he->Weight)
	      continue;
	    *he->OrgWeight = he->Weight;
	    he->DocNum = i + 1;
	    he->Weight = *fptr;
	    he->OrgWeight = fptr;
	    *he->OrgWeight = 0;
	    Heap_Heapify (H, 1);
	  }
      }
      break;
    case 'H':
      {
	for (; ideh; ideh = ideh->next)
	  {
	    qd->num_of_accum++;
	    ideh->IDE.Sum /=
	      GetLowerApproxDocWeight (qd->awd, ideh->IDE.DocNum);
	    if (ideh->IDE.Sum <= he->Weight)
	      continue;
	    *he->OrgWeight = he->Weight;
	    he->DocNum = ideh->IDE.DocNum + 1;
	    he->Weight = ideh->IDE.Sum;
	    he->OrgWeight = &ideh->IDE.Sum;
	    *he->OrgWeight = 0;
	    Heap_Heapify (H, 1);
	  }
      }
      break;
    case 'L':
      {
	for (i = MaxDocs; i < LT->num; i++, ide++)
	  {
	    qd->num_of_accum++;
	    ide->Sum /=
	      GetLowerApproxDocWeight (qd->awd, ide->DocNum);
	    if (ide->Sum <= he->Weight)
	      continue;
	    *he->OrgWeight = he->Weight;
	    he->DocNum = ide->DocNum + 1;
	    he->Weight = ide->Sum;
	    he->OrgWeight = &ide->Sum;
	    *he->OrgWeight = 0;
	    Heap_Heapify (H, 1);
	  }
      }
      break;
    }


  /* Phase 3: exact re-ranking.  Replace approximate weights by exact
     ones (via FetchDocStart) for every heap entry, then give skipped
     accumulators a second chance against the exact root. */
  if (rqi->Exact && qd->id->ifh.InvfLevel != 3)
    {
      HeapEntry *he = H->HE;

      for (i = 0; i < H->NumItems; i++, he++)
	{
	  he->Weight = he->Weight *
	    GetLowerApproxDocWeight (qd->awd, he->DocNum - 1) /
	    FetchDocStart (qd, he->DocNum, &he->SeekPos, &he->Len);
	}

      Heap_Build (H);

      he = H->HE;

      switch (rqi->AccumMethod)
	{
	case 'S':
	  {
	    ide = SP_get_first (ST);
	    for (i = 0; i < ST->no_of_items; i++, ide = SP_get_next (ST))
	      {
		u_long SeekPos, Len;
		float Weight;
		if (!ide->Sum)
		  continue;
		if (ide->Sum <= he->Weight)
		  continue;
		Weight = ide->Sum *
		  GetLowerApproxDocWeight (qd->awd, ide->DocNum) /
		  FetchDocStart (qd, ide->DocNum + 1, &SeekPos, &Len);
		if (Weight <= he->Weight)
		  continue;
		he->DocNum = ide->DocNum + 1;
		he->OrgWeight = &ide->Sum;
		he->Weight = Weight;
		he->SeekPos = SeekPos;
		he->Len = Len;
		ide->Sum = 0;
		Heap_Heapify (H, 1);
	      }
	  }
	  break;

	  /* up to here */

	case 'A':
	  {
	    fptr = AccumulatedWeights;
	    for (i = 0; i < qd->sd->sdh.num_of_docs; i++, fptr++)
	      {
		u_long SeekPos, Len;
		float Weight;
		if (!*fptr)
		  continue;
		if (*fptr <= he->Weight)
		  continue;
		Weight = *fptr *
		  GetLowerApproxDocWeight (qd->awd, i) /
		  FetchDocStart (qd, i + 1, &SeekPos, &Len);
		if (Weight <= he->Weight)
		  continue;
		he->DocNum = i + 1;
		he->OrgWeight = fptr;
		he->Weight = Weight;
		he->SeekPos = SeekPos;
		he->Len = Len;
		*fptr = 0;
		Heap_Heapify (H, 1);
	      }
	  }
	  break;
	case 'H':
	  {
	    ideh = HT->Head;
	    for (ideh = HT->Head; ideh; ideh = ideh->next)
	      {
		u_long SeekPos, Len;
		float Weight;
		if (!ideh->IDE.Sum)
		  continue;
		if (ideh->IDE.Sum <= he->Weight)
		  continue;
		Weight = ideh->IDE.Sum *
		  GetLowerApproxDocWeight (qd->awd, ideh->IDE.DocNum) /
		  FetchDocStart (qd, ideh->IDE.DocNum + 1, &SeekPos, &Len);
		if (Weight <= he->Weight)
		  continue;
		he->DocNum = ideh->IDE.DocNum + 1;
		he->OrgWeight = &ideh->IDE.Sum;
		he->Weight = Weight;
		he->SeekPos = SeekPos;
		he->Len = Len;
		ideh->IDE.Sum = 0;
		Heap_Heapify (H, 1);
	      }
	  }
	  break;
	case 'L':
	  {
	    ide = LT->IDE;
	    for (i = 0; i < LT->num; i++, ide++)
	      {
		u_long SeekPos, Len;
		float Weight;
		if (!ide->Sum)
		  continue;
		if (ide->Sum <= he->Weight)
		  continue;
		Weight = ide->Sum *
		  GetLowerApproxDocWeight (qd->awd, ide->DocNum) /
		  FetchDocStart (qd, ide->DocNum + 1, &SeekPos, &Len);
		if (Weight <= he->Weight)
		  continue;
		he->DocNum = ide->DocNum + 1;
		he->OrgWeight = &ide->Sum;
		he->Weight = Weight;
		he->SeekPos = SeekPos;
		he->Len = Len;
		ide->Sum = 0;
		Heap_Heapify (H, 1);
	      }
	  }
	  break;
	}
    }


  /* Phase 4: flip the comparator to a max-heap and pop the winners
     into the result list in descending weight order. */

  H->HC = Heap_Greater;
  Heap_Build (H);


  MaxDocs = H->NumItems;
  if (rqi->MaxDocsToRetrieve != -1 && MaxDocs > rqi->MaxDocsToRetrieve)
    MaxDocs = rqi->MaxDocsToRetrieve;

  /* Alarm */

  /* Para_ST dedups on SeekPos: paragraphs sharing a document start are
     chained off the first occurrence and stored at the tail (BackEnd)
     of the DE[] array instead of counting as new documents. */
  he = H->HE;
  BackEnd = H->NumItems - 1;
  NumExact = 0;
  MaxExact = H->NumItems;
  NumParas = 0;
  Para_ST = SP_createset (DE_comp);
  while (H->NumItems && Docs->num < MaxDocs)
    {
      DocEntry DocEnt;
      DocEntry *mem;

      if (rqi->Exact)
	{
	  if (H->HE[0].SeekPos == 0)
	    NumExact += Make_Exact_Root (qd, H);
	}
      else
	FetchDocStart (qd, he->DocNum, &he->SeekPos, &he->Len);

      NumParas++;

      DocEnt.DocNum = he->DocNum;
      DocEnt.Weight = he->Weight;
      DocEnt.Len = he->Len;
      DocEnt.SeekPos = he->SeekPos;
      DocEnt.CompTextBuffer = NULL;
      DocEnt.Next = NULL;

      Heap_DeleteHead (H);

      if (!(mem = SP_member (&DocEnt, Para_ST)))
	{
	  Docs->DE[Docs->num] = DocEnt;
	  SP_insert (&Docs->DE[Docs->num], Para_ST);
	  Docs->num++;
	}
      else
	{
	  DocEnt.Next = mem->Next;
	  Docs->DE[BackEnd] = DocEnt;
	  mem->Next = &Docs->DE[BackEnd--];
	}
    }
  SP_freeset (Para_ST);

  if (qd->id->ifh.InvfLevel == 3)
    {
      Message ("%d Paragraphs were required to get %d documents",
	       NumParas, Docs->num);
      if (NumExact == MaxExact)
	{
	  Message ("The exact weights of all %d paragraphs had to be calculated", MaxExact);
	  Message ("to obtain %d paragraphs. This may mean that the the documents", NumParas);
	  Message ("returned do not necessarly represent an exact cosine ranking.");
	  Message ("This problem may be corrected by increasing \'maxparas\'.");
	}
    }
#if 0
  {
    int i;
    FILE *f = fopen ("top.paras", "w");
    fprintf (f, "=========================\nTop Paragraphs\n");
    for (i = 0; i < Docs->num; i++)
      {
	DocEntry *d;
	fprintf (f, "<%d(%f)> ", Heap[i].DocNum, Heap[i].Weight);
	for (d = Heap[i].Next; d; d = d->Next)
	  fprintf (f, "%d(%f) ", d->DocNum, d->Weight);
	fprintf (f, "\n");
      }
    fprintf (f, "=========================\n");
    fclose (f);
  }
#endif

  /* Release whichever accumulator structure was used. */
  if (AccumulatedWeights)
    {
      Xfree (AccumulatedWeights);
      ChangeMemInUse (qd, -sizeof (float) * qd->sd->sdh.num_of_docs);
    }
  if (ST)
    {
      int mem = ST->mem_in_use;
      SP_freeset (ST);
      ChangeMemInUse (qd, -mem);
      free_ide_pool (qd, &ide_pool);
    }
  if (HT)
    HT_free (qd, HT);

  if (LT)
    LT_free (qd, LT);

  if (H) Xfree (H);		/* [RJM 07/98: Memory Leak] */

  return (Docs);
}
964
965
966
967
968/* if MaxDocs == -1 it means all */
/* Top-level entry point for ranked querying: parse Query into a term
   list (qd->TL) and a term-frequency list (qd->QTL), run the cosine
   evaluation, and store the resulting document list in qd->DL.  Any
   previous term/query-term/document lists held in qd are freed first.
   if MaxDocs == -1 it means all */
void
RankedQuery (query_data *qd, char *Query, RankedQueryInfo *rqi)
{
  DocList *dl;

  if (qd->TL)
    FreeTermList (&(qd->TL));

  /* [RPAP - Feb 97: Term Frequency] */
  if (qd->QTL)
    FreeQueryTermList (&(qd->QTL));

  qd->TL = ParseRankedQuery (qd->sd, Query, rqi->Sort, qd->sd->sdh.indexed,	/* [RPAP - Jan 97: Stem Index Change] */
			     &(qd->QTL));	/* [RPAP - Feb 97: Term Frequency] */

  /* PrintTermList (qd->TL, stderr); */

  dl = CosineGet (qd, qd->TL, rqi);

  /* CosineGet returns NULL only when accumulator allocation failed. */
  if (!dl)
    FatalError (1, "Out of memory\n");

  /* Release previously retrieved documents before installing the new list. */
  FreeQueryDocs (qd);

  qd->DL = dl;
}
Note: See TracBrowser for help on using the repository browser.