source: trunk/gsdl/packages/mg/src/text/query.ranked.c@ 1014

Last change on this file since 1014 was 439, checked in by sjboddie, 25 years ago

renamed mg-1.3d directory mg

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 23.4 KB
Line 
1/**************************************************************************
2 *
3 * query.ranked.c -- Ranked query evaluation
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: query.ranked.c 439 1999-08-10 21:23:37Z sjboddie $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25
26#include "memlib.h"
27#include "filestats.h"
28#include "messages.h"
29#include "timing.h"
30#include "sptree.h"
31
32#include "mg.h"
33#include "invf.h"
34#include "text.h"
35#include "lists.h"
36#include "backend.h"
37#include "stem_search.h"
38#include "weights.h"
39#include "text_get.h"
40#include "invf_get.h"
41#include "words.h"
42#include "stemmer.h"
43#include "locallib.h"
44#include "environment.h"
45#include "term_lists.h"
46#include "local_strings.h"
47#include "query_term_list.h" /* [RPAP - Feb 97: Term Frequency] */
48
49/*
50 $Log$
51 Revision 1.1 1999/08/10 21:18:20 sjboddie
52 renamed mg-1.3d directory mg
53
54 Revision 1.2 1998/11/25 07:55:50 rjmcnab
55
56 Modified mg to that you can specify the stemmer you want
57 to use via a command line option. You specify it to
58 mg_passes during the build process. The number of the
59 stemmer that you used is stored within the inverted
60 dictionary header and the stemmed dictionary header so
61 the correct stemmer is used in later stages of building
62 and querying.
63
64 Revision 1.1 1998/11/17 09:35:34 rjmcnab
65 *** empty log message ***
66
67 * Revision 1.4 1994/11/25 03:47:46 tes
68 * Committing files before adding the merge stuff.
69 *
70 * Revision 1.3 1994/10/20 03:57:03 tes
71 * I have rewritten the boolean query optimiser and abstracted out the
72 * components of the boolean query.
73 *
74 * Revision 1.2 1994/09/20 04:42:04 tes
75 * For version 1.1
76 *
77 */
78
79static char *RCSID = "$Id: query.ranked.c 439 1999-08-10 21:23:37Z sjboddie $";
80
81/*************************************************************************/
82
/* One candidate document (or paragraph) held in the ranking heap. */
typedef struct HeapEntry
  {
    float Weight;		/* current rank weight (approximate until SeekPos is set) */
    float *OrgWeight;		/* pointer back to this document's accumulator slot */
    int DocNum;			/* 1-based document number */
    u_long SeekPos;		/* position in the text file in bytes; 0 => not yet fetched */
    u_long Len;			/* length of the document in bytes */
  }
HeapEntry;

/* Comparator: returns >0 if the first entry ranks "higher", <0 if lower,
   0 if equal.  See Heap_Lesser / Heap_Greater below. */
typedef int (*HeapComp) (HeapEntry *, HeapEntry *);

/* Binary heap with its entry array allocated in-line (struct hack:
   HE[1] is really HE[MaxSize] — see Heap_Make). */
typedef struct Heap
  {
    int NumItems;		/* number of entries currently in the heap */
    int MaxSize;		/* allocated capacity of HE[] */
    HeapComp HC;		/* active comparator */
    HeapEntry HE[1];		/* in-line entry array (MaxSize entries) */
  }
Heap;
103
104
105Heap *
106Heap_Make (int size, HeapComp hc)
107{
108 Heap *H;
109 H = Xmalloc (sizeof (Heap) + (size - 1) * sizeof (HeapEntry));
110 if (!H)
111 return NULL;
112 H->NumItems = 0;
113 H->MaxSize = size;
114 H->HC = hc;
115 return H;
116}
117
118int
119Heap_Size (Heap * H)
120{
121 return sizeof (Heap) + H->MaxSize * sizeof (HeapEntry);
122}
123
124HeapEntry *
125Heap_GetHead (Heap * H)
126{
127 if (H && H->NumItems)
128 return &H->HE[0];
129 else
130 return NULL;
131}
132
133
134
/*
 * Sift the entry at 1-based position i down the heap until the heap
 * property is restored (every parent ranks at least as "high" as its
 * children under H->HC).  Positions are 1-based while the HE[] array
 * is 0-based, hence the "- 1" adjustments throughout.
 */
void
Heap_Heapify (Heap * H, int i)
{
  register int curr, child;
  curr = i;
  child = curr * 2;
  while (child <= H->NumItems)
    {
      /* pick the higher-ranking of the two children (H->HC > 0 means
         its first argument ranks higher) */
      if (child < H->NumItems && H->HC (&H->HE[child], &H->HE[child - 1]) > 0)
	child++;
      /* swap parent and child if the child outranks the parent,
         then continue sifting from the child's position */
      if (H->HC (&H->HE[curr - 1], &H->HE[child - 1]) < 0)
	{
	  HeapEntry temp = H->HE[child - 1];
	  H->HE[child - 1] = H->HE[curr - 1];
	  H->HE[curr - 1] = temp;
	  curr = child;
	  child = 2 * child;
	}
      else
	break;
    }
}
157
158
159void
160Heap_Build (Heap * H)
161{
162 register int i;
163 for (i = H->NumItems / 2; i > 0; i--)
164 Heap_Heapify (H, i);
165}
166
167
/*
 * In-place heapsort: repeatedly swap the root (the highest-ranking
 * entry under H->HC) with the last entry, shrink the heap, and
 * re-heapify.  Afterwards HE[] holds the entries in reverse comparator
 * order.
 * NOTE(review): destructive — H->NumItems is left at 1, so the heap
 * cannot be reused afterwards without resetting NumItems and
 * rebuilding.
 */
void
Heap_Sort (Heap * H)
{
  register int i;
  for (i = H->NumItems; i > 1; i--)
    {
      HeapEntry temp = H->HE[0];
      H->HE[0] = H->HE[i - 1];
      H->HE[i - 1] = temp;
      H->NumItems--;
      Heap_Heapify (H, 1);
    }
}
181
182
/*
 * Remove the root entry by moving the last entry into its place and
 * sifting it down.  Precondition: H->NumItems > 0 (not checked here —
 * callers guard on H->NumItems).
 */
void
Heap_DeleteHead (Heap * H)
{
  H->HE[0] = H->HE[--H->NumItems];
  Heap_Heapify (H, 1);
}
189
190int
191Heap_Lesser (HeapEntry * a, HeapEntry * b)
192{
193 return (a->Weight > b->Weight ? -1 :
194 (a->Weight == b->Weight ? 0 : 1));
195}
196
197
198int
199Heap_Greater (HeapEntry * a, HeapEntry * b)
200{
201 return (a->Weight > b->Weight ? 1 :
202 (a->Weight == b->Weight ? 0 : -1));
203}
204
/*
 * Replace approximate weights at the top of the heap with exact ones.
 * While the root entry has never had its document start fetched
 * (SeekPos == 0), undo the division by the lower-approximation
 * document weight and divide by the exact value returned by
 * FetchDocStart (which also fills in SeekPos/Len, so this entry will
 * not be revisited), then re-heapify.  Loops until the root is an
 * entry whose weight is already exact.  Returns the number of entries
 * made exact.
 */
int
Make_Exact_Root (query_data * qd, Heap * H)
{
  int num = 0;
  HeapEntry *he = H->HE;
  while (he->SeekPos == 0)
    {
      he->Weight = he->Weight *
	GetLowerApproxDocWeight (qd->awd, he->DocNum - 1) /
	FetchDocStart (qd, he->DocNum, &he->SeekPos, &he->Len);
      Heap_Heapify (H, 1);
      num++;
    }
  return num;
}
220
221void
222Heap_Dump (Heap * H, int num)
223{
224 int i, l, lines, r;
225 if (num > H->NumItems)
226 num = H->NumItems;
227 lines = (num + 3) / 4;
228 for (l = 0; l < lines; l++)
229 for (r = 0; r < 4; r++)
230 {
231 i = lines * r + l;
232 if (i < num)
233 fprintf (stderr, "[%2d] %7.4f", i, H->HE[i].Weight);
234 fprintf (stderr, r == 3 ? "\n" : " ");
235 }
236 fprintf (stderr, "\n");
237}
238
239/*************************************************************************/
240
241
242
243
244
245
246int
247doc_count_comp (const void *A, const void *B)
248{
249 const TermEntry *a = A;
250 const TermEntry *b = B;
251 return (a->WE.doc_count - b->WE.doc_count);
252}
253
254/* =========================================================================
255 * Function: ParseRankedQuery
256 * Description:
257 * Takes a string query line and extracts the terms in the query
258 * which exist in the stemmed dictionary.
259 * Optionally sorts the terms into ascending order by doc. number.
260 * Input:
261 * stemmed dictionary, query line, sort-flag
262 * Output:
263 * A list of terms.
264 * ========================================================================= */
265
266/* [RPAP - Jan 97: Stem Index Change] */
/* [RPAP - Jan 97: Stem Index Change] */
/*
 * Parse a ranked query line into a TermList of terms found in the
 * stemmed dictionary.
 *
 * sd              - stemmed dictionary to look terms up in
 * QueryLine       - NUL-terminated query string
 * Sort            - if non-zero, sort the result ascending by doc_count
 * indexed         - non-zero when a stem index is available; enables
 *                   per-term stem-method parameters and wildcard-style
 *                   expansion via FindWords
 * query_term_list - out: term-frequency list for reporting
 *                   [RPAP - Feb 97: Term Frequency]
 *
 * Returns the (possibly empty) TermList; caller owns it.
 * NOTE(review): Word/sWord appear to be length-prefixed strings
 * (first byte = length) — the bcopy of *Word + 1 bytes and the
 * compare() calls rely on that; confirm against the parsing macros.
 */
static TermList *
ParseRankedQuery (stemmed_dict * sd, char *QueryLine, int Sort, int indexed,
		  QueryTermList **query_term_list) /* [RPAP - Feb 97: Term Frequency] */
{
  u_char Word[MAXSTEMLEN + 1];
  u_char sWord[MAXSTEMLEN + 1];
  u_char *end, *s_in;
  int default_stem_method = 0;
  TermList *Terms = MakeTermList(0);

  s_in = (u_char *) QueryLine;
  end = s_in + strlen ((char *) s_in) - 1;	/* end points at the last character */
  *query_term_list = MakeQueryTermList(0); /* [RPAP - Feb 97: Term Frequency] */

  /* With a stem index the default method comes from the environment
     (casefold = bit 0, stem = bit 1); otherwise it is fixed by the
     dictionary header. */
  if (indexed)
    default_stem_method = BooleanEnv (GetEnv ("casefold"), 0) | (BooleanEnv (GetEnv ("stem"), 0) << 1);
  else
    default_stem_method = sd->sdh.stem_method;

  while (s_in <= end)
    {
      int j;
      long num_entries, word_num;
      unsigned long count, doc_count, invf_ptr, invf_len;
      int weight_to_apply, stem_to_apply;
      int method_using = -1;

      /* 0=optional, 1=mustmatch */
      int require_match = 0; /* [RJM 07/97: Ranked Required Terms] */

      /* Skip over the non word separator taking note of any parameters */
      PARSE_RANKED_NON_STEM_WORD (require_match, s_in, end); /* [RJM 07/97: Ranked Required Terms] */
      if (s_in > end) break;

      /* Get a word and stem it */
      PARSE_STEM_WORD (Word, s_in, end);

      /* Extract any parameters */
      weight_to_apply = 1;
      stem_to_apply = default_stem_method;
      while (s_in <= end)
	{
	  int stem_param, weight_param, param_type;
	  char param[MAXPARAMLEN + 1];

	  param_type = 0;
	  PARSE_OPT_TERM_PARAM (param, param_type, s_in, end);
	  if (!param_type)
	    break;

	  switch (param_type)
	    {
	    case (WEIGHTPARAM):
	      /* NOTE(review): atoi() is not required to set errno, so
	         this ERANGE check is unreliable; strtol() would report
	         range errors properly — confirm before changing. */
	      weight_param = atoi (param);
	      if (errno != ERANGE && weight_param > 0)
		weight_to_apply = weight_param;
	      break;

	    case (STEMPARAM):
	      /* Per-term stem method override, only honoured when a
	         stem index exists.  Same atoi/errno caveat as above. */
	      stem_param = atoi (param);
	      if (errno != ERANGE && indexed && stem_param >= 0 && stem_param <= 3)
		method_using = stem_to_apply = stem_param;
	      break;
	    }
	}

      /* Copy the (length-prefixed) word and stem the copy, keeping the
         original form in Word for display/frequency reporting. */
      bcopy ((char *) Word, (char *) sWord, *Word + 1);
      stemmer (stem_to_apply, sd->sdh.stemmer_num, sWord);

      if (!indexed || stem_to_apply == 0)
	{
	  /* Exact (non-expanding) lookup path. */
	  /* Look for the word in the already identified terms */
	  for (j = 0; j < Terms->num; j++)
	    if (compare (Terms->TE[j].Word, Word) == 0)
	      break;

	  /* Increment the weight if the word is in the list */
	  /* Update the require match attribute */
	  if (j < Terms->num)
	    {
	      /* Saturate rather than overflow the accumulated weight. */
	      Terms->TE[j].Count = ((Terms->TE[j].Count + weight_to_apply > INT_MAX) ?
				    INT_MAX : (Terms->TE[j].Count + weight_to_apply));
	      Terms->TE[j].require_match = require_match; /* [RJM 07/97: Ranked Require match] */
	      AddQueryTerm (query_term_list, Word, Terms->TE[j].WE.count, method_using); /* [RPAP - Feb 97: Term Frequency] */
	    }
	  else
	    {
	      /* Look for it in the stemmed dictionary */
	      if ((word_num = FindWord (sd, sWord, &count, &doc_count,
					&invf_ptr, &invf_len)) != -1)
		{
		  /* Search the list for the word */
		  for (j = 0; j < Terms->num; j++)
		    if (Terms->TE[j].WE.word_num == word_num)
		      break;

		  /* Increment the weight if the word is in the list */
		  if (j < Terms->num)
		    {
		      Terms->TE[j].Count = ((Terms->TE[j].Count + weight_to_apply > INT_MAX) ?
					    INT_MAX : (Terms->TE[j].Count + weight_to_apply));
		      Terms->TE[j].require_match = require_match; /* [RJM 07/97: Ranked Require match] */
		      AddQueryTerm (query_term_list, Word, Terms->TE[j].WE.count, method_using); /* [RPAP - Feb 97: Term Frequency] */
		    }
		  else
		    {
		      /* Create a new entry in the list for the new word */
		      TermEntry te;

		      te.WE.word_num = word_num;
		      te.WE.count = count;
		      te.WE.doc_count = doc_count;
		      te.WE.max_doc_count = doc_count;
		      te.WE.invf_ptr = invf_ptr;
		      te.WE.invf_len = invf_len;
		      te.Count = weight_to_apply;
		      te.Word = copy_string (Word);
		      if (!te.Word)
			FatalError (1, "Could NOT create memory to add term");
		      te.Stem = NULL;
		      te.require_match = require_match;

		      AddTermEntry (&Terms, &te);

		      /* [RPAP - Feb 97: Term Frequency] */
		      AddQueryTerm (query_term_list, Word, count, method_using);
		    }
		}
	      /* [RPAP - Feb 97: Term Frequency] */
	      else
		AddQueryTerm (query_term_list, Word, 0, method_using);
	    }
	}
      else
	{
	  /* Stem-index path: one query word may expand into several
	     dictionary terms. */
	  int total_count = 0; /* [RPAP - Feb 97: Term Frequency] */
	  TermList *tempList = MakeTermList (0);
	  if ((num_entries = FindWords (sd, sWord, stem_to_apply, &tempList)) > 0)
	    {
	      int i;
	      u_long max_doc_count = 0;

	      /* get the maximum doc count */
	      for (i = 0; i < tempList->num; i++)
		{
		  if (tempList->TE[i].WE.doc_count > max_doc_count)
		    max_doc_count = tempList->TE[i].WE.doc_count;
		  total_count += tempList->TE[i].WE.count; /* [RPAP - Feb 97: Term Frequency] */
		}

	      for (i = 0; i < tempList->num; i++)
		{
		  /* Look for the word(s) in the already identified terms */
		  for (j = 0; j < Terms->num; j++)
		    {
		      if (compare (Terms->TE[j].Word, tempList->TE[i].Word) == 0)
			{
			  /* found the word */
			  /* Modify weight */
			  Terms->TE[j].Count = ((Terms->TE[j].Count + weight_to_apply > INT_MAX) ?
						INT_MAX : (Terms->TE[j].Count + weight_to_apply));
			  if (Terms->TE[j].WE.max_doc_count < max_doc_count)
			    Terms->TE[j].WE.max_doc_count = max_doc_count;
			  break;
			}
		    }

		  if (j == Terms->num)
		    {
		      /* word was not found */
		      tempList->TE[i].WE.max_doc_count = max_doc_count;
		      tempList->TE[i].Count = weight_to_apply;

		      /* We cannot require a term to match if it is expanded */
		      /* into multiple terms :-( */
		      tempList->TE[i].require_match = 0; /* [RJM 07/97: Ranked Required Terms] */

		      AddTermEntry (&Terms, &tempList->TE[i]);
		    }
		}
	    }
	  /* [RPAP - Feb 97: Term Frequency] */
	  AddQueryTerm (query_term_list, Word, total_count, method_using);

	  if (tempList != NULL) Xfree(tempList); /* [RJM 07/98: Memory Leak] */
	} /* end indexed */
    } /* end while */

  if (Sort)
    /* Sort the terms in ascending order by doc_count */
    qsort (Terms->TE, Terms->num, sizeof (TermEntry), doc_count_comp);
  return (Terms);
}
460
461
462
463
464static int
465DE_comp (void *a, void *b)
466{
467 return ((DocEntry *) a)->SeekPos - ((DocEntry *) b)->SeekPos;
468}
469
470
471
472
473
474/*
475 * This function is given a list of term numbers and it returns a list
476 * of document numbers based on the cosine document weighting system.
477 * This puts the entries in an array.
478 * inverted file.
479 * If MaxDocs == -1 then it means all
480 */
481static DocList *
482CosineGet (query_data * qd, TermList * Terms, RankedQueryInfo * rqi) {
483 DocList *Docs;
484 float *AccumulatedWeights = NULL;
485 Splay_Tree *ST = NULL;
486 Splay_Tree *Para_ST = NULL;
487 Hash_Table *HT = NULL;
488 List_Table *LT = NULL;
489 Heap *H;
490 HeapEntry *he;
491 register float *fptr = NULL;
492 register Invf_Doc_Entry *ide = NULL;
493 register Invf_Doc_EntryH *ideh = NULL;
494 int BackEnd, NumExact, MaxExact, NumParas;
495 int MaxDocs = 0, MaxParas = 0;
496 int i;
497 Invf_Doc_Entry_Pool ide_pool;
498 ide_pool.pool = NULL;
499
500 qd->hops_taken = qd->num_of_ptrs = qd->num_of_accum = 0;
501
502 switch (rqi->AccumMethod)
503 {
504 case 'S':
505 ST = CosineDecodeSplay (qd, Terms, rqi, &ide_pool);
506 if (!ST)
507 return NULL;
508 break;
509 case 'A':
510 AccumulatedWeights = CosineDecode (qd, Terms, rqi);
511 if (!AccumulatedWeights)
512 return NULL;
513 break;
514 case 'H':
515 HT = CosineDecodeHash (qd, Terms, rqi);
516 if (!HT)
517 return NULL;
518 break;
519 case 'L':
520 LT = CosineDecodeList (qd, Terms, rqi);
521 if (!LT)
522 return NULL;
523 break;
524 }
525
526#if 0
527 if (rqi->UseSplayTree)
528 {
529
530 AccumulatedWeights = CosineDecode (qd, Terms, rqi);
531 fptr = AccumulatedWeights;
532 ide = SP_get_first (ST);
533 for (i = 0; i < qd->sd->sdh.num_of_docs; i++)
534 {
535 if (AccumulatedWeights[i] != 0)
536 {
537 if (i != ide->DocNum)
538 fprintf (stderr, "Sum mismatch for %d %f %d %f\n", i + 1,
539 AccumulatedWeights[i], ide->DocNum + 1, ide->Sum);
540 ide = SP_get_next (ST);
541 }
542 }
543 }
544#endif
545
546 switch (rqi->AccumMethod)
547 {
548 case 'S':
549 MaxParas = ST->no_of_items;
550 break;
551 case 'A':
552 { /* count the number of non-zero document weights */
553 register int i = qd->sd->sdh.num_of_docs;
554 register float *d;
555 MaxParas = 0;
556 for (d = AccumulatedWeights; i; i--, d++)
557 if (*d)
558 MaxParas++;
559 }
560 break;
561 case 'H':
562 MaxParas = HT->num + HT->Suplimentary_Num;
563 break;
564 case 'L':
565 MaxParas = LT->num;
566 break;
567 }
568
569 if (rqi->MaxParasToRetrieve != -1 && MaxParas > rqi->MaxParasToRetrieve)
570 MaxParas = rqi->MaxParasToRetrieve;
571 MaxDocs = MaxParas;
572
573 /* Allocate memory for the heap */
574 Docs = MakeDocList (MaxDocs);
575 ChangeMemInUse (qd, sizeof (DocEntry) * MaxDocs);
576
577 H = Heap_Make (MaxDocs, Heap_Lesser);
578
579
580 /* Get the sums from the array divide the sums by the
581 document weights which we retrieve from the ".idx.wgt" file and put
582 the resulting data into a heap */
583
584
585 he = H->HE;
586 H->NumItems = MaxDocs;
587 switch (rqi->AccumMethod)
588 {
589 case 'S':
590 {
591 ide = SP_get_first (ST);
592 for (i = 0; i < H->NumItems; i++, ide = SP_get_next (ST), he++)
593 {
594 he->DocNum = ide->DocNum + 1;
595 he->OrgWeight = &ide->Sum;
596 qd->num_of_accum++;
597 }
598 }
599 break;
600 case 'A':
601 {
602 fptr = AccumulatedWeights;
603 for (i = 0; i < H->NumItems; i++, fptr++, he++)
604 {
605 he->DocNum = i + 1;
606 he->OrgWeight = fptr;
607 if (*fptr)
608 qd->num_of_accum++;
609 }
610 }
611 break;
612 case 'H':
613 {
614 ideh = HT->Head;
615 for (i = 0; i < H->NumItems; i++, ideh = ideh->next, he++)
616 {
617 he->DocNum = ideh->IDE.DocNum + 1;
618 he->OrgWeight = &ideh->IDE.Sum;
619 qd->num_of_accum++;
620 }
621 }
622 break;
623 case 'L':
624 {
625 ide = LT->IDE;
626 for (i = 0; i < H->NumItems; i++, ide++, he++)
627 {
628 he->DocNum = ide->DocNum + 1;
629 he->OrgWeight = &ide->Sum;
630 qd->num_of_accum++;
631 }
632 }
633 break;
634 }
635
636 he = H->HE;
637 for (i = 0; i < H->NumItems; i++, he++)
638 {
639 *he->OrgWeight /= GetLowerApproxDocWeight (qd->awd, he->DocNum - 1);
640 he->Weight = *he->OrgWeight;
641 *he->OrgWeight = 0;
642 he->SeekPos = he->Len = 0;
643 }
644
645 Heap_Build (H);
646
647 he = H->HE;
648 switch (rqi->AccumMethod)
649 {
650 case 'S':
651 {
652 for (i = MaxDocs; i < ST->no_of_items; i++, ide = SP_get_next (ST))
653 {
654 ide->Sum /= GetLowerApproxDocWeight (qd->awd, ide->DocNum);
655 qd->num_of_accum++;
656 if (ide->Sum <= he->Weight)
657 continue;
658 *he->OrgWeight = he->Weight;
659 he->DocNum = ide->DocNum + 1;
660 he->Weight = ide->Sum;
661 he->OrgWeight = &ide->Sum;
662 *he->OrgWeight = 0;
663 Heap_Heapify (H, 1);
664 }
665 }
666 break;
667 case 'A':
668 {
669 for (i = MaxDocs; i < qd->sd->sdh.num_of_docs; i++, fptr++)
670 {
671 if (!*fptr)
672 continue;
673 qd->num_of_accum++;
674 *fptr /= GetLowerApproxDocWeight (qd->awd, i);
675 if (*fptr <= he->Weight)
676 continue;
677 *he->OrgWeight = he->Weight;
678 he->DocNum = i + 1;
679 he->Weight = *fptr;
680 he->OrgWeight = fptr;
681 *he->OrgWeight = 0;
682 Heap_Heapify (H, 1);
683 }
684 }
685 break;
686 case 'H':
687 {
688 for (; ideh; ideh = ideh->next)
689 {
690 qd->num_of_accum++;
691 ideh->IDE.Sum /=
692 GetLowerApproxDocWeight (qd->awd, ideh->IDE.DocNum);
693 if (ideh->IDE.Sum <= he->Weight)
694 continue;
695 *he->OrgWeight = he->Weight;
696 he->DocNum = ideh->IDE.DocNum + 1;
697 he->Weight = ideh->IDE.Sum;
698 he->OrgWeight = &ideh->IDE.Sum;
699 *he->OrgWeight = 0;
700 Heap_Heapify (H, 1);
701 }
702 }
703 break;
704 case 'L':
705 {
706 for (i = MaxDocs; i < LT->num; i++, ide++)
707 {
708 qd->num_of_accum++;
709 ide->Sum /=
710 GetLowerApproxDocWeight (qd->awd, ide->DocNum);
711 if (ide->Sum <= he->Weight)
712 continue;
713 *he->OrgWeight = he->Weight;
714 he->DocNum = ide->DocNum + 1;
715 he->Weight = ide->Sum;
716 he->OrgWeight = &ide->Sum;
717 *he->OrgWeight = 0;
718 Heap_Heapify (H, 1);
719 }
720 }
721 break;
722 }
723
724
725 if (rqi->Exact && qd->id->ifh.InvfLevel != 3)
726 {
727 HeapEntry *he = H->HE;
728
729 for (i = 0; i < H->NumItems; i++, he++)
730 {
731 he->Weight = he->Weight *
732 GetLowerApproxDocWeight (qd->awd, he->DocNum - 1) /
733 FetchDocStart (qd, he->DocNum, &he->SeekPos, &he->Len);
734 }
735
736 Heap_Build (H);
737
738 he = H->HE;
739
740 switch (rqi->AccumMethod)
741 {
742 case 'S':
743 {
744 ide = SP_get_first (ST);
745 for (i = 0; i < ST->no_of_items; i++, ide = SP_get_next (ST))
746 {
747 u_long SeekPos, Len;
748 float Weight;
749 if (!ide->Sum)
750 continue;
751 if (ide->Sum <= he->Weight)
752 continue;
753 Weight = ide->Sum *
754 GetLowerApproxDocWeight (qd->awd, ide->DocNum) /
755 FetchDocStart (qd, ide->DocNum + 1, &SeekPos, &Len);
756 if (Weight <= he->Weight)
757 continue;
758 he->DocNum = ide->DocNum + 1;
759 he->OrgWeight = &ide->Sum;
760 he->Weight = Weight;
761 he->SeekPos = SeekPos;
762 he->Len = Len;
763 ide->Sum = 0;
764 Heap_Heapify (H, 1);
765 }
766 }
767 break;
768
769 /* up to here */
770
771 case 'A':
772 {
773 fptr = AccumulatedWeights;
774 for (i = 0; i < qd->sd->sdh.num_of_docs; i++, fptr++)
775 {
776 u_long SeekPos, Len;
777 float Weight;
778 if (!*fptr)
779 continue;
780 if (*fptr <= he->Weight)
781 continue;
782 Weight = *fptr *
783 GetLowerApproxDocWeight (qd->awd, i) /
784 FetchDocStart (qd, i + 1, &SeekPos, &Len);
785 if (Weight <= he->Weight)
786 continue;
787 he->DocNum = i + 1;
788 he->OrgWeight = fptr;
789 he->Weight = Weight;
790 he->SeekPos = SeekPos;
791 he->Len = Len;
792 *fptr = 0;
793 Heap_Heapify (H, 1);
794 }
795 }
796 break;
797 case 'H':
798 {
799 ideh = HT->Head;
800 for (ideh = HT->Head; ideh; ideh = ideh->next)
801 {
802 u_long SeekPos, Len;
803 float Weight;
804 if (!ideh->IDE.Sum)
805 continue;
806 if (ideh->IDE.Sum <= he->Weight)
807 continue;
808 Weight = ideh->IDE.Sum *
809 GetLowerApproxDocWeight (qd->awd, ideh->IDE.DocNum) /
810 FetchDocStart (qd, ideh->IDE.DocNum + 1, &SeekPos, &Len);
811 if (Weight <= he->Weight)
812 continue;
813 he->DocNum = ideh->IDE.DocNum + 1;
814 he->OrgWeight = &ideh->IDE.Sum;
815 he->Weight = Weight;
816 he->SeekPos = SeekPos;
817 he->Len = Len;
818 ideh->IDE.Sum = 0;
819 Heap_Heapify (H, 1);
820 }
821 }
822 break;
823 case 'L':
824 {
825 ide = LT->IDE;
826 for (i = 0; i < LT->num; i++, ide++)
827 {
828 u_long SeekPos, Len;
829 float Weight;
830 if (!ide->Sum)
831 continue;
832 if (ide->Sum <= he->Weight)
833 continue;
834 Weight = ide->Sum *
835 GetLowerApproxDocWeight (qd->awd, ide->DocNum) /
836 FetchDocStart (qd, ide->DocNum + 1, &SeekPos, &Len);
837 if (Weight <= he->Weight)
838 continue;
839 he->DocNum = ide->DocNum + 1;
840 he->OrgWeight = &ide->Sum;
841 he->Weight = Weight;
842 he->SeekPos = SeekPos;
843 he->Len = Len;
844 ide->Sum = 0;
845 Heap_Heapify (H, 1);
846 }
847 }
848 break;
849 }
850 }
851
852
853
854 H->HC = Heap_Greater;
855 Heap_Build (H);
856
857
858 MaxDocs = H->NumItems;
859 if (rqi->MaxDocsToRetrieve != -1 && MaxDocs > rqi->MaxDocsToRetrieve)
860 MaxDocs = rqi->MaxDocsToRetrieve;
861
862 /* Alarm */
863
864 he = H->HE;
865 BackEnd = H->NumItems - 1;
866 NumExact = 0;
867 MaxExact = H->NumItems;
868 NumParas = 0;
869 Para_ST = SP_createset (DE_comp);
870 while (H->NumItems && Docs->num < MaxDocs)
871 {
872 DocEntry DocEnt;
873 DocEntry *mem;
874
875 if (rqi->Exact)
876 {
877 if (H->HE[0].SeekPos == 0)
878 NumExact += Make_Exact_Root (qd, H);
879 }
880 else
881 FetchDocStart (qd, he->DocNum, &he->SeekPos, &he->Len);
882
883 NumParas++;
884
885 DocEnt.DocNum = he->DocNum;
886 DocEnt.Weight = he->Weight;
887 DocEnt.Len = he->Len;
888 DocEnt.SeekPos = he->SeekPos;
889 DocEnt.CompTextBuffer = NULL;
890 DocEnt.Next = NULL;
891
892 Heap_DeleteHead (H);
893
894 if (!(mem = SP_member (&DocEnt, Para_ST)))
895 {
896 Docs->DE[Docs->num] = DocEnt;
897 SP_insert (&Docs->DE[Docs->num], Para_ST);
898 Docs->num++;
899 }
900 else
901 {
902 DocEnt.Next = mem->Next;
903 Docs->DE[BackEnd] = DocEnt;
904 mem->Next = &Docs->DE[BackEnd--];
905 }
906 }
907 SP_freeset (Para_ST);
908
909 if (qd->id->ifh.InvfLevel == 3)
910 {
911 Message ("%d Paragraphs were required to get %d documents",
912 NumParas, Docs->num);
913 if (NumExact == MaxExact)
914 {
915 Message ("The exact weights of all %d paragraphs had to be calculated", MaxExact);
916 Message ("to obtain %d paragraphs. This may mean that the the documents", NumParas);
917 Message ("returned do not necessarly represent an exact cosine ranking.");
918 Message ("This problem may be corrected by increasing \'maxparas\'.");
919 }
920 }
921#if 0
922 {
923 int i;
924 FILE *f = fopen ("top.paras", "w");
925 fprintf (f, "=========================\nTop Paragraphs\n");
926 for (i = 0; i < Docs->num; i++)
927 {
928 DocEntry *d;
929 fprintf (f, "<%d(%f)> ", Heap[i].DocNum, Heap[i].Weight);
930 for (d = Heap[i].Next; d; d = d->Next)
931 fprintf (f, "%d(%f) ", d->DocNum, d->Weight);
932 fprintf (f, "\n");
933 }
934 fprintf (f, "=========================\n");
935 fclose (f);
936 }
937#endif
938
939 if (AccumulatedWeights)
940 {
941 Xfree (AccumulatedWeights);
942 ChangeMemInUse (qd, -sizeof (float) * qd->sd->sdh.num_of_docs);
943 }
944 if (ST)
945 {
946 int mem = ST->mem_in_use;
947 SP_freeset (ST);
948 ChangeMemInUse (qd, -mem);
949 free_ide_pool (qd, &ide_pool);
950 }
951 if (HT)
952 HT_free (qd, HT);
953
954 if (LT)
955 LT_free (qd, LT);
956
957 if (H) Xfree (H); /* [RJM 07/98: Memory Leak] */
958
959 return (Docs);
960}
961
962
963
964
965/* if MaxDocs == -1 it means all */
/*
 * Top-level entry point for a ranked query: parse the query string
 * into a term list, evaluate it with the cosine measure, and install
 * the resulting document list in qd->DL.  Any term list, query-term
 * list or document list left over from a previous query is freed
 * first.  Aborts via FatalError if evaluation runs out of memory.
 */
void
RankedQuery (query_data *qd, char *Query, RankedQueryInfo *rqi)
{
  DocList *dl;

  if (qd->TL)
    FreeTermList (&(qd->TL));

  /* [RPAP - Feb 97: Term Frequency] */
  if (qd->QTL)
    FreeQueryTermList (&(qd->QTL));

  qd->TL = ParseRankedQuery (qd->sd, Query, rqi->Sort, qd->sd->sdh.indexed, /* [RPAP - Jan 97: Stem Index Change] */
			     &(qd->QTL)); /* [RPAP - Feb 97: Term Frequency] */

  /* PrintTermList (qd->TL, stderr); */

  dl = CosineGet (qd, qd->TL, rqi);

  if (!dl)
    FatalError (1, "Out of memory\n");

  /* Free the previous result list only after the new one is built. */
  FreeQueryDocs (qd);

  qd->DL = dl;
}
Note: See TracBrowser for help on using the repository browser.