source: main/tags/2.80/indexers/mg/src/text/invf_get.c@ 24541

Last change on this file since 24541 was 3745, checked in by mdewsnip, 21 years ago

Addition of MG package for search and retrieval

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 39.8 KB
Line 
1/**************************************************************************
2 *
3 * invf_get.c -- Inverted file reading routines
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: invf_get.c 3745 2003-02-20 21:20:24Z mdewsnip $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25
26#include "memlib.h"
27#include "filestats.h"
28#include "timing.h"
29#include "bitio_gen.h"
30#include "bitio_m_mem.h"
31#include "bitio_m.h"
32#include "sptree.h"
33#include "messages.h"
34#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
35
36#include "mg.h"
37#include "invf.h"
38#include "text.h"
39#include "lists.h"
40#include "backend.h"
41#include "invf_get.h"
42#include "mg_errors.h"
43
44#define SKIPPING
45#define SHORT_CUT
46
47struct pool
48 {
49 int length, pos;
50 struct pool *next;
51 Invf_Doc_Entry ide[1];
52 };
53
54static int skip_header = 0;
55
56static char skip_header_text0[] =
57"#######################################################################\n"
58"#\n"
59"# 1\tWord number.\n"
60"# 2\tNumber of time the word occurs in the collection.\n"
61"# 3\tNumber of documents the word occurs in.\n"
62"# 4\tThe word.\n"
63"# 5\tNumber of docnum/word_count entries decoded.\n"
64"# 6\tNumber of hits.\n"
65"# 7\tNumber of entries in the splay tree or hash table.\n"
66"# 8\tAmount added to the accumulators while while processing this word.\n"
67"#\n"
68"#######################################################################\n";
69
70static char skip_header_text1[] =
71"#######################################################################\n"
72"#\n"
73"# 1\tWord number.\n"
74"# 2\tNumber of time the word occurs in the collection.\n"
75"# 3\tNumber of documents the word occurs in.\n"
76"# 4\tThe word.\n"
77"# 5\tNumber of skips taken in evaluating the inverted file entry.\n"
78"# 6\tNumber of docnum/word_count entries decoded.\n"
79"# 7\tNumber of hits.\n"
80"# 8\tNumber of entries in the splay tree or hash table.\n"
81"# 9\tAmount added to the accumulators while while processing this word.\n"
82"#\n"
83"#######################################################################\n";
84
85
86static char skip_header_text2[] =
87"#######################################################################\n"
88"#\n"
89"# 1\tWord number.\n"
90"# 2\tNumber of time the word occurs in the collection.\n"
91"# 3\tNumber of documents the word occurs in.\n"
92"# 4\tThe word.\n"
93"# 5\tSkip size.\n"
94"# 6\tNumber of skips taken in evaluating the inverted file entry.\n"
95"# 7\tNumber of docnum/word_count entries decoded.\n"
96"# 8\tNumber of hits.\n"
97"# 9\tNumber of entries in the splay tree or hash table.\n"
98"# 10\tAmount added to the accumulators while while processing this word.\n"
99"#\n"
100"#######################################################################\n";
101
102static char *skip_header_text[] =
103{skip_header_text0,
104 skip_header_text1,
105 skip_header_text2};
106
107
108
109
110
111
112
113
114/**************************************************************************
115 *
116 * Code for the hash table.
117 *
118 */
119
120static Hash_Table *
121HT_create (query_data * qd, unsigned max_entries,
122 unsigned max_size)
123{
124 unsigned es, hts;
125 Hash_Table *HT;
126 if (max_entries == 0)
127 max_entries = 8192;
128 es = sizeof (Hash_Table) + sizeof (Invf_Doc_EntryH) * (max_entries - 1);
129 hts = sizeof (Invf_Doc_EntryH *) * max_size;
130
131 HT = Xmalloc (es);
132 if (!HT)
133 return NULL;
134
135 HT->HashTable = Xmalloc (hts);
136 if (!(HT->HashTable))
137 {
138 Xfree (HT);
139 return NULL;
140 }
141
142 ChangeMemInUse (qd, es + hts);
143 bzero ((char *) (HT->HashTable), hts);
144 HT->max = max_entries;
145 HT->num = 0;
146 HT->Suplimentary_Entries = NULL;
147 HT->Suplimentary_Size = 0;
148 HT->Suplimentary_Num = 0;
149 HT->Head = NULL;
150 HT->size = max_size;
151 return HT;
152}
153
154
155static void
156HT_Sup_create (query_data * qd, Hash_Table * HT, unsigned max_size)
157{
158 int es = sizeof (Invf_Doc_EntryH) * max_size;
159 HT->Suplimentary_Entries = Xmalloc (es);
160 if (!HT->Suplimentary_Entries)
161 return;
162 HT->Suplimentary_Size = max_size;
163 HT->Suplimentary_Num = 0;
164 ChangeMemInUse (qd, es);
165}
166
167static void
168HT_Sup_Add (Hash_Table * HT, unsigned long DN, float Sum)
169{
170 Invf_Doc_EntryH *IDEH;
171 IDEH = &HT->Suplimentary_Entries[HT->Suplimentary_Num++];
172 IDEH->next = NULL;
173 IDEH->IDE.DocNum = DN;
174 IDEH->IDE.Sum = Sum;
175}
176
177Invf_Doc_Entry *
178HT_find (Hash_Table * HT, unsigned long DN)
179{
180 Invf_Doc_EntryH *IDEH;
181 if (!HT->HashTable)
182 FatalError (1, "Accessing the hash_table through HT_find after HT_sort");
183 IDEH = HT->HashTable[DN % HT->size];
184 while (IDEH)
185 {
186 if (IDEH->IDE.DocNum == DN)
187 return &(IDEH->IDE);
188 IDEH = IDEH->next;
189 }
190 return NULL;
191}
192
193
194static int
195HT_insert (Hash_Table * HT,
196 unsigned long DN, float Sum)
197{
198 Invf_Doc_EntryH **IDEH;
199 if (HT->num == HT->max)
200 return 0;
201 IDEH = &(HT->HashTable[DN % HT->size]);
202 while (*IDEH)
203 IDEH = &((*IDEH)->next);
204 *IDEH = &(HT->IDEH[HT->num++]);
205 (*IDEH)->next = NULL;
206 (*IDEH)->IDE.DocNum = DN;
207 (*IDEH)->IDE.Sum = Sum;
208 return 1;
209}
210
211
212void
213HT_free (query_data * qd, Hash_Table * HT)
214{
215 unsigned es;
216 es = sizeof (Hash_Table) + sizeof (Invf_Doc_EntryH) * HT->max +
217 sizeof (Invf_Doc_EntryH *) * HT->size;
218 if (HT->HashTable)
219 Xfree (HT->HashTable);
220 if (HT->Suplimentary_Entries)
221 {
222 Xfree (HT->Suplimentary_Entries);
223 es += sizeof (Invf_Doc_EntryH) * HT->Suplimentary_Size;
224 }
225 Xfree (HT);
226 ChangeMemInUse (qd, -es);
227}
228
229
230static int
231IDEH_comp (const void *a, const void *b)
232{
233 const Invf_Doc_EntryH *A = a;
234 const Invf_Doc_EntryH *B = b;
235 return A->IDE.DocNum - B->IDE.DocNum;
236}
237
238static void
239HT_sort (query_data * qd, Hash_Table * HT)
240{
241 int i;
242 Invf_Doc_EntryH *l, *r, **b;
243 unsigned es = sizeof (Invf_Doc_EntryH *) * HT->size;
244 if (HT->HashTable)
245 Xfree (HT->HashTable);
246 HT->size = 0;
247 HT->HashTable = NULL;
248 ChangeMemInUse (qd, -es);
249
250 qsort (HT->IDEH, HT->num, sizeof (Invf_Doc_EntryH), IDEH_comp);
251
252 for (i = 0; i < HT->num - 1; i++)
253 HT->IDEH[i].next = &HT->IDEH[i + 1];
254 HT->IDEH[i].next = NULL;
255 l = HT->IDEH;
256
257 if (HT->Suplimentary_Entries)
258 {
259 for (i = 0; i < HT->Suplimentary_Num - 1; i++)
260 HT->Suplimentary_Entries[i].next = &HT->Suplimentary_Entries[i + 1];
261 HT->Suplimentary_Entries[i].next = NULL;
262 }
263 r = HT->Suplimentary_Entries;
264
265 b = &HT->Head;
266
267 while (l || r)
268 {
269 if (r && (!l || (l->IDE.DocNum > r->IDE.DocNum)))
270 {
271 *b = r;
272 b = &r->next;
273 r = *b;
274 }
275 else
276 {
277 *b = l;
278 b = &l->next;
279 l = *b;
280 }
281 }
282 *b = NULL;
283}
284
285
286
287
288
289
290/**************************************************************************
291 *
292 * Code for the hash table.
293 *
294 */
295
296static List_Table *
297LT_create (query_data * qd, unsigned max_entries)
298{
299 unsigned es;
300 List_Table *LT;
301 if (max_entries == 0)
302 max_entries = 8192;
303 es = sizeof (List_Table) + sizeof (Invf_Doc_Entry) * (max_entries - 1);
304
305 LT = Xmalloc (es);
306 if (!LT)
307 return NULL;
308
309 ChangeMemInUse (qd, es);
310 LT->max = max_entries;
311 LT->num = 0;
312 return LT;
313}
314
315
316
317Invf_Doc_Entry *
318LT_find (List_Table * LT, unsigned long DN)
319{
320 int l = 0;
321 int r = LT->num - 1;
322 int m, c;
323 while (l <= r)
324 {
325 m = (l + r) / 2;
326 c = DN - LT->IDE[m].DocNum;
327 if (c < 0)
328 r = m - 1;
329 else if (c > 0)
330 l = m + 1;
331 else
332 return &LT->IDE[m];
333 }
334 return NULL;
335}
336
337static List_Table *
338LT_add (query_data * qd, List_Table * LT,
339 unsigned long DN, float Sum)
340{
341 Invf_Doc_Entry *ide;
342
343 /* increase the size of the list if there is not enough */
344 /* room for this entry */
345 if (LT->num == LT->max)
346 {
347 int old, new, max_entries;
348 old = sizeof (List_Table) + sizeof (Invf_Doc_Entry) * LT->max;
349 max_entries = LT->max + (LT->max >> 1);
350 new = sizeof (List_Table) + sizeof (Invf_Doc_Entry) * max_entries;
351
352 LT = Xrealloc (LT, new);
353 if (!LT)
354 return NULL;
355
356 ChangeMemInUse (qd, new - old);
357 LT->max = max_entries;
358 }
359
360 /* add the entry to the end of the list */
361 ide = &LT->IDE[LT->num];
362 ide->DocNum = DN;
363 ide->Sum = Sum;
364 LT->num++;
365
366 return LT;
367}
368
369
370void
371LT_free (query_data * qd, List_Table * LT)
372{
373 unsigned es;
374 es = sizeof (List_Table) + sizeof (Invf_Doc_Entry) * LT->max;
375 Xfree (LT);
376 ChangeMemInUse (qd, -es);
377}
378
379
380static int
381IDE_comp (const void *a, const void *b)
382{
383 const Invf_Doc_Entry *A = a;
384 const Invf_Doc_Entry *B = b;
385 return A->DocNum - B->DocNum;
386}
387
388static void
389LT_sort (List_Table * LT)
390{
391 qsort (LT->IDE, LT->num, sizeof (Invf_Doc_Entry), IDE_comp);
392}
393
394
395/* [RJM 07/97: Ranked Required Terms] */
396/* accumulators with a sum less than or equal to 0.0 are to be deleted */
397static void
398LT_pack (List_Table * LT)
399{
400 int s, d = 0;
401 for (s = 0; s < LT->num; s++) {
402 if (LT->IDE[s].Sum <= 0.0) {
403 /* ignore this accumulator */
404
405 } else if (d > 0 && LT->IDE[s].DocNum == LT->IDE[d-1].DocNum) {
406 LT->IDE[d-1].Sum += LT->IDE[s].Sum;
407
408 } else {
409 if (d != s) LT->IDE[d] = LT->IDE[s];
410 d++;
411 }
412 }
413
414 if (LT->num) LT->num = d;
415}
416
417
418/**************************************************************************/
419
420
421
422invf_data *
423InitInvfFile (File * InvfFile, stemmed_dict * sd)
424{
425 invf_data *id;
426 if (!(id = Xmalloc (sizeof (invf_data))))
427 {
428 mg_errno = MG_NOMEM;
429 return NULL;
430 }
431
432 id->InvfFile = InvfFile;
433
434 Fread (&id->ifh, sizeof (id->ifh), 1, InvfFile);
435 /* [RPAP - Jan 97: Endian Ordering] */
436 {
437 int i;
438
439 NTOHUL(id->ifh.no_of_words);
440 NTOHUL(id->ifh.no_of_ptrs);
441 NTOHUL(id->ifh.skip_mode);
442 for (i = 0; i < 16; i++)
443 NTOHUL(id->ifh.params[i]);
444 NTOHUL(id->ifh.InvfLevel);
445 }
446
447 id->N = sd->sdh.num_of_docs;
448 id->Nstatic = sd->sdh.static_num_of_docs;
449
450 return (id);
451}
452
453
454void
455FreeInvfData (invf_data * id)
456{
457 Xfree (id);
458}
459
460/****************************************************************************/
461
462
463static void
464CalcBlks (query_data * qd, WordEntry * we, int *blk,
465 int *dn_blk, int *len_blk, int *k, int *p)
466{
467 *p = we->doc_count;
468
469 *blk = BIO_Bblock_Init (qd->id->Nstatic, *p);
470 switch (qd->id->ifh.skip_mode)
471 {
472 case 1:
473 {
474 unsigned long len;
475 if (*p <= qd->id->ifh.params[0])
476 *k = 0;
477 else
478 {
479 *k = qd->id->ifh.params[0];
480 len = BIO_Bblock_Bound (qd->id->N, *p);
481 if (qd->id->ifh.InvfLevel >= 2)
482 len += we->count;
483 *dn_blk = BIO_Bblock_Init (qd->sd->sdh.num_of_docs, (*p + *k - 1) / *k);
484 *len_blk = BIO_Bblock_Init (len, (*p + *k - 1) / *k);
485 }
486 break;
487 }
488 case 2:
489 {
490 unsigned long len;
491 if (*p <= qd->id->ifh.params[1])
492 *k = 0;
493 else
494 {
495 *k = (int) (2 * sqrt (((double) (*p)) / qd->id->ifh.params[0]));
496 if (*k <= qd->id->ifh.params[1])
497 *k = qd->id->ifh.params[1];
498 len = BIO_Bblock_Bound (qd->id->N, *p);
499 if (qd->id->ifh.InvfLevel >= 2)
500 len += we->count;
501 *dn_blk = BIO_Bblock_Init (qd->sd->sdh.num_of_docs,
502 (*p + *k - 1) / *k);
503 *len_blk = BIO_Bblock_Init (len, (*p + *k - 1) / *k);
504 }
505 break;
506 }
507 default:
508 *k = 0;
509 }
510}
511
512
513/* =========================================================================
514 * Function: GetDocsOp
515 * Description:
516 * if op == op_term then
517 * look up term in invf and decode all its entries/documents
518 * if op == op_and_terms then
519 * go thru L, testing if it exists in invf using skips
520 * if it does exist then overwrite the entry nearer the start of the list
521 * if op == op_and_or_terms then
522 * go thru L, testing if it exists in invf using skips
523 * BUT only mark ones which are found - keep list intact
524 * if op == op_diff_terms then >>>> NOT tested <<<<
525 * Input:
526 * Output:
527 * ========================================================================= */
528
529DocList *
530GetDocsOp (query_data * qd, WordEntry * we, Op_types op, DocList * L)
531{
532 u_char *buffer;
533 DocEntry *src = NULL, *dest = NULL;
534 int i; /* index counter into inverted list */
535 unsigned long pos;
536 int next_start, next_doc_num, next_i;
537 int block_counter; /* within counter in block */
538 int dn_blk = 0, len_blk = 0;
539 int skip_size, have_skipping;
540 int blk; /* bblock control parameter */
541 int ft; /* The number of documents the word occurs in */
542 unsigned long CurrDoc = 0; /* NOTE: Document numbers start at 1 */
543 float Weight;
544
545 /* ... AND empty */
546 if (op == op_and_terms && (!L || !L->num))
547 {
548 if (L)
549 Xfree (L);
550 return MakeDocList (0);
551 }
552
553 if (op == op_term)
554 L = MakeDocList (we->doc_count);
555
556 CalcBlks (qd, we, &blk, &dn_blk, &len_blk, &skip_size, &ft);
557 have_skipping = (skip_size != 0);
558 if (!(buffer = Xmalloc (we->invf_len)))
559 return MakeDocList (0);
560
561 ChangeMemInUse (qd, we->invf_len);
562
563 qd->num_of_terms++;
564
565 /* --- Read an inverted file entry for the given word --- */
566 Fseek (qd->File_invf, we->invf_ptr, 0);
567 Fread (buffer, sizeof (char), we->invf_len, qd->File_invf);
568
569 /* Read from the buffer */
570 DECODE_START (buffer, we->invf_len);
571
572 Weight = qd->sd->sdh.num_of_words / (100 * log2 ((float) we->count + 1));
573
574 if (op != op_term)
575 dest = src = L->DE;
576 i = pos = 0;
577 next_i = next_start = next_doc_num = 0;
578 block_counter = skip_size - 1;
579
580 /* Go thru the elements of an inverted list for a term */
581 /* May not go sequentially if skipping is done */
582 while (i < ft && ((op == op_term) || src - L->DE < L->num))
583 {
584 unsigned long diff, Temp;
585
586 if (have_skipping)
587 {
588
589 /* --- Skip to next block --- */
590 if (op != op_term && i + skip_size < ft &&
591 src->DocNum > next_doc_num && next_doc_num >= 0)
592 {
593 ShortCut:
594 i = next_i;
595 DECODE_SEEK (next_start);
596 pos = next_start;
597 CurrDoc = next_doc_num;
598 qd->hops_taken++;
599 block_counter = skip_size - 1;
600 }
601
602 /* --- Decode the skip block header --- */
603 if (block_counter == skip_size - 1)
604 {
605 if (i + skip_size < ft)
606 {
607 /* --- Decode the skip block header --- */
608 int doc_diff, pos_diff;
609 BBLOCK_DECODE_L (doc_diff, dn_blk, pos);
610 next_doc_num += doc_diff;
611 next_i += skip_size;
612 BBLOCK_DECODE_L (pos_diff, len_blk, pos);
613 next_start = pos + pos_diff;
614 }
615 else
616 next_doc_num = -1;
617 }
618
619 /* --- Do a skip if c-set doc is higher than block header doc --- */
620 if (op != op_term && i + skip_size < ft &&
621 src->DocNum > next_doc_num && next_doc_num >= 0)
622 goto ShortCut;
623
624
625 } /*if skipping */
626
627 /* --- increment CurrDoc --- */
628 if (have_skipping && block_counter == 0 && i != ft - 1)
629 CurrDoc = next_doc_num;
630 else
631 {
632 BBLOCK_DECODE_L (diff, blk, pos);
633 CurrDoc += diff;
634 }
635
636 /* --- If have f_dt's then eat them up --- */
637 if (qd->id->ifh.InvfLevel >= 2)
638 GAMMA_DECODE_L (Temp, pos); /* discard word count */
639
640 /* --- Increment counters --- */
641 qd->num_of_ptrs++;
642 i++;
643 block_counter--;
644 if (block_counter < 0) /* ??? Why would this happen ? */
645 block_counter = skip_size - 1;
646
647 switch (op)
648 {
649 case op_term:
650 /* --- Add the doc from invf onto the c-set list --- */
651 L->DE[L->num].DocNum = CurrDoc;
652 L->DE[L->num].Weight = Weight;
653 L->DE[L->num].SeekPos = 0;
654 L->DE[L->num].Len = 0;
655 L->DE[L->num].CompTextBuffer = NULL;
656 L->DE[L->num].Next = NULL;
657 L->DE[L->num].or_included = 0;
658 L->num++;
659 break;
660
661 case op_and_terms:
662 case op_and_or_terms:
663 /* --- see if have a match and save it --- */
664
665 while (src < (L->DE + L->num) && CurrDoc > src->DocNum)
666 {
667 src++;
668 }
669 /* Unless at end of list, assert(CurrDoc <= src->DocNum) */
670
671 /* --- Accept the doc as a candidate --- */
672 if (src < (L->DE + L->num) && CurrDoc == src->DocNum)
673 {
674 if (op == op_and_terms)
675 {
676 *dest++ = *src;
677 }
678 else
679 {
680 src->or_included = 1;
681 }
682 src++; /* Move onto next one */
683 }
684
685 /* If the next one to try has already been tried then move on again */
686 if (op == op_and_or_terms)
687 {
688 while (src < (L->DE + L->num) && src->or_included)
689 src++;
690 }
691
692 break;
693
694 case op_diff_terms:
695 while (src < (L->DE + L->num) && src->DocNum < CurrDoc)
696 *dest++ = *src++;
697 if (src < (L->DE + L->num) && src->DocNum == CurrDoc)
698 src++;
699 break;
700
701 default:
702 break;
703 } /*switch */
704
705 } /*while traversing invf */
706
707 if (op == op_and_terms || op == op_diff_terms)
708 {
709 if (op == op_diff_terms)
710 while (src - L->DE < L->num)
711 *dest++ = *src++;
712
713 /* update size of c-set */
714 L->num = dest - L->DE;
715 }
716 DECODE_DONE
717
718 Xfree (buffer);
719 ChangeMemInUse (qd, -we->invf_len);
720
721 return (L);
722}
723
724
725
726
727
728
729
730float *
731CosineDecode (query_data * qd, TermList * Terms, RankedQueryInfo * rqi)
732{
733 float *AccumulatedWeights;
734 int j, num_docs, num_terms;
735 int seen_must_match = 0; /* [RJM 07/97: Ranked Required Terms] says whether to 'and' queries or not */
736
737 num_docs = qd->sd->sdh.num_of_docs;
738 /* Create the array for the documents */
739 if (!(AccumulatedWeights = Xmalloc (num_docs * sizeof (float))))
740 return (NULL);
741
742 bzero ((char *) AccumulatedWeights, sizeof (float) * num_docs);
743 ChangeMemInUse (qd, sizeof (float) * num_docs);
744
745 if (rqi->MaxTerms == -1)
746 num_terms = Terms->num;
747 else
748 num_terms = rqi->MaxTerms < Terms->num ? rqi->MaxTerms : Terms->num;
749
750
751 /* For each word in the list . . . */
752 for (j = 0; j < num_terms; j++)
753 {
754 u_char *buffer;
755 int i, kd;
756 unsigned long next_mjr_dn = 0;
757 unsigned long CurrDocNum = 0; /* NOTE: Document numbers start at 1 */
758 unsigned long LastDocNum = 0; /* [RJM 07/97: Ranked Required Terms] */
759 unsigned long TempDocNum = 0; /* [RJM 07/97: Ranked Required Terms] */
760 double Wqt, WordLog;
761 int dn_blk = 0, len_blk = 0, k;
762 int blk; /* bblock control parameter */
763 int p; /* The number of documents the word occurs in */
764 int require_match = Terms->TE[j].require_match; /* [RJM 07/97: Ranked Required Terms] */
765 WordEntry *we = &Terms->TE[j].WE;
766 CalcBlks (qd, we, &blk, &dn_blk, &len_blk, &k, &p);
767 if (!(buffer = Xmalloc (we->invf_len)))
768 break;
769
770 ChangeMemInUse (qd, we->invf_len);
771
772 Fseek (qd->File_invf, we->invf_ptr, 0);
773 Fread (buffer, sizeof (char), we->invf_len, qd->File_invf);
774
775 /* Read from the buffer */
776 DECODE_START (buffer, we->invf_len);
777
778 WordLog = log ((double) num_docs / (double) we->max_doc_count); /* [RPAP - Jan 97: Stem Index Change] */
779
780 Wqt = (rqi->QueryFreqs ? Terms->TE[j].Count : 1) * WordLog;
781
782 qd->num_of_terms++;
783
784 kd = 0;
785 for (i = 0; i < p; i++, kd++)
786 {
787 int Count; /* The number of times the word occurs in a document */
788 double Wdt;
789 unsigned long diff;
790 if (kd == k)
791 {
792 kd = 0;
793 }
794 if (k && kd == 0)
795 if (i + k < p)
796 {
797 int temp;
798 BBLOCK_DECODE (next_mjr_dn, dn_blk);
799 next_mjr_dn += CurrDocNum;
800 BBLOCK_DECODE (temp, len_blk);
801 }
802 if (k && kd == k - 1 && i != p - 1)
803 CurrDocNum = next_mjr_dn;
804 else
805 {
806 BBLOCK_DECODE (diff, blk);
807 CurrDocNum += diff;
808 }
809 if (qd->id->ifh.InvfLevel >= 2)
810 GAMMA_DECODE (Count);
811 else
812 Count = (double) we->count / p;
813 Wdt = Count * WordLog;
814
815 /* [RJM 07/97: Ranked Required Terms] */
816 /* zero the weights of intermediate documents if this term is required to match */
817 if (require_match == 1) {
818 for (TempDocNum = LastDocNum+1; TempDocNum < CurrDocNum; TempDocNum++)
819 AccumulatedWeights[TempDocNum - 1] = 0.0;
820 }
821
822 /* [RJM 07/97: Ranked Required Terms] */
823 /* if a term has been required to match before, the weight we are */
824 /* adding to must be non-zero */
825 if (!seen_must_match || (AccumulatedWeights[CurrDocNum - 1] > 0.0)) {
826 AccumulatedWeights[CurrDocNum - 1] += Wqt * Wdt;
827 }
828
829 qd->num_of_ptrs++;
830 LastDocNum = CurrDocNum; /* [RJM 07/97: Ranked Required Terms] */
831 }
832
833 DECODE_DONE
834
835 Xfree (buffer);
836 ChangeMemInUse (qd, -we->invf_len);
837
838 /* [RJM 07/97: Ranked Required Terms] */
839 /* zero the remaining weights if this term is required to match */
840 if (require_match == 1) {
841 for (TempDocNum = LastDocNum+1; TempDocNum <= num_docs; TempDocNum++)
842 AccumulatedWeights[TempDocNum - 1] = 0.0;
843 seen_must_match = 1;
844 }
845 }
846 return AccumulatedWeights;
847}
848
849
850
851static int
852doc_comp (void *a, void *b)
853{
854 return ((Invf_Doc_Entry *) a)->DocNum - ((Invf_Doc_Entry *) b)->DocNum;
855}
856
857
858static Invf_Doc_Entry *
859get_ide (query_data * qd, Invf_Doc_Entry_Pool * pool)
860{
861 struct pool *p = pool->pool;
862 if (!p || p->pos == p->length)
863 {
864 register unsigned size = sizeof (struct pool) +
865 (pool->pool_chunks - 1) * sizeof (Invf_Doc_Entry);
866 p = malloc (size);
867 if (!p)
868 return NULL;
869 ChangeMemInUse (qd, size);
870 p->length = pool->pool_chunks;
871 p->pos = 0;
872 p->next = pool->pool;
873 pool->pool = p;
874 }
875 return &(p->ide[p->pos++]);
876}
877
878
879void
880free_ide_pool (query_data * qd, Invf_Doc_Entry_Pool * pool)
881{
882 while (pool->pool)
883 {
884 struct pool *p = pool->pool;
885 pool->pool = p->next;
886 ChangeMemInUse (qd, -(sizeof (struct pool) + (p->length - 1) *
887 sizeof (Invf_Doc_Entry)));
888 free (p);
889 }
890 pool->pool = NULL;
891}
892
893
894FILE *
895open_skip (char *sd)
896{
897 char buf[256];
898 if (sd == NULL || *sd == '\0')
899 return NULL;
900 sprintf (buf, sd, getpid ());
901 return fopen (buf, "a");
902}
903
904
905
906Splay_Tree *
907CosineDecodeSplay (query_data * qd, TermList * Terms,
908 RankedQueryInfo * rqi, Invf_Doc_Entry_Pool * pool)
909{
910 Splay_Tree *Doc_Set;
911 int j, num_docs, num_terms, MoreAccum;
912 int Anding = 0;
913 FILE *sk = open_skip (rqi->skip_dump);
914 u_char indent = 0;
915 if (sk)
916 {
917 if (!skip_header)
918 {
919 skip_header = 1;
920 fprintf (sk, "%s", skip_header_text[qd->id->ifh.skip_mode]);
921 fprintf (sk, "\nSkipping method %ld\n", qd->id->ifh.skip_mode);
922 switch (qd->id->ifh.skip_mode)
923 {
924 case 1:
925 fprintf (sk, "Skipping is every %ld docnums\n",
926 qd->id->ifh.params[0]);
927 break;
928 case 2:
929 fprintf (sk, "Max nodes = %ld\nNo skips smaller or equal to %ld\n",
930 qd->id->ifh.params[0], qd->id->ifh.params[1]);
931 break;
932 }
933 }
934 fprintf (sk, "\n\t\t\t-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-\n\n");
935 for (j = 0; j < Terms->num; j++)
936 if (Terms->TE[j].Word[0] > indent)
937 indent = Terms->TE[j].Word[0];
938 }
939
940 num_docs = qd->sd->sdh.num_of_docs;
941 /* Create the array for the documents */
942 Doc_Set = SP_createset (doc_comp);
943
944 pool->pool = NULL;
945 pool->pool_chunks = 1024;
946
947 if (rqi->MaxTerms == -1)
948 num_terms = Terms->num;
949 else
950 num_terms = rqi->MaxTerms < Terms->num ? rqi->MaxTerms : Terms->num;
951
952 MoreAccum = 1;
953
954 /* For each word in the list . . . */
955 for (j = 0; j < num_terms; j++)
956 {
957 int Abort = 0;
958 u_char *buffer;
959 int pos, i;
960 unsigned long CurrDocNum = 0; /* NOTE: Document numbers start at 1 */
961 double Wqt, WordLog;
962 int dn_blk = 0, len_blk = 0, k, kd;
963 int blk; /* bblock control parameter */
964 int p; /* The number of documents the word occurs in */
965 WordEntry *we = &Terms->TE[j].WE;
966 Invf_Doc_Entry *next = NULL;
967 int next_i, next_doc_num, next_start;
968 int skips_taken = 0, ptrs_dec = 0;
969 double total_added = 0.0;
970 int hits = 0;
971 if (sk)
972 {
973 fprintf (sk, "%3d : %8ld %6ld \"%.*s\"%*s", j, we->count,
974 we->doc_count, Terms->TE[j].Word[0], Terms->TE[j].Word + 1,
975 indent - Terms->TE[j].Word[0], "");
976 }
977
978 CalcBlks (qd, we, &blk, &dn_blk, &len_blk, &k, &p);
979 if (!(buffer = Xmalloc (we->invf_len)))
980 break;
981
982 ChangeMemInUse (qd, we->invf_len);
983
984 if (sk && qd->id->ifh.skip_mode == 2)
985 fprintf (sk, "%4d", k);
986
987 Fseek (qd->File_invf, we->invf_ptr, 0);
988 Fread (buffer, sizeof (char), we->invf_len, qd->File_invf);
989
990 /* Read from the buffer */
991 DECODE_START (buffer, we->invf_len);
992
993 WordLog = log ((double) num_docs / (double) we->max_doc_count); /* [RPAP - Jan 97: Stem Index Change] */
994
995 Wqt = (rqi->QueryFreqs ? Terms->TE[j].Count : 1) * WordLog;
996
997 qd->num_of_terms++;
998
999 if (rqi->MaxAccums != -1 && Doc_Set->no_of_items >= rqi->MaxAccums)
1000 {
1001 MoreAccum = 0;
1002 Anding = 1;
1003 next = SP_get_first (Doc_Set);
1004 }
1005 i = pos = 0;
1006 next_i = next_start = next_doc_num = 0;
1007 kd = k - 1;
1008 while (i < p && (!Anding || next))
1009 {
1010 int Count; /* The number of times the occurs in a document */
1011 double Wdt;
1012 unsigned long diff;
1013 Invf_Doc_Entry ent, *mem;
1014 if (k)
1015 {
1016 if (Anding && i + k < p && next->DocNum > next_doc_num &&
1017 next_doc_num >= 0)
1018 {
1019 i = next_i;
1020 DECODE_SEEK (next_start);
1021 pos = next_start;
1022 CurrDocNum = next_doc_num;
1023 qd->hops_taken++;
1024 skips_taken++;
1025 kd = k - 1;
1026 }
1027 if (kd == k - 1)
1028 if (i + k < p)
1029 {
1030 int doc_diff, pos_diff;
1031 BBLOCK_DECODE_L (doc_diff, dn_blk, pos);
1032 next_doc_num += doc_diff;
1033 next_i += k;
1034 BBLOCK_DECODE_L (pos_diff, len_blk, pos);
1035 next_start = pos + pos_diff;
1036 }
1037 else
1038 next_doc_num = -1;
1039 if (Anding && i + k < p && next->DocNum > next_doc_num &&
1040 next_doc_num >= 0)
1041 continue;
1042 }
1043 if (k && kd == 0 && i != p - 1)
1044 CurrDocNum = next_doc_num;
1045 else
1046 {
1047 BBLOCK_DECODE_L (diff, blk, pos);
1048 CurrDocNum += diff;
1049 }
1050 qd->num_of_ptrs++;
1051 ptrs_dec++;
1052 if (qd->id->ifh.InvfLevel >= 2)
1053 GAMMA_DECODE_L (Count, pos);
1054 else
1055 Count = (double) we->count / p;
1056 i++;
1057 kd--;
1058 if (kd < 0)
1059 kd = k - 1;
1060 Wdt = Count * WordLog;
1061
1062
1063 if (Anding)
1064 {
1065 while (next && next->DocNum < CurrDocNum - 1)
1066 next = SP_get_next (Doc_Set);
1067 if (next && next->DocNum == CurrDocNum - 1)
1068 {
1069 next->Sum += Wqt * Wdt;
1070 total_added += Wqt * Wdt;
1071 hits++;
1072 }
1073 if (next && next->DocNum <= CurrDocNum - 1)
1074 next = SP_get_next (Doc_Set);
1075 goto NextDocNum;
1076 }
1077 ent.DocNum = CurrDocNum - 1;
1078 if ((mem = SP_member (&ent, Doc_Set)) == NULL)
1079 {
1080 hits++;
1081 if (rqi->MaxAccums != -1 &&
1082 Doc_Set->no_of_items >= rqi->MaxAccums &&
1083 !MoreAccum)
1084 goto NextDocNum;
1085 if ((mem = get_ide (qd, pool)) == NULL)
1086 return NULL;
1087 ChangeMemInUse (qd, sizeof (*mem));
1088 mem->DocNum = CurrDocNum - 1;
1089 mem->Sum = 0;
1090 ChangeMemInUse (qd, -Doc_Set->mem_in_use);
1091 if (SP_insert (mem, Doc_Set) == NULL)
1092 {
1093 Message ("Unable to allocate memory for a splay node");
1094 Abort = 1;
1095 goto FastAbort;
1096 }
1097 ChangeMemInUse (qd, Doc_Set->mem_in_use);
1098 if (rqi->MaxAccums != -1 &&
1099 Doc_Set->no_of_items >= rqi->MaxAccums &&
1100 rqi->StopAtMaxAccum)
1101 Abort = 1;
1102 }
1103
1104 mem->Sum += Wqt * Wdt;
1105 total_added += Wqt * Wdt;
1106
1107 NextDocNum:
1108 ;
1109 }
1110
1111 FastAbort:
1112
1113 DECODE_DONE
1114
1115 Xfree (buffer);
1116 ChangeMemInUse (qd, -we->invf_len);
1117 if (sk)
1118 {
1119 if (qd->id->ifh.skip_mode != 0)
1120 fprintf (sk, " %6d", skips_taken);
1121 fprintf (sk, " %6d %6d %6d%12.2f\n",
1122 ptrs_dec, hits, Doc_Set->no_of_items, total_added);
1123 }
1124 if (Abort)
1125 break;
1126 }
1127 if (sk)
1128 fclose (sk);
1129 return Doc_Set;
1130}
1131
1132
1133
1134
1135Hash_Table *
1136CosineDecodeHash (query_data * qd, TermList * Terms,
1137 RankedQueryInfo * rqi)
1138{
1139 int j, num_docs, num_terms, MoreAccum;
1140 Hash_Table *HT;
1141 int Anding = 0;
1142 int Sorted = 0;
1143 FILE *sk = open_skip (rqi->skip_dump);
1144 u_char indent = 0;
1145 if (sk)
1146 {
1147 if (!skip_header)
1148 {
1149 skip_header = 1;
1150 fprintf (sk, "%s", skip_header_text[qd->id->ifh.skip_mode]);
1151 fprintf (sk, "\nSkipping method %ld\n", qd->id->ifh.skip_mode);
1152 switch (qd->id->ifh.skip_mode)
1153 {
1154 case 1:
1155 fprintf (sk, "Skipping is every %ld docnums\n",
1156 qd->id->ifh.params[0]);
1157 break;
1158 case 2:
1159 fprintf (sk, "Max nodes = %ld\nNo skips smaller or equal to %ld\n",
1160 qd->id->ifh.params[0], qd->id->ifh.params[1]);
1161 break;
1162 }
1163 }
1164 fprintf (sk, "\n\t\t\t-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-\n\n");
1165 for (j = 0; j < Terms->num; j++)
1166 if (Terms->TE[j].Word[0] > indent)
1167 indent = Terms->TE[j].Word[0];
1168 }
1169
1170 num_docs = qd->sd->sdh.num_of_docs;
1171 /* Create the array for the documents */
1172 HT = HT_create (qd, rqi->MaxAccums, rqi->HashTblSize);
1173 if (!HT)
1174 return NULL;
1175
1176 if (rqi->MaxTerms == -1)
1177 num_terms = Terms->num;
1178 else
1179 num_terms = rqi->MaxTerms < Terms->num ? rqi->MaxTerms : Terms->num;
1180
1181 MoreAccum = 1;
1182
1183 /* For each word in the list . . . */
1184 for (j = 0; j < num_terms; j++)
1185 {
1186 int Abort = 0;
1187 u_char *buffer;
1188 int pos;
1189 int i;
1190 unsigned long CurrDocNum = 0; /* NOTE: Document numbers start at 1 */
1191 double Wqt, WordLog;
1192 int dn_blk = 0, len_blk = 0, k, kd;
1193 int blk; /* bblock control parameter */
1194 int p; /* The number of documents the word occurs in */
1195 WordEntry *we = &Terms->TE[j].WE;
1196 Invf_Doc_EntryH *next = NULL;
1197 int next_i, next_doc_num, next_start;
1198 int skips_taken = 0, ptrs_dec = 0;
1199 double total_added = 0.0;
1200 int hits = 0;
1201 if (sk)
1202 {
1203 fprintf (sk, "%3d : %8ld %6ld \"%.*s\"%*s", j, we->count,
1204 we->doc_count, Terms->TE[j].Word[0], Terms->TE[j].Word + 1,
1205 indent - Terms->TE[j].Word[0], "");
1206 }
1207
1208 CalcBlks (qd, we, &blk, &dn_blk, &len_blk, &k, &p);
1209 if (!(buffer = Xmalloc (we->invf_len)))
1210 break;
1211
1212 ChangeMemInUse (qd, we->invf_len);
1213
1214 if (sk && qd->id->ifh.skip_mode == 2)
1215 fprintf (sk, "%4d", k);
1216
1217 Fseek (qd->File_invf, we->invf_ptr, 0);
1218 Fread (buffer, sizeof (char), we->invf_len, qd->File_invf);
1219
1220 /* Read from the buffer */
1221 DECODE_START (buffer, we->invf_len);
1222
1223 WordLog = log ((double) num_docs / (double) we->max_doc_count); /* [RPAP - Jan 97: Stem Index Change] */
1224
1225 Wqt = (rqi->QueryFreqs ? Terms->TE[j].Count : 1) * WordLog;
1226
1227 qd->num_of_terms++;
1228
1229 if (rqi->MaxAccums != -1 && HT->num >= rqi->MaxAccums)
1230 {
1231 MoreAccum = 0;
1232 Anding = 1;
1233 if (!Sorted)
1234 {
1235 HT_sort (qd, HT);
1236 Sorted = 1;
1237 }
1238 next = HT->Head;
1239 }
1240 i = pos = 0;
1241 next_i = next_start = next_doc_num = 0;
1242 kd = k - 1;
1243 while (i < p && (!Anding || next))
1244 {
1245 int Count; /* The number of times the occurs in a document */
1246 double Wdt;
1247 unsigned long diff;
1248 Invf_Doc_Entry *mem;
1249 if (k)
1250 {
1251 if (Anding && i + k < p && next->IDE.DocNum > next_doc_num &&
1252 next_doc_num >= 0)
1253 {
1254#ifdef SHORT_CUT
1255 ShortCut:
1256#endif
1257 i = next_i;
1258 DECODE_SEEK (next_start);
1259 pos = next_start;
1260 CurrDocNum = next_doc_num;
1261 qd->hops_taken++;
1262 skips_taken++;
1263 kd = k - 1;
1264 }
1265 if (kd == k - 1)
1266 {
1267 if (i + k < p)
1268 {
1269 int doc_diff, pos_diff;
1270 BBLOCK_DECODE_L (doc_diff, dn_blk, pos);
1271 next_doc_num += doc_diff;
1272 next_i += k;
1273 BBLOCK_DECODE_L (pos_diff, len_blk, pos);
1274 next_start = pos + pos_diff;
1275 }
1276 else
1277 next_doc_num = -1;
1278 }
1279#ifdef SHORT_CUT
1280 if (Anding && i + k < p && next->IDE.DocNum > next_doc_num &&
1281 next_doc_num >= 0)
1282 goto ShortCut;
1283#else
1284 if (Anding && i + k < p && next->IDE.DocNum > next_doc_num &&
1285 next_doc_num >= 0)
1286 continue;
1287#endif
1288 }
1289 if (k && kd == 0 && i != p - 1)
1290 {
1291 CurrDocNum = next_doc_num;
1292 }
1293 else
1294 {
1295 BBLOCK_DECODE_L (diff, blk, pos);
1296 CurrDocNum += diff;
1297 }
1298 qd->num_of_ptrs++;
1299 ptrs_dec++;
1300 if (qd->id->ifh.InvfLevel >= 2)
1301 GAMMA_DECODE_L (Count, pos);
1302 else
1303 Count = (double) we->count / p;
1304 i++;
1305 kd--;
1306 if (kd < 0)
1307 kd = k - 1;
1308
1309
1310 if (Anding)
1311 {
1312 while (next && next->IDE.DocNum < CurrDocNum - 1)
1313 next = next->next;
1314 if (next && next->IDE.DocNum == CurrDocNum - 1)
1315 {
1316 Wdt = Count * WordLog;
1317 next->IDE.Sum += Wqt * Wdt;
1318 total_added += Wqt * Wdt;
1319 hits++;
1320 }
1321 if (next && next->IDE.DocNum <= CurrDocNum - 1)
1322 next = next->next;
1323 goto NextDocNum;
1324 }
1325 if ((mem = HT_find (HT, CurrDocNum - 1)) == NULL)
1326 {
1327 if (rqi->MaxAccums != -1 && HT->num >= rqi->MaxAccums &&
1328 !MoreAccum)
1329 goto NextDocNum;
1330 Wdt = Count * WordLog;
1331 if (!HT_insert (HT, CurrDocNum - 1, Wqt * Wdt))
1332 {
1333 if (!HT->Suplimentary_Entries)
1334 HT_Sup_create (qd, HT, we->doc_count - i + 1);
1335 HT_Sup_Add (HT, CurrDocNum - 1, Wqt * Wdt);
1336 }
1337 if (rqi->MaxAccums != -1 && HT->num >= rqi->MaxAccums &&
1338 rqi->StopAtMaxAccum)
1339 Abort = 1;
1340 }
1341 else
1342 {
1343 Wdt = Count * WordLog;
1344 mem->Sum += Wqt * Wdt;
1345 total_added += Wqt * Wdt;
1346 hits++;
1347 }
1348
1349 NextDocNum:
1350 ;
1351 }
1352
1353 DECODE_DONE
1354
1355 Xfree (buffer);
1356 ChangeMemInUse (qd, -we->invf_len);
1357 if (sk)
1358 {
1359 if (qd->id->ifh.skip_mode != 0)
1360 fprintf (sk, " %6d", skips_taken);
1361 fprintf (sk, " %6d %6d %6d%12.2f\n",
1362 ptrs_dec, hits, HT->num + HT->Suplimentary_Num, total_added);
1363 }
1364 if (Abort)
1365 break;
1366 }
1367 if (sk)
1368 fclose (sk);
1369 if (!HT->Head)
1370 HT_sort (qd, HT);
1371 return HT;
1372}
1373
1374
1375
1376List_Table *
1377CosineDecodeList (query_data * qd, TermList * Terms,
1378 RankedQueryInfo * rqi)
1379{
1380 int j, num_docs, num_terms, MoreAccum;
1381 List_Table *LT;
1382 int Anding = 0;
1383 int Sorted = 0;
1384 int has_grown = 0; /* [RJM 07/97: Ranked Required Match] whether elements have been added lately */
1385 FILE *sk = open_skip (rqi->skip_dump);
1386 u_char indent = 0;
1387 if (sk)
1388 {
1389 if (!skip_header)
1390 {
1391 skip_header = 1;
1392 fprintf (sk, "%s", skip_header_text[qd->id->ifh.skip_mode]);
1393 fprintf (sk, "\nSkipping method %ld\n", qd->id->ifh.skip_mode);
1394 switch (qd->id->ifh.skip_mode)
1395 {
1396 case 1:
1397 fprintf (sk, "Skipping is every %ld docnums\n",
1398 qd->id->ifh.params[0]);
1399 break;
1400 case 2:
1401 fprintf (sk, "Max nodes = %ld\nNo skips smaller or equal to %ld\n",
1402 qd->id->ifh.params[0], qd->id->ifh.params[1]);
1403 break;
1404 }
1405 }
1406 fprintf (sk, "\n\t\t\t-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-\n\n");
1407 for (j = 0; j < Terms->num; j++)
1408 if (Terms->TE[j].Word[0] > indent)
1409 indent = Terms->TE[j].Word[0];
1410 }
1411
1412 num_docs = qd->sd->sdh.num_of_docs;
1413
1414 /* Create the list for the documents */
1415 LT = LT_create (qd, rqi->MaxAccums);
1416 if (!LT)
1417 return NULL;
1418
1419 if (rqi->MaxTerms == -1)
1420 num_terms = Terms->num;
1421 else
1422 num_terms = rqi->MaxTerms < Terms->num ? rqi->MaxTerms : Terms->num;
1423
1424 MoreAccum = 1;
1425
1426 /* For each word in the list . . . */
1427 for (j = 0; j < num_terms; j++)
1428 {
1429 int Abort = 0;
1430 u_char *buffer;
1431 int pos;
1432 int i, n = 0;
1433 unsigned long CurrDocNum = 0; /* NOTE: Document numbers start at 1 */
1434 unsigned long LastDocNum = 0; /* [RJM 07/97: Ranked Required Match] */
1435 int this_item = 0; /* [RJM 07/97: Ranked Required Match] points to the next possible LT to delete */
1436 double Wqt, WordLog;
1437 int dn_blk = 0, len_blk = 0, k, kd;
1438 int blk; /* bblock control parameter */
1439 int p; /* The number of documents the word occurs in */
1440 int require_match = Terms->TE[j].require_match; /* [RJM 07/97: Ranked Required Match] */
1441 WordEntry *we = &Terms->TE[j].WE;
1442 Invf_Doc_Entry *next = NULL;
1443 int next_i, next_doc_num, next_start;
1444 int skips_taken = 0, ptrs_dec = 0;
1445 double total_added = 0.0;
1446 int hits = 0;
1447
1448 if (sk)
1449 {
1450 fprintf (sk, "%3d : %8ld %6ld \"%.*s\"%*s", j, we->count,
1451 we->doc_count, Terms->TE[j].Word[0], Terms->TE[j].Word + 1,
1452 indent - Terms->TE[j].Word[0], "");
1453 }
1454
1455 CalcBlks (qd, we, &blk, &dn_blk, &len_blk, &k, &p);
1456 if (!(buffer = Xmalloc (we->invf_len)))
1457 break;
1458
1459 ChangeMemInUse (qd, we->invf_len);
1460
1461 if (sk && qd->id->ifh.skip_mode == 2)
1462 fprintf (sk, "%4d", k);
1463
1464 Fseek (qd->File_invf, we->invf_ptr, 0);
1465 Fread (buffer, sizeof (char), we->invf_len, qd->File_invf);
1466
1467 /* Read from the buffer */
1468 DECODE_START (buffer, we->invf_len);
1469
1470 WordLog = log ((double) num_docs / (double) we->max_doc_count); /* [RPAP - Jan 97: Stem Index Change] */
1471
1472 Wqt = (rqi->QueryFreqs ? Terms->TE[j].Count : 1) * WordLog;
1473
1474 qd->num_of_terms++;
1475
1476 if (rqi->MaxAccums != -1 && LT->num >= rqi->MaxAccums)
1477 MoreAccum = 0;
1478
1479 if ((rqi->MaxAccums != -1 && LT->num >= rqi->MaxAccums) || Anding)
1480 {
1481 Anding = 1;
1482 if (!Sorted || has_grown)
1483 {
1484 LT_sort (LT);
1485 LT_pack (LT);
1486 Sorted = 1;
1487 has_grown = 0;
1488 }
1489 next = LT->IDE;
1490 n = 0;
1491 }
1492
1493 /* [RJM 07/97: Ranked Required Terms] */
1494 /* make sure that the document list is set up correctly for */
1495 /* requiring a term to match */
1496 if (require_match == 1) {
1497 if (!Sorted || has_grown) {
1498 LT_sort (LT);
1499 LT_pack (LT);
1500 Sorted = 1;
1501 has_grown = 0;
1502 }
1503 /* this_item is already set to zero */
1504 }
1505
1506 i = pos = 0;
1507 next_i = next_start = next_doc_num = 0;
1508 kd = k - 1;
1509 while (i < p && (!Anding || n < LT->num))
1510 {
1511 int Count; /* The number of times the occurs in a document */
1512 double Wdt;
1513 unsigned long diff;
1514 Invf_Doc_Entry *mem;
1515
1516 /* deal with skipping */
1517 if (k)
1518 {
1519 /* see if we should take a skip */
1520 if (Anding && i + k < p && next->DocNum > next_doc_num &&
1521 next_doc_num >= 0)
1522 {
1523 i = next_i;
1524 DECODE_SEEK (next_start);
1525 pos = next_start;
1526 CurrDocNum = next_doc_num;
1527 qd->hops_taken++;
1528 skips_taken++;
1529 kd = k - 1;
1530 }
1531 if (kd == k - 1)
1532 if (i + k < p)
1533 {
1534 int doc_diff, pos_diff;
1535 BBLOCK_DECODE_L (doc_diff, dn_blk, pos);
1536 next_doc_num += doc_diff;
1537 next_i += k;
1538 BBLOCK_DECODE_L (pos_diff, len_blk, pos);
1539 next_start = pos + pos_diff;
1540 }
1541 else
1542 next_doc_num = -1;
1543
1544 /* should we take another skip? */
1545 if (Anding && i + k < p && next->DocNum > next_doc_num &&
1546 next_doc_num >= 0)
1547 continue;
1548 }
1549 if (k && kd == 0 && i != p - 1)
1550 CurrDocNum = next_doc_num;
1551 else
1552 {
1553 BBLOCK_DECODE_L (diff, blk, pos);
1554 CurrDocNum += diff;
1555 }
1556 qd->num_of_ptrs++;
1557 ptrs_dec++;
1558 if (qd->id->ifh.InvfLevel >= 2)
1559 GAMMA_DECODE_L (Count, pos);
1560 else
1561 Count = (double) we->count / p;
1562 i++;
1563 kd--;
1564 if (kd < 0)
1565 kd = k - 1;
1566 Wdt = Count * WordLog;
1567
1568 /* [RJM 07/97: Ranked Required Terms] */
1569 /* if this is a required match we must remove all accumulators */
1570 /* of intermediate documents */
1571 if (require_match == 1) {
1572 while (this_item < LT->num && LT->IDE[this_item].DocNum < CurrDocNum - 1) {
1573 if (LastDocNum==0 ||(LT->IDE[this_item].DocNum > LastDocNum - 1))
1574 LT->IDE[this_item].Sum = -1000.0; /* mark for deletion */
1575 this_item++;
1576 }
1577 }
1578
1579 if (Anding)
1580 {
1581 while (n < LT->num && next->DocNum < CurrDocNum - 1)
1582 {
1583 next++;
1584 n++;
1585 }
1586 if (n < LT->num && next->DocNum == CurrDocNum - 1)
1587 {
1588 next->Sum += Wqt * Wdt;
1589 total_added += Wqt * Wdt;
1590 hits++;
1591 }
1592 if (n < LT->num && next->DocNum <= CurrDocNum - 1)
1593 {
1594 next++;
1595 n++;
1596 }
1597 goto NextDocNum;
1598 }
1599 if (!Anding)
1600 {
1601 if (rqi->MaxAccums != -1 && LT->num >= rqi->MaxAccums &&
1602 !MoreAccum)
1603 goto NextDocNum;
1604 LT = LT_add (qd, LT, CurrDocNum - 1, Wqt * Wdt);
1605 if (!LT)
1606 {
1607 Abort = 1;
1608 goto FastAbort;
1609 }
1610
1611 /* the list of accumulators just got bigger */
1612 has_grown = 1;
1613
1614 if (rqi->MaxAccums != -1 && LT->num >= rqi->MaxAccums &&
1615 rqi->StopAtMaxAccum)
1616 Abort = 1;
1617
1618
1619 }
1620 else if ((mem = LT_find (LT, CurrDocNum - 1)))
1621 {
1622 mem->Sum += Wqt * Wdt;
1623 total_added += Wqt * Wdt;
1624 hits++;
1625 }
1626
1627 NextDocNum:
1628
1629 LastDocNum = CurrDocNum;
1630 }
1631
1632 FastAbort:
1633
1634 DECODE_DONE
1635
1636 Xfree (buffer);
1637 ChangeMemInUse (qd, -we->invf_len);
1638 if (sk)
1639 {
1640 if (qd->id->ifh.skip_mode != 0)
1641 fprintf (sk, " %6d", skips_taken);
1642 fprintf (sk, " %6d %6d %6d%12.2f\n",
1643 ptrs_dec, hits, LT->num, total_added);
1644 }
1645 if (Abort)
1646 break;
1647
1648 /* [RJM 07/97: Ranked Required Terms] */
1649 /* remove the remaining accumulators if this term is required to match */
1650 if (require_match == 1) {
1651 while (this_item < LT->num) {
1652 if (LT->IDE[this_item].DocNum > LastDocNum - 1)
1653 LT->IDE[this_item].Sum = -1000.0; /* mark for deletion */
1654 this_item++;
1655 }
1656 Anding = 1; /* no more documents should be added */
1657
1658 /* Get rid of any documents which have been deleted */
1659 LT_pack (LT);
1660 }
1661 }
1662
1663 if (sk)
1664 fclose (sk);
1665 return LT;
1666}
Note: See TracBrowser for help on using the repository browser.