source: main/trunk/greenstone2/common-src/indexers/mg/src/text/invf_get.c@ 25147

Last change on this file since 25147 was 25147, checked in by kjdon, 12 years ago

merged 64_bit_Greenstone branch into trunk, rev 25139

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 39.7 KB
Line 
1/**************************************************************************
2 *
3 * invf_get.c -- Inverted file reading routines
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: invf_get.c 25147 2012-02-28 00:59:00Z kjdon $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25
26#include "memlib.h"
27#include "filestats.h"
28#include "timing.h"
29#include "bitio_gen.h"
30#include "bitio_m_mem.h"
31#include "bitio_m.h"
32#include "sptree.h"
33#include "messages.h"
34#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
35
36#include "mg.h"
37#include "invf.h"
38#include "text.h"
39#include "lists.h"
40#include "backend.h"
41#include "invf_get.h"
42#include "mg_errors.h"
43
44#define SKIPPING
45#define SHORT_CUT
46
47struct pool
48 {
49 int length, pos;
50 struct pool *next;
51 Invf_Doc_Entry ide[1];
52 };
53
54static int skip_header = 0;
55
56static char skip_header_text0[] =
57"#######################################################################\n"
58"#\n"
59"# 1\tWord number.\n"
60"# 2\tNumber of time the word occurs in the collection.\n"
61"# 3\tNumber of documents the word occurs in.\n"
62"# 4\tThe word.\n"
63"# 5\tNumber of docnum/word_count entries decoded.\n"
64"# 6\tNumber of hits.\n"
65"# 7\tNumber of entries in the splay tree or hash table.\n"
66"# 8\tAmount added to the accumulators while while processing this word.\n"
67"#\n"
68"#######################################################################\n";
69
70static char skip_header_text1[] =
71"#######################################################################\n"
72"#\n"
73"# 1\tWord number.\n"
74"# 2\tNumber of time the word occurs in the collection.\n"
75"# 3\tNumber of documents the word occurs in.\n"
76"# 4\tThe word.\n"
77"# 5\tNumber of skips taken in evaluating the inverted file entry.\n"
78"# 6\tNumber of docnum/word_count entries decoded.\n"
79"# 7\tNumber of hits.\n"
80"# 8\tNumber of entries in the splay tree or hash table.\n"
81"# 9\tAmount added to the accumulators while while processing this word.\n"
82"#\n"
83"#######################################################################\n";
84
85
86static char skip_header_text2[] =
87"#######################################################################\n"
88"#\n"
89"# 1\tWord number.\n"
90"# 2\tNumber of time the word occurs in the collection.\n"
91"# 3\tNumber of documents the word occurs in.\n"
92"# 4\tThe word.\n"
93"# 5\tSkip size.\n"
94"# 6\tNumber of skips taken in evaluating the inverted file entry.\n"
95"# 7\tNumber of docnum/word_count entries decoded.\n"
96"# 8\tNumber of hits.\n"
97"# 9\tNumber of entries in the splay tree or hash table.\n"
98"# 10\tAmount added to the accumulators while while processing this word.\n"
99"#\n"
100"#######################################################################\n";
101
102static char *skip_header_text[] =
103{skip_header_text0,
104 skip_header_text1,
105 skip_header_text2};
106
107
108
109
110
111
112
113
114/**************************************************************************
115 *
116 * Code for the hash table.
117 *
118 */
119
120static Hash_Table *
121HT_create (query_data * qd, unsigned max_entries,
122 unsigned max_size)
123{
124 unsigned es, hts;
125 Hash_Table *HT;
126 if (max_entries == 0)
127 max_entries = 8192;
128 es = sizeof (Hash_Table) + sizeof (Invf_Doc_EntryH) * (max_entries - 1);
129 hts = sizeof (Invf_Doc_EntryH *) * max_size;
130
131 HT = Xmalloc (es);
132 if (!HT)
133 return NULL;
134
135 HT->HashTable = Xmalloc (hts);
136 if (!(HT->HashTable))
137 {
138 Xfree (HT);
139 return NULL;
140 }
141
142 ChangeMemInUse (qd, es + hts);
143 bzero ((char *) (HT->HashTable), hts);
144 HT->max = max_entries;
145 HT->num = 0;
146 HT->Suplimentary_Entries = NULL;
147 HT->Suplimentary_Size = 0;
148 HT->Suplimentary_Num = 0;
149 HT->Head = NULL;
150 HT->size = max_size;
151 return HT;
152}
153
154
155static void
156HT_Sup_create (query_data * qd, Hash_Table * HT, unsigned max_size)
157{
158 int es = sizeof (Invf_Doc_EntryH) * max_size;
159 HT->Suplimentary_Entries = Xmalloc (es);
160 if (!HT->Suplimentary_Entries)
161 return;
162 HT->Suplimentary_Size = max_size;
163 HT->Suplimentary_Num = 0;
164 ChangeMemInUse (qd, es);
165}
166
167static void
168HT_Sup_Add (Hash_Table * HT, mg_u_long DN, float Sum)
169{
170 Invf_Doc_EntryH *IDEH;
171 IDEH = &HT->Suplimentary_Entries[HT->Suplimentary_Num++];
172 IDEH->next = NULL;
173 IDEH->IDE.DocNum = DN;
174 IDEH->IDE.Sum = Sum;
175}
176
177Invf_Doc_Entry *
178HT_find (Hash_Table * HT, mg_u_long DN)
179{
180 Invf_Doc_EntryH *IDEH;
181 if (!HT->HashTable)
182 FatalError (1, "Accessing the hash_table through HT_find after HT_sort");
183 IDEH = HT->HashTable[DN % HT->size];
184 while (IDEH)
185 {
186 if (IDEH->IDE.DocNum == DN)
187 return &(IDEH->IDE);
188 IDEH = IDEH->next;
189 }
190 return NULL;
191}
192
193
194static int
195HT_insert (Hash_Table * HT,
196 mg_u_long DN, float Sum)
197{
198 Invf_Doc_EntryH **IDEH;
199 if (HT->num == HT->max)
200 return 0;
201 IDEH = &(HT->HashTable[DN % HT->size]);
202 while (*IDEH)
203 IDEH = &((*IDEH)->next);
204 *IDEH = &(HT->IDEH[HT->num++]);
205 (*IDEH)->next = NULL;
206 (*IDEH)->IDE.DocNum = DN;
207 (*IDEH)->IDE.Sum = Sum;
208 return 1;
209}
210
211
212void
213HT_free (query_data * qd, Hash_Table * HT)
214{
215 unsigned es;
216 es = sizeof (Hash_Table) + sizeof (Invf_Doc_EntryH) * HT->max +
217 sizeof (Invf_Doc_EntryH *) * HT->size;
218 if (HT->HashTable)
219 Xfree (HT->HashTable);
220 if (HT->Suplimentary_Entries)
221 {
222 Xfree (HT->Suplimentary_Entries);
223 es += sizeof (Invf_Doc_EntryH) * HT->Suplimentary_Size;
224 }
225 Xfree (HT);
226 ChangeMemInUse (qd, -es);
227}
228
229
230static int
231IDEH_comp (const void *a, const void *b)
232{
233 const Invf_Doc_EntryH *A = a;
234 const Invf_Doc_EntryH *B = b;
235 return A->IDE.DocNum - B->IDE.DocNum;
236}
237
238static void
239HT_sort (query_data * qd, Hash_Table * HT)
240{
241 int i;
242 Invf_Doc_EntryH *l, *r, **b;
243 unsigned es = sizeof (Invf_Doc_EntryH *) * HT->size;
244 if (HT->HashTable)
245 Xfree (HT->HashTable);
246 HT->size = 0;
247 HT->HashTable = NULL;
248 ChangeMemInUse (qd, -es);
249
250 qsort (HT->IDEH, HT->num, sizeof (Invf_Doc_EntryH), IDEH_comp);
251
252 for (i = 0; i < HT->num - 1; i++)
253 HT->IDEH[i].next = &HT->IDEH[i + 1];
254 HT->IDEH[i].next = NULL;
255 l = HT->IDEH;
256
257 if (HT->Suplimentary_Entries)
258 {
259 for (i = 0; i < HT->Suplimentary_Num - 1; i++)
260 HT->Suplimentary_Entries[i].next = &HT->Suplimentary_Entries[i + 1];
261 HT->Suplimentary_Entries[i].next = NULL;
262 }
263 r = HT->Suplimentary_Entries;
264
265 b = &HT->Head;
266
267 while (l || r)
268 {
269 if (r && (!l || (l->IDE.DocNum > r->IDE.DocNum)))
270 {
271 *b = r;
272 b = &r->next;
273 r = *b;
274 }
275 else
276 {
277 *b = l;
278 b = &l->next;
279 l = *b;
280 }
281 }
282 *b = NULL;
283}
284
285
286
287
288
289
290/**************************************************************************
291 *
292 * Code for the hash table.
293 *
294 */
295
296static List_Table *
297LT_create (query_data * qd, unsigned max_entries)
298{
299 unsigned es;
300 List_Table *LT;
301 if (max_entries == 0)
302 max_entries = 8192;
303 es = sizeof (List_Table) + sizeof (Invf_Doc_Entry) * (max_entries - 1);
304
305 LT = Xmalloc (es);
306 if (!LT)
307 return NULL;
308
309 ChangeMemInUse (qd, es);
310 LT->max = max_entries;
311 LT->num = 0;
312 return LT;
313}
314
315
316
317Invf_Doc_Entry *
318LT_find (List_Table * LT, mg_u_long DN)
319{
320 int l = 0;
321 int r = LT->num - 1;
322 int m, c;
323 while (l <= r)
324 {
325 m = (l + r) / 2;
326 c = DN - LT->IDE[m].DocNum;
327 if (c < 0)
328 r = m - 1;
329 else if (c > 0)
330 l = m + 1;
331 else
332 return &LT->IDE[m];
333 }
334 return NULL;
335}
336
337static List_Table *
338LT_add (query_data * qd, List_Table * LT,
339 mg_u_long DN, float Sum)
340{
341 Invf_Doc_Entry *ide;
342
343 /* increase the size of the list if there is not enough */
344 /* room for this entry */
345 if (LT->num == LT->max)
346 {
347 int old, new, max_entries;
348 old = sizeof (List_Table) + sizeof (Invf_Doc_Entry) * LT->max;
349 max_entries = LT->max + (LT->max >> 1);
350 new = sizeof (List_Table) + sizeof (Invf_Doc_Entry) * max_entries;
351
352 LT = Xrealloc (LT, new);
353 if (!LT)
354 return NULL;
355
356 ChangeMemInUse (qd, new - old);
357 LT->max = max_entries;
358 }
359
360 /* add the entry to the end of the list */
361 ide = &LT->IDE[LT->num];
362 ide->DocNum = DN;
363 ide->Sum = Sum;
364 LT->num++;
365
366 return LT;
367}
368
369
370void
371LT_free (query_data * qd, List_Table * LT)
372{
373 unsigned es;
374 es = sizeof (List_Table) + sizeof (Invf_Doc_Entry) * LT->max;
375 Xfree (LT);
376 ChangeMemInUse (qd, -es);
377}
378
379
380static int
381IDE_comp (const void *a, const void *b)
382{
383 const Invf_Doc_Entry *A = a;
384 const Invf_Doc_Entry *B = b;
385 return A->DocNum - B->DocNum;
386}
387
388static void
389LT_sort (List_Table * LT)
390{
391 qsort (LT->IDE, LT->num, sizeof (Invf_Doc_Entry), IDE_comp);
392}
393
394
395/* [RJM 07/97: Ranked Required Terms] */
396/* accumulators with a sum less than or equal to 0.0 are to be deleted */
397static void
398LT_pack (List_Table * LT)
399{
400 int s, d = 0;
401 for (s = 0; s < LT->num; s++) {
402 if (LT->IDE[s].Sum <= 0.0) {
403 /* ignore this accumulator */
404
405 } else if (d > 0 && LT->IDE[s].DocNum == LT->IDE[d-1].DocNum) {
406 LT->IDE[d-1].Sum += LT->IDE[s].Sum;
407
408 } else {
409 if (d != s) LT->IDE[d] = LT->IDE[s];
410 d++;
411 }
412 }
413
414 if (LT->num) LT->num = d;
415}
416
417
418/**************************************************************************/
419
420
421
422invf_data *
423InitInvfFile (File * InvfFile, stemmed_dict * sd)
424{
425 invf_data *id;
426 if (!(id = Xmalloc (sizeof (invf_data))))
427 {
428 mg_errno = MG_NOMEM;
429 return NULL;
430 }
431
432 id->InvfFile = InvfFile;
433
434 Fread (&id->ifh, sizeof (id->ifh), 1, InvfFile);
435 /* [RPAP - Jan 97: Endian Ordering] */
436 {
437 int i;
438
439 NTOHUL(id->ifh.no_of_words);
440 NTOHUL(id->ifh.no_of_ptrs);
441 NTOHUL(id->ifh.skip_mode);
442 for (i = 0; i < 16; i++)
443 NTOHUL(id->ifh.params[i]);
444 NTOHUL(id->ifh.InvfLevel);
445 }
446
447 id->N = sd->sdh.num_of_docs;
448 id->Nstatic = sd->sdh.static_num_of_docs;
449
450 return (id);
451}
452
453
454void
455FreeInvfData (invf_data * id)
456{
457 Xfree (id);
458}
459
460/****************************************************************************/
461
462
463static void
464CalcBlks (query_data * qd, WordEntry * we, int *blk,
465 int *dn_blk, int *len_blk, int *k, int *p)
466{
467 *p = we->doc_count;
468
469 *blk = BIO_Bblock_Init (qd->id->Nstatic, *p);
470 switch (qd->id->ifh.skip_mode)
471 {
472 case 1:
473 {
474 mg_u_long len;
475 if (*p <= qd->id->ifh.params[0])
476 *k = 0;
477 else
478 {
479 *k = qd->id->ifh.params[0];
480 len = BIO_Bblock_Bound (qd->id->N, *p);
481 if (qd->id->ifh.InvfLevel >= 2)
482 len += we->count;
483 *dn_blk = BIO_Bblock_Init (qd->sd->sdh.num_of_docs, (*p + *k - 1) / *k);
484 *len_blk = BIO_Bblock_Init (len, (*p + *k - 1) / *k);
485 }
486 break;
487 }
488 case 2:
489 {
490 mg_u_long len;
491 if (*p <= qd->id->ifh.params[1])
492 *k = 0;
493 else
494 {
495 *k = (int) (2 * sqrt (((double) (*p)) / qd->id->ifh.params[0]));
496 if (*k <= qd->id->ifh.params[1])
497 *k = qd->id->ifh.params[1];
498 len = BIO_Bblock_Bound (qd->id->N, *p);
499 if (qd->id->ifh.InvfLevel >= 2)
500 len += we->count;
501 *dn_blk = BIO_Bblock_Init (qd->sd->sdh.num_of_docs,
502 (*p + *k - 1) / *k);
503 *len_blk = BIO_Bblock_Init (len, (*p + *k - 1) / *k);
504 }
505 break;
506 }
507 default:
508 *k = 0;
509 }
510}
511
512
513/* =========================================================================
514 * Function: GetDocsOp
515 * Description:
516 * if op == op_term then
517 * look up term in invf and decode all its entries/documents
518 * if op == op_and_terms then
519 * go thru L, testing if it exists in invf using skips
520 * if it does exist then overwrite the entry nearer the start of the list
521 * if op == op_and_or_terms then
522 * go thru L, testing if it exists in invf using skips
523 * BUT only mark ones which are found - keep list intact
524 * if op == op_diff_terms then >>>> NOT tested <<<<
525 * Input:
526 * Output:
527 * ========================================================================= */
528
529DocList *
530GetDocsOp (query_data * qd, WordEntry * we, Op_types op, DocList * L)
531{
532 u_char *buffer;
533 DocEntry *src = NULL, *dest = NULL;
534 int i; /* index counter into inverted list */
535 mg_u_long pos;
536 int next_start, next_doc_num, next_i;
537 int block_counter; /* within counter in block */
538 int dn_blk = 0, len_blk = 0;
539 int skip_size, have_skipping;
540 int blk; /* bblock control parameter */
541 int ft; /* The number of documents the word occurs in */
542 mg_u_long CurrDoc = 0; /* NOTE: Document numbers start at 1 */
543 float Weight;
544
545 /* ... AND empty */
546 if (op == op_and_terms && (!L || !L->num))
547 {
548 if (L)
549 Xfree (L);
550 return MakeDocList (0);
551 }
552
553 if (op == op_term)
554 L = MakeDocList (we->doc_count);
555
556 CalcBlks (qd, we, &blk, &dn_blk, &len_blk, &skip_size, &ft);
557 have_skipping = (skip_size != 0);
558 if (!(buffer = Xmalloc (we->invf_len)))
559 return MakeDocList (0);
560
561 ChangeMemInUse (qd, we->invf_len);
562
563 qd->num_of_terms++;
564
565 /* --- Read an inverted file entry for the given word --- */
566 Fseek (qd->File_invf, we->invf_ptr, 0);
567 Fread (buffer, sizeof (char), we->invf_len, qd->File_invf);
568
569 /* Read from the buffer */
570 DECODE_START (buffer, we->invf_len);
571
572 Weight = qd->sd->sdh.num_of_words / (100 * log2 ((float) we->count + 1));
573
574 if (op != op_term)
575 dest = src = L->DE;
576 i = pos = 0;
577 next_i = next_start = next_doc_num = 0;
578 block_counter = skip_size - 1;
579
580 /* Go thru the elements of an inverted list for a term */
581 /* May not go sequentially if skipping is done */
582 while (i < ft && ((op == op_term) || src - L->DE < L->num))
583 {
584 mg_u_long diff, Temp;
585
586 if (have_skipping)
587 {
588
589 /* --- Skip to next block --- */
590 if (op != op_term && i + skip_size < ft &&
591 src->DocNum > next_doc_num && next_doc_num >= 0)
592 {
593 ShortCut:
594 i = next_i;
595 DECODE_SEEK (next_start);
596 pos = next_start;
597 CurrDoc = next_doc_num;
598 qd->hops_taken++;
599 block_counter = skip_size - 1;
600 }
601
602 /* --- Decode the skip block header --- */
603 if (block_counter == skip_size - 1)
604 {
605 if (i + skip_size < ft)
606 {
607 /* --- Decode the skip block header --- */
608 int doc_diff, pos_diff;
609 BBLOCK_DECODE_L (doc_diff, dn_blk, pos);
610 next_doc_num += doc_diff;
611 next_i += skip_size;
612 BBLOCK_DECODE_L (pos_diff, len_blk, pos);
613 next_start = pos + pos_diff;
614 }
615 else
616 next_doc_num = -1;
617 }
618
619 /* --- Do a skip if c-set doc is higher than block header doc --- */
620 if (op != op_term && i + skip_size < ft &&
621 src->DocNum > next_doc_num && next_doc_num >= 0)
622 goto ShortCut;
623
624
625 } /*if skipping */
626
627 /* --- increment CurrDoc --- */
628 if (have_skipping && block_counter == 0 && i != ft - 1)
629 CurrDoc = next_doc_num;
630 else
631 {
632 BBLOCK_DECODE_L (diff, blk, pos);
633 CurrDoc += diff;
634 }
635
636 /* --- If have f_dt's then eat them up --- */
637 if (qd->id->ifh.InvfLevel >= 2)
638 GAMMA_DECODE_L (Temp, pos); /* discard word count */
639
640 /* --- Increment counters --- */
641 qd->num_of_ptrs++;
642 i++;
643 block_counter--;
644 if (block_counter < 0) /* ??? Why would this happen ? */
645 block_counter = skip_size - 1;
646
647 switch (op)
648 {
649 case op_term:
650 /* --- Add the doc from invf onto the c-set list --- */
651 L->DE[L->num].DocNum = CurrDoc;
652 L->DE[L->num].Weight = Weight;
653 L->DE[L->num].SeekPos = 0;
654 L->DE[L->num].Len = 0;
655 L->DE[L->num].CompTextBuffer = NULL;
656 L->DE[L->num].Next = NULL;
657 L->DE[L->num].or_included = 0;
658 L->num++;
659 break;
660
661 case op_and_terms:
662 case op_and_or_terms:
663 /* --- see if have a match and save it --- */
664
665 while (src < (L->DE + L->num) && CurrDoc > src->DocNum)
666 {
667 src++;
668 }
669 /* Unless at end of list, assert(CurrDoc <= src->DocNum) */
670
671 /* --- Accept the doc as a candidate --- */
672 if (src < (L->DE + L->num) && CurrDoc == src->DocNum)
673 {
674 if (op == op_and_terms)
675 {
676 *dest++ = *src;
677 }
678 else
679 {
680 src->or_included = 1;
681 }
682 src++; /* Move onto next one */
683 }
684
685 /* If the next one to try has already been tried then move on again */
686 if (op == op_and_or_terms)
687 {
688 while (src < (L->DE + L->num) && src->or_included)
689 src++;
690 }
691
692 break;
693
694 case op_diff_terms:
695 while (src < (L->DE + L->num) && src->DocNum < CurrDoc)
696 *dest++ = *src++;
697 if (src < (L->DE + L->num) && src->DocNum == CurrDoc)
698 src++;
699 break;
700
701 default:
702 break;
703 } /*switch */
704
705 } /*while traversing invf */
706
707 if (op == op_and_terms || op == op_diff_terms)
708 {
709 if (op == op_diff_terms)
710 while (src - L->DE < L->num)
711 *dest++ = *src++;
712
713 /* update size of c-set */
714 L->num = dest - L->DE;
715 }
716 DECODE_DONE
717
718 Xfree (buffer);
719 ChangeMemInUse (qd, -we->invf_len);
720
721 return (L);
722}
723
724
725
726
727
728
729
730float *
731CosineDecode (query_data * qd, TermList * Terms, RankedQueryInfo * rqi)
732{
733 float *AccumulatedWeights;
734 int j, num_docs, num_terms;
735 int seen_must_match = 0; /* [RJM 07/97: Ranked Required Terms] says whether to 'and' queries or not */
736
737 num_docs = qd->sd->sdh.num_of_docs;
738 /* Create the array for the documents */
739 if (!(AccumulatedWeights = Xmalloc (num_docs * sizeof (float))))
740 return (NULL);
741
742 bzero ((char *) AccumulatedWeights, sizeof (float) * num_docs);
743 ChangeMemInUse (qd, sizeof (float) * num_docs);
744
745 if (rqi->MaxTerms == -1)
746 num_terms = Terms->num;
747 else
748 num_terms = rqi->MaxTerms < Terms->num ? rqi->MaxTerms : Terms->num;
749
750
751 /* For each word in the list . . . */
752 for (j = 0; j < num_terms; j++)
753 {
754 u_char *buffer;
755 int i, kd;
756 mg_u_long next_mjr_dn = 0;
757 mg_u_long CurrDocNum = 0; /* NOTE: Document numbers start at 1 */
758 mg_u_long LastDocNum = 0; /* [RJM 07/97: Ranked Required Terms] */
759 mg_u_long TempDocNum = 0; /* [RJM 07/97: Ranked Required Terms] */
760 double Wqt, WordLog;
761 int dn_blk = 0, len_blk = 0, k;
762 int blk; /* bblock control parameter */
763 int p; /* The number of documents the word occurs in */
764 int require_match = Terms->TE[j].require_match; /* [RJM 07/97: Ranked Required Terms] */
765 WordEntry *we = &Terms->TE[j].WE;
766 CalcBlks (qd, we, &blk, &dn_blk, &len_blk, &k, &p);
767 if (!(buffer = Xmalloc (we->invf_len)))
768 break;
769
770 ChangeMemInUse (qd, we->invf_len);
771
772 Fseek (qd->File_invf, we->invf_ptr, 0);
773 Fread (buffer, sizeof (char), we->invf_len, qd->File_invf);
774
775 /* Read from the buffer */
776 DECODE_START (buffer, we->invf_len);
777
778 WordLog = log ((double) num_docs / (double) we->max_doc_count); /* [RPAP - Jan 97: Stem Index Change] */
779
780 Wqt = (rqi->QueryFreqs ? Terms->TE[j].Count : 1) * WordLog;
781
782 qd->num_of_terms++;
783
784 kd = 0;
785 for (i = 0; i < p; i++, kd++)
786 {
787 int Count; /* The number of times the word occurs in a document */
788 double Wdt;
789 mg_u_long diff;
790 if (kd == k)
791 {
792 kd = 0;
793 }
794 if (k && kd == 0)
795 if (i + k < p)
796 {
797 int temp;
798 BBLOCK_DECODE (next_mjr_dn, dn_blk);
799 next_mjr_dn += CurrDocNum;
800 BBLOCK_DECODE (temp, len_blk);
801 }
802 if (k && kd == k - 1 && i != p - 1)
803 CurrDocNum = next_mjr_dn;
804 else
805 {
806 BBLOCK_DECODE (diff, blk);
807 CurrDocNum += diff;
808 }
809 if (qd->id->ifh.InvfLevel >= 2)
810 GAMMA_DECODE (Count);
811 else
812 Count = (double) we->count / p;
813 Wdt = Count * WordLog;
814
815 /* [RJM 07/97: Ranked Required Terms] */
816 /* zero the weights of intermediate documents if this term is required to match */
817 if (require_match == 1) {
818 for (TempDocNum = LastDocNum+1; TempDocNum < CurrDocNum; TempDocNum++)
819 AccumulatedWeights[TempDocNum - 1] = 0.0;
820 }
821
822 /* [RJM 07/97: Ranked Required Terms] */
823 /* if a term has been required to match before, the weight we are */
824 /* adding to must be non-zero */
825 if (!seen_must_match || (AccumulatedWeights[CurrDocNum - 1] > 0.0)) {
826 AccumulatedWeights[CurrDocNum - 1] += Wqt * Wdt;
827 }
828
829 qd->num_of_ptrs++;
830 LastDocNum = CurrDocNum; /* [RJM 07/97: Ranked Required Terms] */
831 }
832
833 DECODE_DONE
834
835 Xfree (buffer);
836 ChangeMemInUse (qd, -we->invf_len);
837
838 /* [RJM 07/97: Ranked Required Terms] */
839 /* zero the remaining weights if this term is required to match */
840 if (require_match == 1) {
841 for (TempDocNum = LastDocNum+1; TempDocNum <= num_docs; TempDocNum++)
842 AccumulatedWeights[TempDocNum - 1] = 0.0;
843 seen_must_match = 1;
844 }
845 }
846 return AccumulatedWeights;
847}
848
849
850
851static int
852doc_comp (void *a, void *b)
853{
854 return ((Invf_Doc_Entry *) a)->DocNum - ((Invf_Doc_Entry *) b)->DocNum;
855}
856
857
858static Invf_Doc_Entry *
859get_ide (query_data * qd, Invf_Doc_Entry_Pool * pool)
860{
861 struct pool *p = pool->pool;
862 if (!p || p->pos == p->length)
863 {
864 register unsigned size = sizeof (struct pool) +
865 (pool->pool_chunks - 1) * sizeof (Invf_Doc_Entry);
866 p = malloc (size);
867 if (!p)
868 return NULL;
869 ChangeMemInUse (qd, size);
870 p->length = pool->pool_chunks;
871 p->pos = 0;
872 p->next = pool->pool;
873 pool->pool = p;
874 }
875 return &(p->ide[p->pos++]);
876}
877
878
879void
880free_ide_pool (query_data * qd, Invf_Doc_Entry_Pool * pool)
881{
882 while (pool->pool)
883 {
884 struct pool *p = pool->pool;
885 pool->pool = p->next;
886 ChangeMemInUse (qd, -(sizeof (struct pool) + (p->length - 1) *
887 sizeof (Invf_Doc_Entry)));
888 free (p);
889 }
890 pool->pool = NULL;
891}
892
893
894FILE *
895open_skip (char *sd)
896{
897 char buf[256];
898 if (sd == NULL || *sd == '\0')
899 return NULL;
900 sprintf (buf, sd, getpid ());
901 return fopen (buf, "a");
902}
903
904
905
906Splay_Tree *
907CosineDecodeSplay (query_data * qd, TermList * Terms,
908 RankedQueryInfo * rqi, Invf_Doc_Entry_Pool * pool)
909{
910 Splay_Tree *Doc_Set;
911 int j, num_docs, num_terms, MoreAccum;
912 int Anding = 0;
913 FILE *sk = open_skip (rqi->skip_dump);
914 u_char indent = 0;
915 if (sk)
916 {
917 if (!skip_header)
918 {
919 skip_header = 1;
920 fprintf (sk, "%s", skip_header_text[qd->id->ifh.skip_mode]);
921 fprintf (sk, "\nSkipping method %d\n", qd->id->ifh.skip_mode);
922 switch (qd->id->ifh.skip_mode)
923 {
924 case 1:
925 fprintf (sk, "Skipping is every %d docnums\n",
926 qd->id->ifh.params[0]);
927 break;
928 case 2:
929 fprintf (sk, "Max nodes = %d\nNo skips smaller or equal to %d\n",
930 qd->id->ifh.params[0], qd->id->ifh.params[1]);
931 break;
932 }
933 }
934 fprintf (sk, "\n\t\t\t-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-\n\n");
935 for (j = 0; j < Terms->num; j++)
936 if (Terms->TE[j].Word[0] > indent)
937 indent = Terms->TE[j].Word[0];
938 }
939
940 num_docs = qd->sd->sdh.num_of_docs;
941 /* Create the array for the documents */
942 Doc_Set = SP_createset (doc_comp);
943
944 pool->pool = NULL;
945 pool->pool_chunks = 1024;
946
947 if (rqi->MaxTerms == -1)
948 num_terms = Terms->num;
949 else
950 num_terms = rqi->MaxTerms < Terms->num ? rqi->MaxTerms : Terms->num;
951
952 MoreAccum = 1;
953
954 /* For each word in the list . . . */
955 for (j = 0; j < num_terms; j++)
956 {
957 int Abort = 0;
958 u_char *buffer;
959 int pos, i;
960 mg_u_long CurrDocNum = 0; /* NOTE: Document numbers start at 1 */
961 double Wqt, WordLog;
962 int dn_blk = 0, len_blk = 0, k, kd;
963 int blk; /* bblock control parameter */
964 int p; /* The number of documents the word occurs in */
965 WordEntry *we = &Terms->TE[j].WE;
966 Invf_Doc_Entry *next = NULL;
967 int next_i, next_doc_num, next_start;
968 int skips_taken = 0, ptrs_dec = 0;
969 double total_added = 0.0;
970 int hits = 0;
971 if (sk)
972 {
973 fprintf (sk, "%3d : %8d %6d \"%.*s\"%*s", j, we->count,
974 we->doc_count, Terms->TE[j].Word[0], Terms->TE[j].Word + 1,
975 indent - Terms->TE[j].Word[0], "");
976 }
977
978 CalcBlks (qd, we, &blk, &dn_blk, &len_blk, &k, &p);
979 if (!(buffer = Xmalloc (we->invf_len)))
980 break;
981
982 ChangeMemInUse (qd, we->invf_len);
983
984 if (sk && qd->id->ifh.skip_mode == 2)
985 fprintf (sk, "%4d", k);
986
987 Fseek (qd->File_invf, we->invf_ptr, 0);
988 Fread (buffer, sizeof (char), we->invf_len, qd->File_invf);
989
990 /* Read from the buffer */
991 DECODE_START (buffer, we->invf_len);
992
993 WordLog = log ((double) num_docs / (double) we->max_doc_count); /* [RPAP - Jan 97: Stem Index Change] */
994
995 Wqt = (rqi->QueryFreqs ? Terms->TE[j].Count : 1) * WordLog;
996
997 qd->num_of_terms++;
998
999 if (rqi->MaxAccums != -1 && Doc_Set->no_of_items >= rqi->MaxAccums)
1000 {
1001 MoreAccum = 0;
1002 Anding = 1;
1003 next = SP_get_first (Doc_Set);
1004 }
1005 i = pos = 0;
1006 next_i = next_start = next_doc_num = 0;
1007 kd = k - 1;
1008 while (i < p && (!Anding || next))
1009 {
1010 int Count; /* The number of times the occurs in a document */
1011 double Wdt;
1012 mg_u_long diff;
1013 Invf_Doc_Entry ent, *mem;
1014 if (k)
1015 {
1016 if (Anding && i + k < p && next->DocNum > next_doc_num &&
1017 next_doc_num >= 0)
1018 {
1019 i = next_i;
1020 DECODE_SEEK (next_start);
1021 pos = next_start;
1022 CurrDocNum = next_doc_num;
1023 qd->hops_taken++;
1024 skips_taken++;
1025 kd = k - 1;
1026 }
1027 if (kd == k - 1)
1028 {
1029 if (i + k < p)
1030 {
1031 int doc_diff, pos_diff;
1032 BBLOCK_DECODE_L (doc_diff, dn_blk, pos);
1033 next_doc_num += doc_diff;
1034 next_i += k;
1035 BBLOCK_DECODE_L (pos_diff, len_blk, pos);
1036 next_start = pos + pos_diff;
1037 }
1038 else
1039 {
1040 next_doc_num = -1;
1041 }
1042 }
1043 if (Anding && i + k < p && next->DocNum > next_doc_num &&
1044 next_doc_num >= 0)
1045 continue;
1046 }
1047 if (k && kd == 0 && i != p - 1)
1048 CurrDocNum = next_doc_num;
1049 else
1050 {
1051 BBLOCK_DECODE_L (diff, blk, pos);
1052 CurrDocNum += diff;
1053 }
1054 qd->num_of_ptrs++;
1055 ptrs_dec++;
1056 if (qd->id->ifh.InvfLevel >= 2)
1057 GAMMA_DECODE_L (Count, pos);
1058 else
1059 Count = (double) we->count / p;
1060 i++;
1061 kd--;
1062 if (kd < 0)
1063 kd = k - 1;
1064 Wdt = Count * WordLog;
1065
1066
1067 if (Anding)
1068 {
1069 while (next && next->DocNum < CurrDocNum - 1)
1070 next = SP_get_next (Doc_Set);
1071 if (next && next->DocNum == CurrDocNum - 1)
1072 {
1073 next->Sum += Wqt * Wdt;
1074 total_added += Wqt * Wdt;
1075 hits++;
1076 }
1077 if (next && next->DocNum <= CurrDocNum - 1)
1078 next = SP_get_next (Doc_Set);
1079 goto NextDocNum;
1080 }
1081 ent.DocNum = CurrDocNum - 1;
1082 if ((mem = SP_member (&ent, Doc_Set)) == NULL)
1083 {
1084 hits++;
1085 if (rqi->MaxAccums != -1 &&
1086 Doc_Set->no_of_items >= rqi->MaxAccums &&
1087 !MoreAccum)
1088 goto NextDocNum;
1089 if ((mem = get_ide (qd, pool)) == NULL)
1090 return NULL;
1091 ChangeMemInUse (qd, sizeof (*mem));
1092 mem->DocNum = CurrDocNum - 1;
1093 mem->Sum = 0;
1094 ChangeMemInUse (qd, -Doc_Set->mem_in_use);
1095 if (SP_insert (mem, Doc_Set) == NULL)
1096 {
1097 Message ("Unable to allocate memory for a splay node");
1098 Abort = 1;
1099 goto FastAbort;
1100 }
1101 ChangeMemInUse (qd, Doc_Set->mem_in_use);
1102 if (rqi->MaxAccums != -1 &&
1103 Doc_Set->no_of_items >= rqi->MaxAccums &&
1104 rqi->StopAtMaxAccum)
1105 Abort = 1;
1106 }
1107
1108 mem->Sum += Wqt * Wdt;
1109 total_added += Wqt * Wdt;
1110
1111 NextDocNum:
1112 ;
1113 }
1114
1115 FastAbort:
1116
1117 DECODE_DONE
1118
1119 Xfree (buffer);
1120 ChangeMemInUse (qd, -we->invf_len);
1121 if (sk)
1122 {
1123 if (qd->id->ifh.skip_mode != 0)
1124 fprintf (sk, " %6d", skips_taken);
1125 fprintf (sk, " %6d %6d %6d%12.2f\n",
1126 ptrs_dec, hits, Doc_Set->no_of_items, total_added);
1127 }
1128 if (Abort)
1129 break;
1130 }
1131 if (sk)
1132 fclose (sk);
1133 return Doc_Set;
1134}
1135
1136
1137
1138
1139Hash_Table *
1140CosineDecodeHash (query_data * qd, TermList * Terms,
1141 RankedQueryInfo * rqi)
1142{
1143 int j, num_docs, num_terms, MoreAccum;
1144 Hash_Table *HT;
1145 int Anding = 0;
1146 int Sorted = 0;
1147 FILE *sk = open_skip (rqi->skip_dump);
1148 u_char indent = 0;
1149 if (sk)
1150 {
1151 if (!skip_header)
1152 {
1153 skip_header = 1;
1154 fprintf (sk, "%s", skip_header_text[qd->id->ifh.skip_mode]);
1155 fprintf (sk, "\nSkipping method %d\n", qd->id->ifh.skip_mode);
1156 switch (qd->id->ifh.skip_mode)
1157 {
1158 case 1:
1159 fprintf (sk, "Skipping is every %d docnums\n",
1160 qd->id->ifh.params[0]);
1161 break;
1162 case 2:
1163 fprintf (sk, "Max nodes = %d\nNo skips smaller or equal to %d\n",
1164 qd->id->ifh.params[0], qd->id->ifh.params[1]);
1165 break;
1166 }
1167 }
1168 fprintf (sk, "\n\t\t\t-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-\n\n");
1169 for (j = 0; j < Terms->num; j++)
1170 if (Terms->TE[j].Word[0] > indent)
1171 indent = Terms->TE[j].Word[0];
1172 }
1173
1174 num_docs = qd->sd->sdh.num_of_docs;
1175 /* Create the array for the documents */
1176 HT = HT_create (qd, rqi->MaxAccums, rqi->HashTblSize);
1177 if (!HT)
1178 return NULL;
1179
1180 if (rqi->MaxTerms == -1)
1181 num_terms = Terms->num;
1182 else
1183 num_terms = rqi->MaxTerms < Terms->num ? rqi->MaxTerms : Terms->num;
1184
1185 MoreAccum = 1;
1186
1187 /* For each word in the list . . . */
1188 for (j = 0; j < num_terms; j++)
1189 {
1190 int Abort = 0;
1191 u_char *buffer;
1192 int pos;
1193 int i;
1194 mg_u_long CurrDocNum = 0; /* NOTE: Document numbers start at 1 */
1195 double Wqt, WordLog;
1196 int dn_blk = 0, len_blk = 0, k, kd;
1197 int blk; /* bblock control parameter */
1198 int p; /* The number of documents the word occurs in */
1199 WordEntry *we = &Terms->TE[j].WE;
1200 Invf_Doc_EntryH *next = NULL;
1201 int next_i, next_doc_num, next_start;
1202 int skips_taken = 0, ptrs_dec = 0;
1203 double total_added = 0.0;
1204 int hits = 0;
1205 if (sk)
1206 {
1207 fprintf (sk, "%3d : %8d %6d \"%.*s\"%*s", j, we->count,
1208 we->doc_count, Terms->TE[j].Word[0], Terms->TE[j].Word + 1,
1209 indent - Terms->TE[j].Word[0], "");
1210 }
1211
1212 CalcBlks (qd, we, &blk, &dn_blk, &len_blk, &k, &p);
1213 if (!(buffer = Xmalloc (we->invf_len)))
1214 break;
1215
1216 ChangeMemInUse (qd, we->invf_len);
1217
1218 if (sk && qd->id->ifh.skip_mode == 2)
1219 fprintf (sk, "%4d", k);
1220
1221 Fseek (qd->File_invf, we->invf_ptr, 0);
1222 Fread (buffer, sizeof (char), we->invf_len, qd->File_invf);
1223
1224 /* Read from the buffer */
1225 DECODE_START (buffer, we->invf_len);
1226
1227 WordLog = log ((double) num_docs / (double) we->max_doc_count); /* [RPAP - Jan 97: Stem Index Change] */
1228
1229 Wqt = (rqi->QueryFreqs ? Terms->TE[j].Count : 1) * WordLog;
1230
1231 qd->num_of_terms++;
1232
1233 if (rqi->MaxAccums != -1 && HT->num >= rqi->MaxAccums)
1234 {
1235 MoreAccum = 0;
1236 Anding = 1;
1237 if (!Sorted)
1238 {
1239 HT_sort (qd, HT);
1240 Sorted = 1;
1241 }
1242 next = HT->Head;
1243 }
1244 i = pos = 0;
1245 next_i = next_start = next_doc_num = 0;
1246 kd = k - 1;
1247 while (i < p && (!Anding || next))
1248 {
1249 int Count; /* The number of times the occurs in a document */
1250 double Wdt;
1251 mg_u_long diff;
1252 Invf_Doc_Entry *mem;
1253 if (k)
1254 {
1255 if (Anding && i + k < p && next->IDE.DocNum > next_doc_num &&
1256 next_doc_num >= 0)
1257 {
1258#ifdef SHORT_CUT
1259 ShortCut:
1260#endif
1261 i = next_i;
1262 DECODE_SEEK (next_start);
1263 pos = next_start;
1264 CurrDocNum = next_doc_num;
1265 qd->hops_taken++;
1266 skips_taken++;
1267 kd = k - 1;
1268 }
1269 if (kd == k - 1)
1270 {
1271 if (i + k < p)
1272 {
1273 int doc_diff, pos_diff;
1274 BBLOCK_DECODE_L (doc_diff, dn_blk, pos);
1275 next_doc_num += doc_diff;
1276 next_i += k;
1277 BBLOCK_DECODE_L (pos_diff, len_blk, pos);
1278 next_start = pos + pos_diff;
1279 }
1280 else
1281 next_doc_num = -1;
1282 }
1283#ifdef SHORT_CUT
1284 if (Anding && i + k < p && next->IDE.DocNum > next_doc_num &&
1285 next_doc_num >= 0)
1286 goto ShortCut;
1287#else
1288 if (Anding && i + k < p && next->IDE.DocNum > next_doc_num &&
1289 next_doc_num >= 0)
1290 continue;
1291#endif
1292 }
1293 if (k && kd == 0 && i != p - 1)
1294 {
1295 CurrDocNum = next_doc_num;
1296 }
1297 else
1298 {
1299 BBLOCK_DECODE_L (diff, blk, pos);
1300 CurrDocNum += diff;
1301 }
1302 qd->num_of_ptrs++;
1303 ptrs_dec++;
1304 if (qd->id->ifh.InvfLevel >= 2)
1305 GAMMA_DECODE_L (Count, pos);
1306 else
1307 Count = (double) we->count / p;
1308 i++;
1309 kd--;
1310 if (kd < 0)
1311 kd = k - 1;
1312
1313
1314 if (Anding)
1315 {
1316 while (next && next->IDE.DocNum < CurrDocNum - 1)
1317 next = next->next;
1318 if (next && next->IDE.DocNum == CurrDocNum - 1)
1319 {
1320 Wdt = Count * WordLog;
1321 next->IDE.Sum += Wqt * Wdt;
1322 total_added += Wqt * Wdt;
1323 hits++;
1324 }
1325 if (next && next->IDE.DocNum <= CurrDocNum - 1)
1326 next = next->next;
1327 goto NextDocNum;
1328 }
1329 if ((mem = HT_find (HT, CurrDocNum - 1)) == NULL)
1330 {
1331 if (rqi->MaxAccums != -1 && HT->num >= rqi->MaxAccums &&
1332 !MoreAccum)
1333 goto NextDocNum;
1334 Wdt = Count * WordLog;
1335 if (!HT_insert (HT, CurrDocNum - 1, Wqt * Wdt))
1336 {
1337 if (!HT->Suplimentary_Entries)
1338 HT_Sup_create (qd, HT, we->doc_count - i + 1);
1339 HT_Sup_Add (HT, CurrDocNum - 1, Wqt * Wdt);
1340 }
1341 if (rqi->MaxAccums != -1 && HT->num >= rqi->MaxAccums &&
1342 rqi->StopAtMaxAccum)
1343 Abort = 1;
1344 }
1345 else
1346 {
1347 Wdt = Count * WordLog;
1348 mem->Sum += Wqt * Wdt;
1349 total_added += Wqt * Wdt;
1350 hits++;
1351 }
1352
1353 NextDocNum:
1354 ;
1355 }
1356
1357 DECODE_DONE
1358
1359 Xfree (buffer);
1360 ChangeMemInUse (qd, -we->invf_len);
1361 if (sk)
1362 {
1363 if (qd->id->ifh.skip_mode != 0)
1364 fprintf (sk, " %6d", skips_taken);
1365 fprintf (sk, " %6d %6d %6d%12.2f\n",
1366 ptrs_dec, hits, HT->num + HT->Suplimentary_Num, total_added);
1367 }
1368 if (Abort)
1369 break;
1370 }
1371 if (sk)
1372 fclose (sk);
1373 if (!HT->Head)
1374 HT_sort (qd, HT);
1375 return HT;
1376}
1377
1378
1379
1380List_Table *
1381CosineDecodeList (query_data * qd, TermList * Terms,
1382 RankedQueryInfo * rqi)
1383{
1384 int j, num_docs, num_terms, MoreAccum;
1385 List_Table *LT;
1386 int Anding = 0;
1387 int Sorted = 0;
1388 int has_grown = 0; /* [RJM 07/97: Ranked Required Match] whether elements have been added lately */
1389 FILE *sk = open_skip (rqi->skip_dump);
1390 u_char indent = 0;
1391 if (sk)
1392 {
1393 if (!skip_header)
1394 {
1395 skip_header = 1;
1396 fprintf (sk, "%s", skip_header_text[qd->id->ifh.skip_mode]);
1397 fprintf (sk, "\nSkipping method %d\n", qd->id->ifh.skip_mode);
1398 switch (qd->id->ifh.skip_mode)
1399 {
1400 case 1:
1401 fprintf (sk, "Skipping is every %d docnums\n",
1402 qd->id->ifh.params[0]);
1403 break;
1404 case 2:
1405 fprintf (sk, "Max nodes = %d\nNo skips smaller or equal to %d\n",
1406 qd->id->ifh.params[0], qd->id->ifh.params[1]);
1407 break;
1408 }
1409 }
1410 fprintf (sk, "\n\t\t\t-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-\n\n");
1411 for (j = 0; j < Terms->num; j++)
1412 if (Terms->TE[j].Word[0] > indent)
1413 indent = Terms->TE[j].Word[0];
1414 }
1415
1416 num_docs = qd->sd->sdh.num_of_docs;
1417
1418 /* Create the list for the documents */
1419 LT = LT_create (qd, rqi->MaxAccums);
1420 if (!LT)
1421 return NULL;
1422
1423 if (rqi->MaxTerms == -1)
1424 num_terms = Terms->num;
1425 else
1426 num_terms = rqi->MaxTerms < Terms->num ? rqi->MaxTerms : Terms->num;
1427
1428 MoreAccum = 1;
1429
1430 /* For each word in the list . . . */
1431 for (j = 0; j < num_terms; j++)
1432 {
1433 int Abort = 0;
1434 u_char *buffer;
1435 int pos;
1436 int i, n = 0;
1437 mg_u_long CurrDocNum = 0; /* NOTE: Document numbers start at 1 */
1438 mg_u_long LastDocNum = 0; /* [RJM 07/97: Ranked Required Match] */
1439 int this_item = 0; /* [RJM 07/97: Ranked Required Match] points to the next possible LT to delete */
1440 double Wqt, WordLog;
1441 int dn_blk = 0, len_blk = 0, k, kd;
1442 int blk; /* bblock control parameter */
1443 int p; /* The number of documents the word occurs in */
1444 int require_match = Terms->TE[j].require_match; /* [RJM 07/97: Ranked Required Match] */
1445 WordEntry *we = &Terms->TE[j].WE;
1446 Invf_Doc_Entry *next = NULL;
1447 int next_i, next_doc_num, next_start;
1448 int skips_taken = 0, ptrs_dec = 0;
1449 double total_added = 0.0;
1450 int hits = 0;
1451
1452 if (sk)
1453 {
1454 fprintf (sk, "%3d : %8d %6d \"%.*s\"%*s", j, we->count,
1455 we->doc_count, Terms->TE[j].Word[0], Terms->TE[j].Word + 1,
1456 indent - Terms->TE[j].Word[0], "");
1457 }
1458
1459 CalcBlks (qd, we, &blk, &dn_blk, &len_blk, &k, &p);
1460 if (!(buffer = Xmalloc (we->invf_len)))
1461 break;
1462
1463 ChangeMemInUse (qd, we->invf_len);
1464
1465 if (sk && qd->id->ifh.skip_mode == 2)
1466 fprintf (sk, "%4d", k);
1467
1468 Fseek (qd->File_invf, we->invf_ptr, 0);
1469 Fread (buffer, sizeof (char), we->invf_len, qd->File_invf);
1470
1471 /* Read from the buffer */
1472 DECODE_START (buffer, we->invf_len);
1473
1474 WordLog = log ((double) num_docs / (double) we->max_doc_count); /* [RPAP - Jan 97: Stem Index Change] */
1475
1476 Wqt = (rqi->QueryFreqs ? Terms->TE[j].Count : 1) * WordLog;
1477
1478 qd->num_of_terms++;
1479
1480 if (rqi->MaxAccums != -1 && LT->num >= rqi->MaxAccums)
1481 MoreAccum = 0;
1482
1483 if ((rqi->MaxAccums != -1 && LT->num >= rqi->MaxAccums) || Anding)
1484 {
1485 Anding = 1;
1486 if (!Sorted || has_grown)
1487 {
1488 LT_sort (LT);
1489 LT_pack (LT);
1490 Sorted = 1;
1491 has_grown = 0;
1492 }
1493 next = LT->IDE;
1494 n = 0;
1495 }
1496
1497 /* [RJM 07/97: Ranked Required Terms] */
1498 /* make sure that the document list is set up correctly for */
1499 /* requiring a term to match */
1500 if (require_match == 1) {
1501 if (!Sorted || has_grown) {
1502 LT_sort (LT);
1503 LT_pack (LT);
1504 Sorted = 1;
1505 has_grown = 0;
1506 }
1507 /* this_item is already set to zero */
1508 }
1509
1510 i = pos = 0;
1511 next_i = next_start = next_doc_num = 0;
1512 kd = k - 1;
1513 while (i < p && (!Anding || n < LT->num))
1514 {
1515 int Count; /* The number of times the occurs in a document */
1516 double Wdt;
1517 mg_u_long diff;
1518 Invf_Doc_Entry *mem;
1519
1520 /* deal with skipping */
1521 if (k)
1522 {
1523 /* see if we should take a skip */
1524 if (Anding && i + k < p && next->DocNum > next_doc_num &&
1525 next_doc_num >= 0)
1526 {
1527 i = next_i;
1528 DECODE_SEEK (next_start);
1529 pos = next_start;
1530 CurrDocNum = next_doc_num;
1531 qd->hops_taken++;
1532 skips_taken++;
1533 kd = k - 1;
1534 }
1535 if (kd == k - 1)
1536 {
1537 if (i + k < p)
1538 {
1539 int doc_diff, pos_diff;
1540 BBLOCK_DECODE_L (doc_diff, dn_blk, pos);
1541 next_doc_num += doc_diff;
1542 next_i += k;
1543 BBLOCK_DECODE_L (pos_diff, len_blk, pos);
1544 next_start = pos + pos_diff;
1545 }
1546 else
1547 {
1548 next_doc_num = -1;
1549 }
1550 }
1551 /* should we take another skip? */
1552 if (Anding && i + k < p && next->DocNum > next_doc_num &&
1553 next_doc_num >= 0)
1554 continue;
1555 }
1556 if (k && kd == 0 && i != p - 1)
1557 CurrDocNum = next_doc_num;
1558 else
1559 {
1560 BBLOCK_DECODE_L (diff, blk, pos);
1561 CurrDocNum += diff;
1562 }
1563 qd->num_of_ptrs++;
1564 ptrs_dec++;
1565 if (qd->id->ifh.InvfLevel >= 2)
1566 GAMMA_DECODE_L (Count, pos);
1567 else
1568 Count = (double) we->count / p;
1569 i++;
1570 kd--;
1571 if (kd < 0)
1572 kd = k - 1;
1573 Wdt = Count * WordLog;
1574
1575 /* [RJM 07/97: Ranked Required Terms] */
1576 /* if this is a required match we must remove all accumulators */
1577 /* of intermediate documents */
1578 if (require_match == 1) {
1579 while (this_item < LT->num && LT->IDE[this_item].DocNum < CurrDocNum - 1) {
1580 if (LastDocNum==0 ||(LT->IDE[this_item].DocNum > LastDocNum - 1))
1581 LT->IDE[this_item].Sum = -1000.0; /* mark for deletion */
1582 this_item++;
1583 }
1584 }
1585
1586 if (Anding)
1587 {
1588 while (n < LT->num && next->DocNum < CurrDocNum - 1)
1589 {
1590 next++;
1591 n++;
1592 }
1593 if (n < LT->num && next->DocNum == CurrDocNum - 1)
1594 {
1595 next->Sum += Wqt * Wdt;
1596 total_added += Wqt * Wdt;
1597 hits++;
1598 }
1599 if (n < LT->num && next->DocNum <= CurrDocNum - 1)
1600 {
1601 next++;
1602 n++;
1603 }
1604 goto NextDocNum;
1605 }
1606 if (!Anding)
1607 {
1608 if (rqi->MaxAccums != -1 && LT->num >= rqi->MaxAccums &&
1609 !MoreAccum)
1610 goto NextDocNum;
1611 LT = LT_add (qd, LT, CurrDocNum - 1, Wqt * Wdt);
1612 if (!LT)
1613 {
1614 Abort = 1;
1615 goto FastAbort;
1616 }
1617
1618 /* the list of accumulators just got bigger */
1619 has_grown = 1;
1620
1621 if (rqi->MaxAccums != -1 && LT->num >= rqi->MaxAccums &&
1622 rqi->StopAtMaxAccum)
1623 Abort = 1;
1624
1625
1626 }
1627 else if ((mem = LT_find (LT, CurrDocNum - 1)))
1628 {
1629 mem->Sum += Wqt * Wdt;
1630 total_added += Wqt * Wdt;
1631 hits++;
1632 }
1633
1634 NextDocNum:
1635
1636 LastDocNum = CurrDocNum;
1637 }
1638
1639 FastAbort:
1640
1641 DECODE_DONE
1642
1643 Xfree (buffer);
1644 ChangeMemInUse (qd, -we->invf_len);
1645 if (sk)
1646 {
1647 if (qd->id->ifh.skip_mode != 0)
1648 fprintf (sk, " %6d", skips_taken);
1649 fprintf (sk, " %6d %6d %6d%12.2f\n",
1650 ptrs_dec, hits, LT->num, total_added);
1651 }
1652 if (Abort)
1653 break;
1654
1655 /* [RJM 07/97: Ranked Required Terms] */
1656 /* remove the remaining accumulators if this term is required to match */
1657 if (require_match == 1) {
1658 while (this_item < LT->num) {
1659 if (LT->IDE[this_item].DocNum > LastDocNum - 1)
1660 LT->IDE[this_item].Sum = -1000.0; /* mark for deletion */
1661 this_item++;
1662 }
1663 Anding = 1; /* no more documents should be added */
1664
1665 /* Get rid of any documents which have been deleted */
1666 LT_pack (LT);
1667 }
1668 }
1669
1670 if (sk)
1671 fclose (sk);
1672 return LT;
1673}
Note: See TracBrowser for help on using the repository browser.