Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: main/tags/2.80/indexers/mg/src/text/invf_get.c@ 24541

Last change on this file since 24541 was 3745, checked in by mdewsnip, 21 years ago
Addition of MG package for search and retrieval
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 39.8 KB

Line
1	/**************************************************************************
2	*
3	* invf_get.c -- Inverted file reading routines
4	* Copyright (C) 1994 Neil Sharman
5	*
6	* This program is free software; you can redistribute it and/or modify
7	* it under the terms of the GNU General Public License as published by
8	* the Free Software Foundation; either version 2 of the License, or
9	* (at your option) any later version.
10	*
11	* This program is distributed in the hope that it will be useful,
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	* GNU General Public License for more details.
15	*
16	* You should have received a copy of the GNU General Public License
17	* along with this program; if not, write to the Free Software
18	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19	*
20	* $Id: invf_get.c 3745 2003-02-20 21:20:24Z mdewsnip $
21	*
22	**************************************************************************/
23
24	#include "sysfuncs.h"
25
26	#include "memlib.h"
27	#include "filestats.h"
28	#include "timing.h"
29	#include "bitio_gen.h"
30	#include "bitio_m_mem.h"
31	#include "bitio_m.h"
32	#include "sptree.h"
33	#include "messages.h"
34	#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
35
36	#include "mg.h"
37	#include "invf.h"
38	#include "text.h"
39	#include "lists.h"
40	#include "backend.h"
41	#include "invf_get.h"
42	#include "mg_errors.h"
43
44	#define SKIPPING
45	#define SHORT_CUT
46
47	struct pool
48	{
49	int length, pos;
50	struct pool *next;
51	Invf_Doc_Entry ide[1];
52	};
53
54	static int skip_header = 0;
55
56	static char skip_header_text0[] =
57	"#######################################################################\n"
58	"#\n"
59	"# 1\tWord number.\n"
60	"# 2\tNumber of time the word occurs in the collection.\n"
61	"# 3\tNumber of documents the word occurs in.\n"
62	"# 4\tThe word.\n"
63	"# 5\tNumber of docnum/word_count entries decoded.\n"
64	"# 6\tNumber of hits.\n"
65	"# 7\tNumber of entries in the splay tree or hash table.\n"
66	"# 8\tAmount added to the accumulators while while processing this word.\n"
67	"#\n"
68	"#######################################################################\n";
69
70	static char skip_header_text1[] =
71	"#######################################################################\n"
72	"#\n"
73	"# 1\tWord number.\n"
74	"# 2\tNumber of time the word occurs in the collection.\n"
75	"# 3\tNumber of documents the word occurs in.\n"
76	"# 4\tThe word.\n"
77	"# 5\tNumber of skips taken in evaluating the inverted file entry.\n"
78	"# 6\tNumber of docnum/word_count entries decoded.\n"
79	"# 7\tNumber of hits.\n"
80	"# 8\tNumber of entries in the splay tree or hash table.\n"
81	"# 9\tAmount added to the accumulators while while processing this word.\n"
82	"#\n"
83	"#######################################################################\n";
84
85
86	static char skip_header_text2[] =
87	"#######################################################################\n"
88	"#\n"
89	"# 1\tWord number.\n"
90	"# 2\tNumber of time the word occurs in the collection.\n"
91	"# 3\tNumber of documents the word occurs in.\n"
92	"# 4\tThe word.\n"
93	"# 5\tSkip size.\n"
94	"# 6\tNumber of skips taken in evaluating the inverted file entry.\n"
95	"# 7\tNumber of docnum/word_count entries decoded.\n"
96	"# 8\tNumber of hits.\n"
97	"# 9\tNumber of entries in the splay tree or hash table.\n"
98	"# 10\tAmount added to the accumulators while while processing this word.\n"
99	"#\n"
100	"#######################################################################\n";
101
102	static char *skip_header_text[] =
103	{skip_header_text0,
104	skip_header_text1,
105	skip_header_text2};
106
107
108
109
110
111
112
113
114	/**************************************************************************
115	*
116	* Code for the hash table.
117	*
118	*/
119
120	static Hash_Table *
121	HT_create (query_data * qd, unsigned max_entries,
122	unsigned max_size)
123	{
124	unsigned es, hts;
125	Hash_Table *HT;
126	if (max_entries == 0)
127	max_entries = 8192;
128	es = sizeof (Hash_Table) + sizeof (Invf_Doc_EntryH) * (max_entries - 1);
129	hts = sizeof (Invf_Doc_EntryH ) max_size;
130
131	HT = Xmalloc (es);
132	if (!HT)
133	return NULL;
134
135	HT->HashTable = Xmalloc (hts);
136	if (!(HT->HashTable))
137	{
138	Xfree (HT);
139	return NULL;
140	}
141
142	ChangeMemInUse (qd, es + hts);
143	bzero ((char *) (HT->HashTable), hts);
144	HT->max = max_entries;
145	HT->num = 0;
146	HT->Suplimentary_Entries = NULL;
147	HT->Suplimentary_Size = 0;
148	HT->Suplimentary_Num = 0;
149	HT->Head = NULL;
150	HT->size = max_size;
151	return HT;
152	}
153
154
155	static void
156	HT_Sup_create (query_data * qd, Hash_Table * HT, unsigned max_size)
157	{
158	int es = sizeof (Invf_Doc_EntryH) * max_size;
159	HT->Suplimentary_Entries = Xmalloc (es);
160	if (!HT->Suplimentary_Entries)
161	return;
162	HT->Suplimentary_Size = max_size;
163	HT->Suplimentary_Num = 0;
164	ChangeMemInUse (qd, es);
165	}
166
167	static void
168	HT_Sup_Add (Hash_Table * HT, unsigned long DN, float Sum)
169	{
170	Invf_Doc_EntryH *IDEH;
171	IDEH = &HT->Suplimentary_Entries[HT->Suplimentary_Num++];
172	IDEH->next = NULL;
173	IDEH->IDE.DocNum = DN;
174	IDEH->IDE.Sum = Sum;
175	}
176
177	Invf_Doc_Entry *
178	HT_find (Hash_Table * HT, unsigned long DN)
179	{
180	Invf_Doc_EntryH *IDEH;
181	if (!HT->HashTable)
182	FatalError (1, "Accessing the hash_table through HT_find after HT_sort");
183	IDEH = HT->HashTable[DN % HT->size];
184	while (IDEH)
185	{
186	if (IDEH->IDE.DocNum == DN)
187	return &(IDEH->IDE);
188	IDEH = IDEH->next;
189	}
190	return NULL;
191	}
192
193
194	static int
195	HT_insert (Hash_Table * HT,
196	unsigned long DN, float Sum)
197	{
198	Invf_Doc_EntryH **IDEH;
199	if (HT->num == HT->max)
200	return 0;
201	IDEH = &(HT->HashTable[DN % HT->size]);
202	while (*IDEH)
203	IDEH = &((*IDEH)->next);
204	*IDEH = &(HT->IDEH[HT->num++]);
205	(*IDEH)->next = NULL;
206	(*IDEH)->IDE.DocNum = DN;
207	(*IDEH)->IDE.Sum = Sum;
208	return 1;
209	}
210
211
212	void
213	HT_free (query_data * qd, Hash_Table * HT)
214	{
215	unsigned es;
216	es = sizeof (Hash_Table) + sizeof (Invf_Doc_EntryH) * HT->max +
217	sizeof (Invf_Doc_EntryH ) HT->size;
218	if (HT->HashTable)
219	Xfree (HT->HashTable);
220	if (HT->Suplimentary_Entries)
221	{
222	Xfree (HT->Suplimentary_Entries);
223	es += sizeof (Invf_Doc_EntryH) * HT->Suplimentary_Size;
224	}
225	Xfree (HT);
226	ChangeMemInUse (qd, -es);
227	}
228
229
230	static int
231	IDEH_comp (const void a, const void b)
232	{
233	const Invf_Doc_EntryH *A = a;
234	const Invf_Doc_EntryH *B = b;
235	return A->IDE.DocNum - B->IDE.DocNum;
236	}
237
238	static void
239	HT_sort (query_data * qd, Hash_Table * HT)
240	{
241	int i;
242	Invf_Doc_EntryH l, r, **b;
243	unsigned es = sizeof (Invf_Doc_EntryH ) HT->size;
244	if (HT->HashTable)
245	Xfree (HT->HashTable);
246	HT->size = 0;
247	HT->HashTable = NULL;
248	ChangeMemInUse (qd, -es);
249
250	qsort (HT->IDEH, HT->num, sizeof (Invf_Doc_EntryH), IDEH_comp);
251
252	for (i = 0; i < HT->num - 1; i++)
253	HT->IDEH[i].next = &HT->IDEH[i + 1];
254	HT->IDEH[i].next = NULL;
255	l = HT->IDEH;
256
257	if (HT->Suplimentary_Entries)
258	{
259	for (i = 0; i < HT->Suplimentary_Num - 1; i++)
260	HT->Suplimentary_Entries[i].next = &HT->Suplimentary_Entries[i + 1];
261	HT->Suplimentary_Entries[i].next = NULL;
262	}
263	r = HT->Suplimentary_Entries;
264
265	b = &HT->Head;
266
267	while (l \|\| r)
268	{
269	if (r && (!l \|\| (l->IDE.DocNum > r->IDE.DocNum)))
270	{
271	*b = r;
272	b = &r->next;
273	r = *b;
274	}
275	else
276	{
277	*b = l;
278	b = &l->next;
279	l = *b;
280	}
281	}
282	*b = NULL;
283	}
284
285
286
287
288
289
290	/**************************************************************************
291	*
292	* Code for the hash table.
293	*
294	*/
295
296	static List_Table *
297	LT_create (query_data * qd, unsigned max_entries)
298	{
299	unsigned es;
300	List_Table *LT;
301	if (max_entries == 0)
302	max_entries = 8192;
303	es = sizeof (List_Table) + sizeof (Invf_Doc_Entry) * (max_entries - 1);
304
305	LT = Xmalloc (es);
306	if (!LT)
307	return NULL;
308
309	ChangeMemInUse (qd, es);
310	LT->max = max_entries;
311	LT->num = 0;
312	return LT;
313	}
314
315
316
317	Invf_Doc_Entry *
318	LT_find (List_Table * LT, unsigned long DN)
319	{
320	int l = 0;
321	int r = LT->num - 1;
322	int m, c;
323	while (l <= r)
324	{
325	m = (l + r) / 2;
326	c = DN - LT->IDE[m].DocNum;
327	if (c < 0)
328	r = m - 1;
329	else if (c > 0)
330	l = m + 1;
331	else
332	return &LT->IDE[m];
333	}
334	return NULL;
335	}
336
337	static List_Table *
338	LT_add (query_data * qd, List_Table * LT,
339	unsigned long DN, float Sum)
340	{
341	Invf_Doc_Entry *ide;
342
343	/* increase the size of the list if there is not enough */
344	/* room for this entry */
345	if (LT->num == LT->max)
346	{
347	int old, new, max_entries;
348	old = sizeof (List_Table) + sizeof (Invf_Doc_Entry) * LT->max;
349	max_entries = LT->max + (LT->max >> 1);
350	new = sizeof (List_Table) + sizeof (Invf_Doc_Entry) * max_entries;
351
352	LT = Xrealloc (LT, new);
353	if (!LT)
354	return NULL;
355
356	ChangeMemInUse (qd, new - old);
357	LT->max = max_entries;
358	}
359
360	/* add the entry to the end of the list */
361	ide = &LT->IDE[LT->num];
362	ide->DocNum = DN;
363	ide->Sum = Sum;
364	LT->num++;
365
366	return LT;
367	}
368
369
370	void
371	LT_free (query_data * qd, List_Table * LT)
372	{
373	unsigned es;
374	es = sizeof (List_Table) + sizeof (Invf_Doc_Entry) * LT->max;
375	Xfree (LT);
376	ChangeMemInUse (qd, -es);
377	}
378
379
380	static int
381	IDE_comp (const void a, const void b)
382	{
383	const Invf_Doc_Entry *A = a;
384	const Invf_Doc_Entry *B = b;
385	return A->DocNum - B->DocNum;
386	}
387
388	static void
389	LT_sort (List_Table * LT)
390	{
391	qsort (LT->IDE, LT->num, sizeof (Invf_Doc_Entry), IDE_comp);
392	}
393
394
395	/* [RJM 07/97: Ranked Required Terms] */
396	/* accumulators with a sum less than or equal to 0.0 are to be deleted */
397	static void
398	LT_pack (List_Table * LT)
399	{
400	int s, d = 0;
401	for (s = 0; s < LT->num; s++) {
402	if (LT->IDE[s].Sum <= 0.0) {
403	/* ignore this accumulator */
404
405	} else if (d > 0 && LT->IDE[s].DocNum == LT->IDE[d-1].DocNum) {
406	LT->IDE[d-1].Sum += LT->IDE[s].Sum;
407
408	} else {
409	if (d != s) LT->IDE[d] = LT->IDE[s];
410	d++;
411	}
412	}
413
414	if (LT->num) LT->num = d;
415	}
416
417
418	/**************************************************************************/
419
420
421
422	invf_data *
423	InitInvfFile (File * InvfFile, stemmed_dict * sd)
424	{
425	invf_data *id;
426	if (!(id = Xmalloc (sizeof (invf_data))))
427	{
428	mg_errno = MG_NOMEM;
429	return NULL;
430	}
431
432	id->InvfFile = InvfFile;
433
434	Fread (&id->ifh, sizeof (id->ifh), 1, InvfFile);
435	/* [RPAP - Jan 97: Endian Ordering] */
436	{
437	int i;
438
439	NTOHUL(id->ifh.no_of_words);
440	NTOHUL(id->ifh.no_of_ptrs);
441	NTOHUL(id->ifh.skip_mode);
442	for (i = 0; i < 16; i++)
443	NTOHUL(id->ifh.params[i]);
444	NTOHUL(id->ifh.InvfLevel);
445	}
446
447	id->N = sd->sdh.num_of_docs;
448	id->Nstatic = sd->sdh.static_num_of_docs;
449
450	return (id);
451	}
452
453
454	void
455	FreeInvfData (invf_data * id)
456	{
457	Xfree (id);
458	}
459
460	/****************************************************************************/
461
462
463	static void
464	CalcBlks (query_data * qd, WordEntry * we, int *blk,
465	int dn_blk, int len_blk, int k, int p)
466	{
467	*p = we->doc_count;
468
469	blk = BIO_Bblock_Init (qd->id->Nstatic, p);
470	switch (qd->id->ifh.skip_mode)
471	{
472	case 1:
473	{
474	unsigned long len;
475	if (*p <= qd->id->ifh.params[0])
476	*k = 0;
477	else
478	{
479	*k = qd->id->ifh.params[0];
480	len = BIO_Bblock_Bound (qd->id->N, *p);
481	if (qd->id->ifh.InvfLevel >= 2)
482	len += we->count;
483	dn_blk = BIO_Bblock_Init (qd->sd->sdh.num_of_docs, (p + k - 1) / k);
484	len_blk = BIO_Bblock_Init (len, (p + k - 1) / k);
485	}
486	break;
487	}
488	case 2:
489	{
490	unsigned long len;
491	if (*p <= qd->id->ifh.params[1])
492	*k = 0;
493	else
494	{
495	k = (int) (2 sqrt (((double) (*p)) / qd->id->ifh.params[0]));
496	if (*k <= qd->id->ifh.params[1])
497	*k = qd->id->ifh.params[1];
498	len = BIO_Bblock_Bound (qd->id->N, *p);
499	if (qd->id->ifh.InvfLevel >= 2)
500	len += we->count;
501	*dn_blk = BIO_Bblock_Init (qd->sd->sdh.num_of_docs,
502	(p + k - 1) / *k);
503	len_blk = BIO_Bblock_Init (len, (p + k - 1) / k);
504	}
505	break;
506	}
507	default:
508	*k = 0;
509	}
510	}
511
512
513	/* =========================================================================
514	* Function: GetDocsOp
515	* Description:
516	* if op == op_term then
517	* look up term in invf and decode all its entries/documents
518	* if op == op_and_terms then
519	* go thru L, testing if it exists in invf using skips
520	* if it does exist then overwrite the entry nearer the start of the list
521	* if op == op_and_or_terms then
522	* go thru L, testing if it exists in invf using skips
523	* BUT only mark ones which are found - keep list intact
524	* if op == op_diff_terms then >>>> NOT tested <<<<
525	* Input:
526	* Output:
527	* ========================================================================= */
528
529	DocList *
530	GetDocsOp (query_data * qd, WordEntry * we, Op_types op, DocList * L)
531	{
532	u_char *buffer;
533	DocEntry src = NULL, dest = NULL;
534	int i; /* index counter into inverted list */
535	unsigned long pos;
536	int next_start, next_doc_num, next_i;
537	int block_counter; /* within counter in block */
538	int dn_blk = 0, len_blk = 0;
539	int skip_size, have_skipping;
540	int blk; /* bblock control parameter */
541	int ft; /* The number of documents the word occurs in */
542	unsigned long CurrDoc = 0; /* NOTE: Document numbers start at 1 */
543	float Weight;
544
545	/* ... AND empty */
546	if (op == op_and_terms && (!L \|\| !L->num))
547	{
548	if (L)
549	Xfree (L);
550	return MakeDocList (0);
551	}
552
553	if (op == op_term)
554	L = MakeDocList (we->doc_count);
555
556	CalcBlks (qd, we, &blk, &dn_blk, &len_blk, &skip_size, &ft);
557	have_skipping = (skip_size != 0);
558	if (!(buffer = Xmalloc (we->invf_len)))
559	return MakeDocList (0);
560
561	ChangeMemInUse (qd, we->invf_len);
562
563	qd->num_of_terms++;
564
565	/* --- Read an inverted file entry for the given word --- */
566	Fseek (qd->File_invf, we->invf_ptr, 0);
567	Fread (buffer, sizeof (char), we->invf_len, qd->File_invf);
568
569	/* Read from the buffer */
570	DECODE_START (buffer, we->invf_len);
571
572	Weight = qd->sd->sdh.num_of_words / (100 * log2 ((float) we->count + 1));
573
574	if (op != op_term)
575	dest = src = L->DE;
576	i = pos = 0;
577	next_i = next_start = next_doc_num = 0;
578	block_counter = skip_size - 1;
579
580	/* Go thru the elements of an inverted list for a term */
581	/* May not go sequentially if skipping is done */
582	while (i < ft && ((op == op_term) \|\| src - L->DE < L->num))
583	{
584	unsigned long diff, Temp;
585
586	if (have_skipping)
587	{
588
589	/* --- Skip to next block --- */
590	if (op != op_term && i + skip_size < ft &&
591	src->DocNum > next_doc_num && next_doc_num >= 0)
592	{
593	ShortCut:
594	i = next_i;
595	DECODE_SEEK (next_start);
596	pos = next_start;
597	CurrDoc = next_doc_num;
598	qd->hops_taken++;
599	block_counter = skip_size - 1;
600	}
601
602	/* --- Decode the skip block header --- */
603	if (block_counter == skip_size - 1)
604	{
605	if (i + skip_size < ft)
606	{
607	/* --- Decode the skip block header --- */
608	int doc_diff, pos_diff;
609	BBLOCK_DECODE_L (doc_diff, dn_blk, pos);
610	next_doc_num += doc_diff;
611	next_i += skip_size;
612	BBLOCK_DECODE_L (pos_diff, len_blk, pos);
613	next_start = pos + pos_diff;
614	}
615	else
616	next_doc_num = -1;
617	}
618
619	/* --- Do a skip if c-set doc is higher than block header doc --- */
620	if (op != op_term && i + skip_size < ft &&
621	src->DocNum > next_doc_num && next_doc_num >= 0)
622	goto ShortCut;
623
624
625	} /if skipping /
626
627	/* --- increment CurrDoc --- */
628	if (have_skipping && block_counter == 0 && i != ft - 1)
629	CurrDoc = next_doc_num;
630	else
631	{
632	BBLOCK_DECODE_L (diff, blk, pos);
633	CurrDoc += diff;
634	}
635
636	/* --- If have f_dt's then eat them up --- */
637	if (qd->id->ifh.InvfLevel >= 2)
638	GAMMA_DECODE_L (Temp, pos); /* discard word count */
639
640	/* --- Increment counters --- */
641	qd->num_of_ptrs++;
642	i++;
643	block_counter--;
644	if (block_counter < 0) /* ??? Why would this happen ? */
645	block_counter = skip_size - 1;
646
647	switch (op)
648	{
649	case op_term:
650	/* --- Add the doc from invf onto the c-set list --- */
651	L->DE[L->num].DocNum = CurrDoc;
652	L->DE[L->num].Weight = Weight;
653	L->DE[L->num].SeekPos = 0;
654	L->DE[L->num].Len = 0;
655	L->DE[L->num].CompTextBuffer = NULL;
656	L->DE[L->num].Next = NULL;
657	L->DE[L->num].or_included = 0;
658	L->num++;
659	break;
660
661	case op_and_terms:
662	case op_and_or_terms:
663	/* --- see if have a match and save it --- */
664
665	while (src < (L->DE + L->num) && CurrDoc > src->DocNum)
666	{
667	src++;
668	}
669	/* Unless at end of list, assert(CurrDoc <= src->DocNum) */
670
671	/* --- Accept the doc as a candidate --- */
672	if (src < (L->DE + L->num) && CurrDoc == src->DocNum)
673	{
674	if (op == op_and_terms)
675	{
676	dest++ = src;
677	}
678	else
679	{
680	src->or_included = 1;
681	}
682	src++; /* Move onto next one */
683	}
684
685	/* If the next one to try has already been tried then move on again */
686	if (op == op_and_or_terms)
687	{
688	while (src < (L->DE + L->num) && src->or_included)
689	src++;
690	}
691
692	break;
693
694	case op_diff_terms:
695	while (src < (L->DE + L->num) && src->DocNum < CurrDoc)
696	dest++ = src++;
697	if (src < (L->DE + L->num) && src->DocNum == CurrDoc)
698	src++;
699	break;
700
701	default:
702	break;
703	} /switch /
704
705	} /while traversing invf /
706
707	if (op == op_and_terms \|\| op == op_diff_terms)
708	{
709	if (op == op_diff_terms)
710	while (src - L->DE < L->num)
711	dest++ = src++;
712
713	/* update size of c-set */
714	L->num = dest - L->DE;
715	}
716	DECODE_DONE
717
718	Xfree (buffer);
719	ChangeMemInUse (qd, -we->invf_len);
720
721	return (L);
722	}
723
724
725
726
727
728
729
730	float *
731	CosineDecode (query_data * qd, TermList * Terms, RankedQueryInfo * rqi)
732	{
733	float *AccumulatedWeights;
734	int j, num_docs, num_terms;
735	int seen_must_match = 0; /* [RJM 07/97: Ranked Required Terms] says whether to 'and' queries or not */
736
737	num_docs = qd->sd->sdh.num_of_docs;
738	/* Create the array for the documents */
739	if (!(AccumulatedWeights = Xmalloc (num_docs * sizeof (float))))
740	return (NULL);
741
742	bzero ((char ) AccumulatedWeights, sizeof (float) num_docs);
743	ChangeMemInUse (qd, sizeof (float) * num_docs);
744
745	if (rqi->MaxTerms == -1)
746	num_terms = Terms->num;
747	else
748	num_terms = rqi->MaxTerms < Terms->num ? rqi->MaxTerms : Terms->num;
749
750
751	/* For each word in the list . . . */
752	for (j = 0; j < num_terms; j++)
753	{
754	u_char *buffer;
755	int i, kd;
756	unsigned long next_mjr_dn = 0;
757	unsigned long CurrDocNum = 0; /* NOTE: Document numbers start at 1 */
758	unsigned long LastDocNum = 0; /* [RJM 07/97: Ranked Required Terms] */
759	unsigned long TempDocNum = 0; /* [RJM 07/97: Ranked Required Terms] */
760	double Wqt, WordLog;
761	int dn_blk = 0, len_blk = 0, k;
762	int blk; /* bblock control parameter */
763	int p; /* The number of documents the word occurs in */
764	int require_match = Terms->TE[j].require_match; /* [RJM 07/97: Ranked Required Terms] */
765	WordEntry *we = &Terms->TE[j].WE;
766	CalcBlks (qd, we, &blk, &dn_blk, &len_blk, &k, &p);
767	if (!(buffer = Xmalloc (we->invf_len)))
768	break;
769
770	ChangeMemInUse (qd, we->invf_len);
771
772	Fseek (qd->File_invf, we->invf_ptr, 0);
773	Fread (buffer, sizeof (char), we->invf_len, qd->File_invf);
774
775	/* Read from the buffer */
776	DECODE_START (buffer, we->invf_len);
777
778	WordLog = log ((double) num_docs / (double) we->max_doc_count); /* [RPAP - Jan 97: Stem Index Change] */
779
780	Wqt = (rqi->QueryFreqs ? Terms->TE[j].Count : 1) * WordLog;
781
782	qd->num_of_terms++;
783
784	kd = 0;
785	for (i = 0; i < p; i++, kd++)
786	{
787	int Count; /* The number of times the word occurs in a document */
788	double Wdt;
789	unsigned long diff;
790	if (kd == k)
791	{
792	kd = 0;
793	}
794	if (k && kd == 0)
795	if (i + k < p)
796	{
797	int temp;
798	BBLOCK_DECODE (next_mjr_dn, dn_blk);
799	next_mjr_dn += CurrDocNum;
800	BBLOCK_DECODE (temp, len_blk);
801	}
802	if (k && kd == k - 1 && i != p - 1)
803	CurrDocNum = next_mjr_dn;
804	else
805	{
806	BBLOCK_DECODE (diff, blk);
807	CurrDocNum += diff;
808	}
809	if (qd->id->ifh.InvfLevel >= 2)
810	GAMMA_DECODE (Count);
811	else
812	Count = (double) we->count / p;
813	Wdt = Count * WordLog;
814
815	/* [RJM 07/97: Ranked Required Terms] */
816	/* zero the weights of intermediate documents if this term is required to match */
817	if (require_match == 1) {
818	for (TempDocNum = LastDocNum+1; TempDocNum < CurrDocNum; TempDocNum++)
819	AccumulatedWeights[TempDocNum - 1] = 0.0;
820	}
821
822	/* [RJM 07/97: Ranked Required Terms] */
823	/* if a term has been required to match before, the weight we are */
824	/* adding to must be non-zero */
825	if (!seen_must_match \|\| (AccumulatedWeights[CurrDocNum - 1] > 0.0)) {
826	AccumulatedWeights[CurrDocNum - 1] += Wqt * Wdt;
827	}
828
829	qd->num_of_ptrs++;
830	LastDocNum = CurrDocNum; /* [RJM 07/97: Ranked Required Terms] */
831	}
832
833	DECODE_DONE
834
835	Xfree (buffer);
836	ChangeMemInUse (qd, -we->invf_len);
837
838	/* [RJM 07/97: Ranked Required Terms] */
839	/* zero the remaining weights if this term is required to match */
840	if (require_match == 1) {
841	for (TempDocNum = LastDocNum+1; TempDocNum <= num_docs; TempDocNum++)
842	AccumulatedWeights[TempDocNum - 1] = 0.0;
843	seen_must_match = 1;
844	}
845	}
846	return AccumulatedWeights;
847	}
848
849
850
851	static int
852	doc_comp (void a, void b)
853	{
854	return ((Invf_Doc_Entry ) a)->DocNum - ((Invf_Doc_Entry ) b)->DocNum;
855	}
856
857
858	static Invf_Doc_Entry *
859	get_ide (query_data * qd, Invf_Doc_Entry_Pool * pool)
860	{
861	struct pool *p = pool->pool;
862	if (!p \|\| p->pos == p->length)
863	{
864	register unsigned size = sizeof (struct pool) +
865	(pool->pool_chunks - 1) * sizeof (Invf_Doc_Entry);
866	p = malloc (size);
867	if (!p)
868	return NULL;
869	ChangeMemInUse (qd, size);
870	p->length = pool->pool_chunks;
871	p->pos = 0;
872	p->next = pool->pool;
873	pool->pool = p;
874	}
875	return &(p->ide[p->pos++]);
876	}
877
878
879	void
880	free_ide_pool (query_data * qd, Invf_Doc_Entry_Pool * pool)
881	{
882	while (pool->pool)
883	{
884	struct pool *p = pool->pool;
885	pool->pool = p->next;
886	ChangeMemInUse (qd, -(sizeof (struct pool) + (p->length - 1) *
887	sizeof (Invf_Doc_Entry)));
888	free (p);
889	}
890	pool->pool = NULL;
891	}
892
893
894	FILE *
895	open_skip (char *sd)
896	{
897	char buf[256];
898	if (sd == NULL \|\| *sd == '\0')
899	return NULL;
900	sprintf (buf, sd, getpid ());
901	return fopen (buf, "a");
902	}
903
904
905
906	Splay_Tree *
907	CosineDecodeSplay (query_data * qd, TermList * Terms,
908	RankedQueryInfo * rqi, Invf_Doc_Entry_Pool * pool)
909	{
910	Splay_Tree *Doc_Set;
911	int j, num_docs, num_terms, MoreAccum;
912	int Anding = 0;
913	FILE *sk = open_skip (rqi->skip_dump);
914	u_char indent = 0;
915	if (sk)
916	{
917	if (!skip_header)
918	{
919	skip_header = 1;
920	fprintf (sk, "%s", skip_header_text[qd->id->ifh.skip_mode]);
921	fprintf (sk, "\nSkipping method %ld\n", qd->id->ifh.skip_mode);
922	switch (qd->id->ifh.skip_mode)
923	{
924	case 1:
925	fprintf (sk, "Skipping is every %ld docnums\n",
926	qd->id->ifh.params[0]);
927	break;
928	case 2:
929	fprintf (sk, "Max nodes = %ld\nNo skips smaller or equal to %ld\n",
930	qd->id->ifh.params[0], qd->id->ifh.params[1]);
931	break;
932	}
933	}
934	fprintf (sk, "\n\t\t\t-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-\n\n");
935	for (j = 0; j < Terms->num; j++)
936	if (Terms->TE[j].Word[0] > indent)
937	indent = Terms->TE[j].Word[0];
938	}
939
940	num_docs = qd->sd->sdh.num_of_docs;
941	/* Create the array for the documents */
942	Doc_Set = SP_createset (doc_comp);
943
944	pool->pool = NULL;
945	pool->pool_chunks = 1024;
946
947	if (rqi->MaxTerms == -1)
948	num_terms = Terms->num;
949	else
950	num_terms = rqi->MaxTerms < Terms->num ? rqi->MaxTerms : Terms->num;
951
952	MoreAccum = 1;
953
954	/* For each word in the list . . . */
955	for (j = 0; j < num_terms; j++)
956	{
957	int Abort = 0;
958	u_char *buffer;
959	int pos, i;
960	unsigned long CurrDocNum = 0; /* NOTE: Document numbers start at 1 */
961	double Wqt, WordLog;
962	int dn_blk = 0, len_blk = 0, k, kd;
963	int blk; /* bblock control parameter */
964	int p; /* The number of documents the word occurs in */
965	WordEntry *we = &Terms->TE[j].WE;
966	Invf_Doc_Entry *next = NULL;
967	int next_i, next_doc_num, next_start;
968	int skips_taken = 0, ptrs_dec = 0;
969	double total_added = 0.0;
970	int hits = 0;
971	if (sk)
972	{
973	fprintf (sk, "%3d : %8ld %6ld \"%.s\"%s", j, we->count,
974	we->doc_count, Terms->TE[j].Word[0], Terms->TE[j].Word + 1,
975	indent - Terms->TE[j].Word[0], "");
976	}
977
978	CalcBlks (qd, we, &blk, &dn_blk, &len_blk, &k, &p);
979	if (!(buffer = Xmalloc (we->invf_len)))
980	break;
981
982	ChangeMemInUse (qd, we->invf_len);
983
984	if (sk && qd->id->ifh.skip_mode == 2)
985	fprintf (sk, "%4d", k);
986
987	Fseek (qd->File_invf, we->invf_ptr, 0);
988	Fread (buffer, sizeof (char), we->invf_len, qd->File_invf);
989
990	/* Read from the buffer */
991	DECODE_START (buffer, we->invf_len);
992
993	WordLog = log ((double) num_docs / (double) we->max_doc_count); /* [RPAP - Jan 97: Stem Index Change] */
994
995	Wqt = (rqi->QueryFreqs ? Terms->TE[j].Count : 1) * WordLog;
996
997	qd->num_of_terms++;
998
999	if (rqi->MaxAccums != -1 && Doc_Set->no_of_items >= rqi->MaxAccums)
1000	{
1001	MoreAccum = 0;
1002	Anding = 1;
1003	next = SP_get_first (Doc_Set);
1004	}
1005	i = pos = 0;
1006	next_i = next_start = next_doc_num = 0;
1007	kd = k - 1;
1008	while (i < p && (!Anding \|\| next))
1009	{
1010	int Count; /* The number of times the occurs in a document */
1011	double Wdt;
1012	unsigned long diff;
1013	Invf_Doc_Entry ent, *mem;
1014	if (k)
1015	{
1016	if (Anding && i + k < p && next->DocNum > next_doc_num &&
1017	next_doc_num >= 0)
1018	{
1019	i = next_i;
1020	DECODE_SEEK (next_start);
1021	pos = next_start;
1022	CurrDocNum = next_doc_num;
1023	qd->hops_taken++;
1024	skips_taken++;
1025	kd = k - 1;
1026	}
1027	if (kd == k - 1)
1028	if (i + k < p)
1029	{
1030	int doc_diff, pos_diff;
1031	BBLOCK_DECODE_L (doc_diff, dn_blk, pos);
1032	next_doc_num += doc_diff;
1033	next_i += k;
1034	BBLOCK_DECODE_L (pos_diff, len_blk, pos);
1035	next_start = pos + pos_diff;
1036	}
1037	else
1038	next_doc_num = -1;
1039	if (Anding && i + k < p && next->DocNum > next_doc_num &&
1040	next_doc_num >= 0)
1041	continue;
1042	}
1043	if (k && kd == 0 && i != p - 1)
1044	CurrDocNum = next_doc_num;
1045	else
1046	{
1047	BBLOCK_DECODE_L (diff, blk, pos);
1048	CurrDocNum += diff;
1049	}
1050	qd->num_of_ptrs++;
1051	ptrs_dec++;
1052	if (qd->id->ifh.InvfLevel >= 2)
1053	GAMMA_DECODE_L (Count, pos);
1054	else
1055	Count = (double) we->count / p;
1056	i++;
1057	kd--;
1058	if (kd < 0)
1059	kd = k - 1;
1060	Wdt = Count * WordLog;
1061
1062
1063	if (Anding)
1064	{
1065	while (next && next->DocNum < CurrDocNum - 1)
1066	next = SP_get_next (Doc_Set);
1067	if (next && next->DocNum == CurrDocNum - 1)
1068	{
1069	next->Sum += Wqt * Wdt;
1070	total_added += Wqt * Wdt;
1071	hits++;
1072	}
1073	if (next && next->DocNum <= CurrDocNum - 1)
1074	next = SP_get_next (Doc_Set);
1075	goto NextDocNum;
1076	}
1077	ent.DocNum = CurrDocNum - 1;
1078	if ((mem = SP_member (&ent, Doc_Set)) == NULL)
1079	{
1080	hits++;
1081	if (rqi->MaxAccums != -1 &&
1082	Doc_Set->no_of_items >= rqi->MaxAccums &&
1083	!MoreAccum)
1084	goto NextDocNum;
1085	if ((mem = get_ide (qd, pool)) == NULL)
1086	return NULL;
1087	ChangeMemInUse (qd, sizeof (*mem));
1088	mem->DocNum = CurrDocNum - 1;
1089	mem->Sum = 0;
1090	ChangeMemInUse (qd, -Doc_Set->mem_in_use);
1091	if (SP_insert (mem, Doc_Set) == NULL)
1092	{
1093	Message ("Unable to allocate memory for a splay node");
1094	Abort = 1;
1095	goto FastAbort;
1096	}
1097	ChangeMemInUse (qd, Doc_Set->mem_in_use);
1098	if (rqi->MaxAccums != -1 &&
1099	Doc_Set->no_of_items >= rqi->MaxAccums &&
1100	rqi->StopAtMaxAccum)
1101	Abort = 1;
1102	}
1103
1104	mem->Sum += Wqt * Wdt;
1105	total_added += Wqt * Wdt;
1106
1107	NextDocNum:
1108	;
1109	}
1110
1111	FastAbort:
1112
1113	DECODE_DONE
1114
1115	Xfree (buffer);
1116	ChangeMemInUse (qd, -we->invf_len);
1117	if (sk)
1118	{
1119	if (qd->id->ifh.skip_mode != 0)
1120	fprintf (sk, " %6d", skips_taken);
1121	fprintf (sk, " %6d %6d %6d%12.2f\n",
1122	ptrs_dec, hits, Doc_Set->no_of_items, total_added);
1123	}
1124	if (Abort)
1125	break;
1126	}
1127	if (sk)
1128	fclose (sk);
1129	return Doc_Set;
1130	}
1131
1132
1133
1134
1135	Hash_Table *
1136	CosineDecodeHash (query_data * qd, TermList * Terms,
1137	RankedQueryInfo * rqi)
1138	{
1139	int j, num_docs, num_terms, MoreAccum;
1140	Hash_Table *HT;
1141	int Anding = 0;
1142	int Sorted = 0;
1143	FILE *sk = open_skip (rqi->skip_dump);
1144	u_char indent = 0;
1145	if (sk)
1146	{
1147	if (!skip_header)
1148	{
1149	skip_header = 1;
1150	fprintf (sk, "%s", skip_header_text[qd->id->ifh.skip_mode]);
1151	fprintf (sk, "\nSkipping method %ld\n", qd->id->ifh.skip_mode);
1152	switch (qd->id->ifh.skip_mode)
1153	{
1154	case 1:
1155	fprintf (sk, "Skipping is every %ld docnums\n",
1156	qd->id->ifh.params[0]);
1157	break;
1158	case 2:
1159	fprintf (sk, "Max nodes = %ld\nNo skips smaller or equal to %ld\n",
1160	qd->id->ifh.params[0], qd->id->ifh.params[1]);
1161	break;
1162	}
1163	}
1164	fprintf (sk, "\n\t\t\t-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-\n\n");
1165	for (j = 0; j < Terms->num; j++)
1166	if (Terms->TE[j].Word[0] > indent)
1167	indent = Terms->TE[j].Word[0];
1168	}
1169
1170	num_docs = qd->sd->sdh.num_of_docs;
1171	/* Create the array for the documents */
1172	HT = HT_create (qd, rqi->MaxAccums, rqi->HashTblSize);
1173	if (!HT)
1174	return NULL;
1175
1176	if (rqi->MaxTerms == -1)
1177	num_terms = Terms->num;
1178	else
1179	num_terms = rqi->MaxTerms < Terms->num ? rqi->MaxTerms : Terms->num;
1180
1181	MoreAccum = 1;
1182
1183	/* For each word in the list . . . */
1184	for (j = 0; j < num_terms; j++)
1185	{
1186	int Abort = 0;
1187	u_char *buffer;
1188	int pos;
1189	int i;
1190	unsigned long CurrDocNum = 0; /* NOTE: Document numbers start at 1 */
1191	double Wqt, WordLog;
1192	int dn_blk = 0, len_blk = 0, k, kd;
1193	int blk; /* bblock control parameter */
1194	int p; /* The number of documents the word occurs in */
1195	WordEntry *we = &Terms->TE[j].WE;
1196	Invf_Doc_EntryH *next = NULL;
1197	int next_i, next_doc_num, next_start;
1198	int skips_taken = 0, ptrs_dec = 0;
1199	double total_added = 0.0;
1200	int hits = 0;
1201	if (sk)
1202	{
1203	fprintf (sk, "%3d : %8ld %6ld \"%.s\"%s", j, we->count,
1204	we->doc_count, Terms->TE[j].Word[0], Terms->TE[j].Word + 1,
1205	indent - Terms->TE[j].Word[0], "");
1206	}
1207
1208	CalcBlks (qd, we, &blk, &dn_blk, &len_blk, &k, &p);
1209	if (!(buffer = Xmalloc (we->invf_len)))
1210	break;
1211
1212	ChangeMemInUse (qd, we->invf_len);
1213
1214	if (sk && qd->id->ifh.skip_mode == 2)
1215	fprintf (sk, "%4d", k);
1216
1217	Fseek (qd->File_invf, we->invf_ptr, 0);
1218	Fread (buffer, sizeof (char), we->invf_len, qd->File_invf);
1219
1220	/* Read from the buffer */
1221	DECODE_START (buffer, we->invf_len);
1222
1223	WordLog = log ((double) num_docs / (double) we->max_doc_count); /* [RPAP - Jan 97: Stem Index Change] */
1224
1225	Wqt = (rqi->QueryFreqs ? Terms->TE[j].Count : 1) * WordLog;
1226
1227	qd->num_of_terms++;
1228
1229	if (rqi->MaxAccums != -1 && HT->num >= rqi->MaxAccums)
1230	{
1231	MoreAccum = 0;
1232	Anding = 1;
1233	if (!Sorted)
1234	{
1235	HT_sort (qd, HT);
1236	Sorted = 1;
1237	}
1238	next = HT->Head;
1239	}
1240	i = pos = 0;
1241	next_i = next_start = next_doc_num = 0;
1242	kd = k - 1;
1243	while (i < p && (!Anding \|\| next))
1244	{
1245	int Count; /* The number of times the occurs in a document */
1246	double Wdt;
1247	unsigned long diff;
1248	Invf_Doc_Entry *mem;
1249	if (k)
1250	{
1251	if (Anding && i + k < p && next->IDE.DocNum > next_doc_num &&
1252	next_doc_num >= 0)
1253	{
1254	#ifdef SHORT_CUT
1255	ShortCut:
1256	#endif
1257	i = next_i;
1258	DECODE_SEEK (next_start);
1259	pos = next_start;
1260	CurrDocNum = next_doc_num;
1261	qd->hops_taken++;
1262	skips_taken++;
1263	kd = k - 1;
1264	}
1265	if (kd == k - 1)
1266	{
1267	if (i + k < p)
1268	{
1269	int doc_diff, pos_diff;
1270	BBLOCK_DECODE_L (doc_diff, dn_blk, pos);
1271	next_doc_num += doc_diff;
1272	next_i += k;
1273	BBLOCK_DECODE_L (pos_diff, len_blk, pos);
1274	next_start = pos + pos_diff;
1275	}
1276	else
1277	next_doc_num = -1;
1278	}
1279	#ifdef SHORT_CUT
1280	if (Anding && i + k < p && next->IDE.DocNum > next_doc_num &&
1281	next_doc_num >= 0)
1282	goto ShortCut;
1283	#else
1284	if (Anding && i + k < p && next->IDE.DocNum > next_doc_num &&
1285	next_doc_num >= 0)
1286	continue;
1287	#endif
1288	}
1289	if (k && kd == 0 && i != p - 1)
1290	{
1291	CurrDocNum = next_doc_num;
1292	}
1293	else
1294	{
1295	BBLOCK_DECODE_L (diff, blk, pos);
1296	CurrDocNum += diff;
1297	}
1298	qd->num_of_ptrs++;
1299	ptrs_dec++;
1300	if (qd->id->ifh.InvfLevel >= 2)
1301	GAMMA_DECODE_L (Count, pos);
1302	else
1303	Count = (double) we->count / p;
1304	i++;
1305	kd--;
1306	if (kd < 0)
1307	kd = k - 1;
1308
1309
1310	if (Anding)
1311	{
1312	while (next && next->IDE.DocNum < CurrDocNum - 1)
1313	next = next->next;
1314	if (next && next->IDE.DocNum == CurrDocNum - 1)
1315	{
1316	Wdt = Count * WordLog;
1317	next->IDE.Sum += Wqt * Wdt;
1318	total_added += Wqt * Wdt;
1319	hits++;
1320	}
1321	if (next && next->IDE.DocNum <= CurrDocNum - 1)
1322	next = next->next;
1323	goto NextDocNum;
1324	}
1325	if ((mem = HT_find (HT, CurrDocNum - 1)) == NULL)
1326	{
1327	if (rqi->MaxAccums != -1 && HT->num >= rqi->MaxAccums &&
1328	!MoreAccum)
1329	goto NextDocNum;
1330	Wdt = Count * WordLog;
1331	if (!HT_insert (HT, CurrDocNum - 1, Wqt * Wdt))
1332	{
1333	if (!HT->Suplimentary_Entries)
1334	HT_Sup_create (qd, HT, we->doc_count - i + 1);
1335	HT_Sup_Add (HT, CurrDocNum - 1, Wqt * Wdt);
1336	}
1337	if (rqi->MaxAccums != -1 && HT->num >= rqi->MaxAccums &&
1338	rqi->StopAtMaxAccum)
1339	Abort = 1;
1340	}
1341	else
1342	{
1343	Wdt = Count * WordLog;
1344	mem->Sum += Wqt * Wdt;
1345	total_added += Wqt * Wdt;
1346	hits++;
1347	}
1348
1349	NextDocNum:
1350	;
1351	}
1352
1353	DECODE_DONE
1354
1355	Xfree (buffer);
1356	ChangeMemInUse (qd, -we->invf_len);
1357	if (sk)
1358	{
1359	if (qd->id->ifh.skip_mode != 0)
1360	fprintf (sk, " %6d", skips_taken);
1361	fprintf (sk, " %6d %6d %6d%12.2f\n",
1362	ptrs_dec, hits, HT->num + HT->Suplimentary_Num, total_added);
1363	}
1364	if (Abort)
1365	break;
1366	}
1367	if (sk)
1368	fclose (sk);
1369	if (!HT->Head)
1370	HT_sort (qd, HT);
1371	return HT;
1372	}
1373
1374
1375
1376	List_Table *
1377	CosineDecodeList (query_data * qd, TermList * Terms,
1378	RankedQueryInfo * rqi)
1379	{
1380	int j, num_docs, num_terms, MoreAccum;
1381	List_Table *LT;
1382	int Anding = 0;
1383	int Sorted = 0;
1384	int has_grown = 0; /* [RJM 07/97: Ranked Required Match] whether elements have been added lately */
1385	FILE *sk = open_skip (rqi->skip_dump);
1386	u_char indent = 0;
1387	if (sk)
1388	{
1389	if (!skip_header)
1390	{
1391	skip_header = 1;
1392	fprintf (sk, "%s", skip_header_text[qd->id->ifh.skip_mode]);
1393	fprintf (sk, "\nSkipping method %ld\n", qd->id->ifh.skip_mode);
1394	switch (qd->id->ifh.skip_mode)
1395	{
1396	case 1:
1397	fprintf (sk, "Skipping is every %ld docnums\n",
1398	qd->id->ifh.params[0]);
1399	break;
1400	case 2:
1401	fprintf (sk, "Max nodes = %ld\nNo skips smaller or equal to %ld\n",
1402	qd->id->ifh.params[0], qd->id->ifh.params[1]);
1403	break;
1404	}
1405	}
1406	fprintf (sk, "\n\t\t\t-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-\n\n");
1407	for (j = 0; j < Terms->num; j++)
1408	if (Terms->TE[j].Word[0] > indent)
1409	indent = Terms->TE[j].Word[0];
1410	}
1411
1412	num_docs = qd->sd->sdh.num_of_docs;
1413
1414	/* Create the list for the documents */
1415	LT = LT_create (qd, rqi->MaxAccums);
1416	if (!LT)
1417	return NULL;
1418
1419	if (rqi->MaxTerms == -1)
1420	num_terms = Terms->num;
1421	else
1422	num_terms = rqi->MaxTerms < Terms->num ? rqi->MaxTerms : Terms->num;
1423
1424	MoreAccum = 1;
1425
1426	/* For each word in the list . . . */
1427	for (j = 0; j < num_terms; j++)
1428	{
1429	int Abort = 0;
1430	u_char *buffer;
1431	int pos;
1432	int i, n = 0;
1433	unsigned long CurrDocNum = 0; /* NOTE: Document numbers start at 1 */
1434	unsigned long LastDocNum = 0; /* [RJM 07/97: Ranked Required Match] */
1435	int this_item = 0; /* [RJM 07/97: Ranked Required Match] points to the next possible LT to delete */
1436	double Wqt, WordLog;
1437	int dn_blk = 0, len_blk = 0, k, kd;
1438	int blk; /* bblock control parameter */
1439	int p; /* The number of documents the word occurs in */
1440	int require_match = Terms->TE[j].require_match; /* [RJM 07/97: Ranked Required Match] */
1441	WordEntry *we = &Terms->TE[j].WE;
1442	Invf_Doc_Entry *next = NULL;
1443	int next_i, next_doc_num, next_start;
1444	int skips_taken = 0, ptrs_dec = 0;
1445	double total_added = 0.0;
1446	int hits = 0;
1447
1448	if (sk)
1449	{
1450	fprintf (sk, "%3d : %8ld %6ld \"%.s\"%s", j, we->count,
1451	we->doc_count, Terms->TE[j].Word[0], Terms->TE[j].Word + 1,
1452	indent - Terms->TE[j].Word[0], "");
1453	}
1454
1455	CalcBlks (qd, we, &blk, &dn_blk, &len_blk, &k, &p);
1456	if (!(buffer = Xmalloc (we->invf_len)))
1457	break;
1458
1459	ChangeMemInUse (qd, we->invf_len);
1460
1461	if (sk && qd->id->ifh.skip_mode == 2)
1462	fprintf (sk, "%4d", k);
1463
1464	Fseek (qd->File_invf, we->invf_ptr, 0);
1465	Fread (buffer, sizeof (char), we->invf_len, qd->File_invf);
1466
1467	/* Read from the buffer */
1468	DECODE_START (buffer, we->invf_len);
1469
1470	WordLog = log ((double) num_docs / (double) we->max_doc_count); /* [RPAP - Jan 97: Stem Index Change] */
1471
1472	Wqt = (rqi->QueryFreqs ? Terms->TE[j].Count : 1) * WordLog;
1473
1474	qd->num_of_terms++;
1475
1476	if (rqi->MaxAccums != -1 && LT->num >= rqi->MaxAccums)
1477	MoreAccum = 0;
1478
1479	if ((rqi->MaxAccums != -1 && LT->num >= rqi->MaxAccums) \|\| Anding)
1480	{
1481	Anding = 1;
1482	if (!Sorted \|\| has_grown)
1483	{
1484	LT_sort (LT);
1485	LT_pack (LT);
1486	Sorted = 1;
1487	has_grown = 0;
1488	}
1489	next = LT->IDE;
1490	n = 0;
1491	}
1492
1493	/* [RJM 07/97: Ranked Required Terms] */
1494	/* make sure that the document list is set up correctly for */
1495	/* requiring a term to match */
1496	if (require_match == 1) {
1497	if (!Sorted \|\| has_grown) {
1498	LT_sort (LT);
1499	LT_pack (LT);
1500	Sorted = 1;
1501	has_grown = 0;
1502	}
1503	/* this_item is already set to zero */
1504	}
1505
1506	i = pos = 0;
1507	next_i = next_start = next_doc_num = 0;
1508	kd = k - 1;
1509	while (i < p && (!Anding \|\| n < LT->num))
1510	{
1511	int Count; /* The number of times the occurs in a document */
1512	double Wdt;
1513	unsigned long diff;
1514	Invf_Doc_Entry *mem;
1515
1516	/* deal with skipping */
1517	if (k)
1518	{
1519	/* see if we should take a skip */
1520	if (Anding && i + k < p && next->DocNum > next_doc_num &&
1521	next_doc_num >= 0)
1522	{
1523	i = next_i;
1524	DECODE_SEEK (next_start);
1525	pos = next_start;
1526	CurrDocNum = next_doc_num;
1527	qd->hops_taken++;
1528	skips_taken++;
1529	kd = k - 1;
1530	}
1531	if (kd == k - 1)
1532	if (i + k < p)
1533	{
1534	int doc_diff, pos_diff;
1535	BBLOCK_DECODE_L (doc_diff, dn_blk, pos);
1536	next_doc_num += doc_diff;
1537	next_i += k;
1538	BBLOCK_DECODE_L (pos_diff, len_blk, pos);
1539	next_start = pos + pos_diff;
1540	}
1541	else
1542	next_doc_num = -1;
1543
1544	/* should we take another skip? */
1545	if (Anding && i + k < p && next->DocNum > next_doc_num &&
1546	next_doc_num >= 0)
1547	continue;
1548	}
1549	if (k && kd == 0 && i != p - 1)
1550	CurrDocNum = next_doc_num;
1551	else
1552	{
1553	BBLOCK_DECODE_L (diff, blk, pos);
1554	CurrDocNum += diff;
1555	}
1556	qd->num_of_ptrs++;
1557	ptrs_dec++;
1558	if (qd->id->ifh.InvfLevel >= 2)
1559	GAMMA_DECODE_L (Count, pos);
1560	else
1561	Count = (double) we->count / p;
1562	i++;
1563	kd--;
1564	if (kd < 0)
1565	kd = k - 1;
1566	Wdt = Count * WordLog;
1567
1568	/* [RJM 07/97: Ranked Required Terms] */
1569	/* if this is a required match we must remove all accumulators */
1570	/* of intermediate documents */
1571	if (require_match == 1) {
1572	while (this_item < LT->num && LT->IDE[this_item].DocNum < CurrDocNum - 1) {
1573	if (LastDocNum==0 \|\|(LT->IDE[this_item].DocNum > LastDocNum - 1))
1574	LT->IDE[this_item].Sum = -1000.0; /* mark for deletion */
1575	this_item++;
1576	}
1577	}
1578
1579	if (Anding)
1580	{
1581	while (n < LT->num && next->DocNum < CurrDocNum - 1)
1582	{
1583	next++;
1584	n++;
1585	}
1586	if (n < LT->num && next->DocNum == CurrDocNum - 1)
1587	{
1588	next->Sum += Wqt * Wdt;
1589	total_added += Wqt * Wdt;
1590	hits++;
1591	}
1592	if (n < LT->num && next->DocNum <= CurrDocNum - 1)
1593	{
1594	next++;
1595	n++;
1596	}
1597	goto NextDocNum;
1598	}
1599	if (!Anding)
1600	{
1601	if (rqi->MaxAccums != -1 && LT->num >= rqi->MaxAccums &&
1602	!MoreAccum)
1603	goto NextDocNum;
1604	LT = LT_add (qd, LT, CurrDocNum - 1, Wqt * Wdt);
1605	if (!LT)
1606	{
1607	Abort = 1;
1608	goto FastAbort;
1609	}
1610
1611	/* the list of accumulators just got bigger */
1612	has_grown = 1;
1613
1614	if (rqi->MaxAccums != -1 && LT->num >= rqi->MaxAccums &&
1615	rqi->StopAtMaxAccum)
1616	Abort = 1;
1617
1618
1619	}
1620	else if ((mem = LT_find (LT, CurrDocNum - 1)))
1621	{
1622	mem->Sum += Wqt * Wdt;
1623	total_added += Wqt * Wdt;
1624	hits++;
1625	}
1626
1627	NextDocNum:
1628
1629	LastDocNum = CurrDocNum;
1630	}
1631
1632	FastAbort:
1633
1634	DECODE_DONE
1635
1636	Xfree (buffer);
1637	ChangeMemInUse (qd, -we->invf_len);
1638	if (sk)
1639	{
1640	if (qd->id->ifh.skip_mode != 0)
1641	fprintf (sk, " %6d", skips_taken);
1642	fprintf (sk, " %6d %6d %6d%12.2f\n",
1643	ptrs_dec, hits, LT->num, total_added);
1644	}
1645	if (Abort)
1646	break;
1647
1648	/* [RJM 07/97: Ranked Required Terms] */
1649	/* remove the remaining accumulators if this term is required to match */
1650	if (require_match == 1) {
1651	while (this_item < LT->num) {
1652	if (LT->IDE[this_item].DocNum > LastDocNum - 1)
1653	LT->IDE[this_item].Sum = -1000.0; /* mark for deletion */
1654	this_item++;
1655	}
1656	Anding = 1; /* no more documents should be added */
1657
1658	/* Get rid of any documents which have been deleted */
1659	LT_pack (LT);
1660	}
1661	}
1662
1663	if (sk)
1664	fclose (sk);
1665	return LT;
1666	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: