Context Navigation

invf_get.c@ 25147

Last change on this file since 25147 was 25147, checked in by kjdon, 12 years ago
merged 64_bit_Greenstone branch into trunk, rev 25139
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 39.7 KB

Line
1	/**************************************************************************
2	*
3	* invf_get.c -- Inverted file reading routines
4	* Copyright (C) 1994 Neil Sharman
5	*
6	* This program is free software; you can redistribute it and/or modify
7	* it under the terms of the GNU General Public License as published by
8	* the Free Software Foundation; either version 2 of the License, or
9	* (at your option) any later version.
10	*
11	* This program is distributed in the hope that it will be useful,
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	* GNU General Public License for more details.
15	*
16	* You should have received a copy of the GNU General Public License
17	* along with this program; if not, write to the Free Software
18	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19	*
20	* $Id: invf_get.c 25147 2012-02-28 00:59:00Z kjdon $
21	*
22	**************************************************************************/
23
24	#include "sysfuncs.h"
25
26	#include "memlib.h"
27	#include "filestats.h"
28	#include "timing.h"
29	#include "bitio_gen.h"
30	#include "bitio_m_mem.h"
31	#include "bitio_m.h"
32	#include "sptree.h"
33	#include "messages.h"
34	#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
35
36	#include "mg.h"
37	#include "invf.h"
38	#include "text.h"
39	#include "lists.h"
40	#include "backend.h"
41	#include "invf_get.h"
42	#include "mg_errors.h"
43
44	#define SKIPPING
45	#define SHORT_CUT
46
47	struct pool
48	{
49	int length, pos;
50	struct pool *next;
51	Invf_Doc_Entry ide[1];
52	};
53
54	static int skip_header = 0;
55
56	static char skip_header_text0[] =
57	"#######################################################################\n"
58	"#\n"
59	"# 1\tWord number.\n"
60	"# 2\tNumber of time the word occurs in the collection.\n"
61	"# 3\tNumber of documents the word occurs in.\n"
62	"# 4\tThe word.\n"
63	"# 5\tNumber of docnum/word_count entries decoded.\n"
64	"# 6\tNumber of hits.\n"
65	"# 7\tNumber of entries in the splay tree or hash table.\n"
66	"# 8\tAmount added to the accumulators while while processing this word.\n"
67	"#\n"
68	"#######################################################################\n";
69
70	static char skip_header_text1[] =
71	"#######################################################################\n"
72	"#\n"
73	"# 1\tWord number.\n"
74	"# 2\tNumber of time the word occurs in the collection.\n"
75	"# 3\tNumber of documents the word occurs in.\n"
76	"# 4\tThe word.\n"
77	"# 5\tNumber of skips taken in evaluating the inverted file entry.\n"
78	"# 6\tNumber of docnum/word_count entries decoded.\n"
79	"# 7\tNumber of hits.\n"
80	"# 8\tNumber of entries in the splay tree or hash table.\n"
81	"# 9\tAmount added to the accumulators while while processing this word.\n"
82	"#\n"
83	"#######################################################################\n";
84
85
86	static char skip_header_text2[] =
87	"#######################################################################\n"
88	"#\n"
89	"# 1\tWord number.\n"
90	"# 2\tNumber of time the word occurs in the collection.\n"
91	"# 3\tNumber of documents the word occurs in.\n"
92	"# 4\tThe word.\n"
93	"# 5\tSkip size.\n"
94	"# 6\tNumber of skips taken in evaluating the inverted file entry.\n"
95	"# 7\tNumber of docnum/word_count entries decoded.\n"
96	"# 8\tNumber of hits.\n"
97	"# 9\tNumber of entries in the splay tree or hash table.\n"
98	"# 10\tAmount added to the accumulators while while processing this word.\n"
99	"#\n"
100	"#######################################################################\n";
101
102	static char *skip_header_text[] =
103	{skip_header_text0,
104	skip_header_text1,
105	skip_header_text2};
106
107
108
109
110
111
112
113
114	/**************************************************************************
115	*
116	* Code for the hash table.
117	*
118	*/
119
120	static Hash_Table *
121	HT_create (query_data * qd, unsigned max_entries,
122	unsigned max_size)
123	{
124	unsigned es, hts;
125	Hash_Table *HT;
126	if (max_entries == 0)
127	max_entries = 8192;
128	es = sizeof (Hash_Table) + sizeof (Invf_Doc_EntryH) * (max_entries - 1);
129	hts = sizeof (Invf_Doc_EntryH ) max_size;
130
131	HT = Xmalloc (es);
132	if (!HT)
133	return NULL;
134
135	HT->HashTable = Xmalloc (hts);
136	if (!(HT->HashTable))
137	{
138	Xfree (HT);
139	return NULL;
140	}
141
142	ChangeMemInUse (qd, es + hts);
143	bzero ((char *) (HT->HashTable), hts);
144	HT->max = max_entries;
145	HT->num = 0;
146	HT->Suplimentary_Entries = NULL;
147	HT->Suplimentary_Size = 0;
148	HT->Suplimentary_Num = 0;
149	HT->Head = NULL;
150	HT->size = max_size;
151	return HT;
152	}
153
154
155	static void
156	HT_Sup_create (query_data * qd, Hash_Table * HT, unsigned max_size)
157	{
158	int es = sizeof (Invf_Doc_EntryH) * max_size;
159	HT->Suplimentary_Entries = Xmalloc (es);
160	if (!HT->Suplimentary_Entries)
161	return;
162	HT->Suplimentary_Size = max_size;
163	HT->Suplimentary_Num = 0;
164	ChangeMemInUse (qd, es);
165	}
166
167	static void
168	HT_Sup_Add (Hash_Table * HT, mg_u_long DN, float Sum)
169	{
170	Invf_Doc_EntryH *IDEH;
171	IDEH = &HT->Suplimentary_Entries[HT->Suplimentary_Num++];
172	IDEH->next = NULL;
173	IDEH->IDE.DocNum = DN;
174	IDEH->IDE.Sum = Sum;
175	}
176
177	Invf_Doc_Entry *
178	HT_find (Hash_Table * HT, mg_u_long DN)
179	{
180	Invf_Doc_EntryH *IDEH;
181	if (!HT->HashTable)
182	FatalError (1, "Accessing the hash_table through HT_find after HT_sort");
183	IDEH = HT->HashTable[DN % HT->size];
184	while (IDEH)
185	{
186	if (IDEH->IDE.DocNum == DN)
187	return &(IDEH->IDE);
188	IDEH = IDEH->next;
189	}
190	return NULL;
191	}
192
193
194	static int
195	HT_insert (Hash_Table * HT,
196	mg_u_long DN, float Sum)
197	{
198	Invf_Doc_EntryH **IDEH;
199	if (HT->num == HT->max)
200	return 0;
201	IDEH = &(HT->HashTable[DN % HT->size]);
202	while (*IDEH)
203	IDEH = &((*IDEH)->next);
204	*IDEH = &(HT->IDEH[HT->num++]);
205	(*IDEH)->next = NULL;
206	(*IDEH)->IDE.DocNum = DN;
207	(*IDEH)->IDE.Sum = Sum;
208	return 1;
209	}
210
211
212	void
213	HT_free (query_data * qd, Hash_Table * HT)
214	{
215	unsigned es;
216	es = sizeof (Hash_Table) + sizeof (Invf_Doc_EntryH) * HT->max +
217	sizeof (Invf_Doc_EntryH ) HT->size;
218	if (HT->HashTable)
219	Xfree (HT->HashTable);
220	if (HT->Suplimentary_Entries)
221	{
222	Xfree (HT->Suplimentary_Entries);
223	es += sizeof (Invf_Doc_EntryH) * HT->Suplimentary_Size;
224	}
225	Xfree (HT);
226	ChangeMemInUse (qd, -es);
227	}
228
229
230	static int
231	IDEH_comp (const void a, const void b)
232	{
233	const Invf_Doc_EntryH *A = a;
234	const Invf_Doc_EntryH *B = b;
235	return A->IDE.DocNum - B->IDE.DocNum;
236	}
237
238	static void
239	HT_sort (query_data * qd, Hash_Table * HT)
240	{
241	int i;
242	Invf_Doc_EntryH l, r, **b;
243	unsigned es = sizeof (Invf_Doc_EntryH ) HT->size;
244	if (HT->HashTable)
245	Xfree (HT->HashTable);
246	HT->size = 0;
247	HT->HashTable = NULL;
248	ChangeMemInUse (qd, -es);
249
250	qsort (HT->IDEH, HT->num, sizeof (Invf_Doc_EntryH), IDEH_comp);
251
252	for (i = 0; i < HT->num - 1; i++)
253	HT->IDEH[i].next = &HT->IDEH[i + 1];
254	HT->IDEH[i].next = NULL;
255	l = HT->IDEH;
256
257	if (HT->Suplimentary_Entries)
258	{
259	for (i = 0; i < HT->Suplimentary_Num - 1; i++)
260	HT->Suplimentary_Entries[i].next = &HT->Suplimentary_Entries[i + 1];
261	HT->Suplimentary_Entries[i].next = NULL;
262	}
263	r = HT->Suplimentary_Entries;
264
265	b = &HT->Head;
266
267	while (l \|\| r)
268	{
269	if (r && (!l \|\| (l->IDE.DocNum > r->IDE.DocNum)))
270	{
271	*b = r;
272	b = &r->next;
273	r = *b;
274	}
275	else
276	{
277	*b = l;
278	b = &l->next;
279	l = *b;
280	}
281	}
282	*b = NULL;
283	}
284
285
286
287
288
289
290	/**************************************************************************
291	*
292	* Code for the hash table.
293	*
294	*/
295
296	static List_Table *
297	LT_create (query_data * qd, unsigned max_entries)
298	{
299	unsigned es;
300	List_Table *LT;
301	if (max_entries == 0)
302	max_entries = 8192;
303	es = sizeof (List_Table) + sizeof (Invf_Doc_Entry) * (max_entries - 1);
304
305	LT = Xmalloc (es);
306	if (!LT)
307	return NULL;
308
309	ChangeMemInUse (qd, es);
310	LT->max = max_entries;
311	LT->num = 0;
312	return LT;
313	}
314
315
316
317	Invf_Doc_Entry *
318	LT_find (List_Table * LT, mg_u_long DN)
319	{
320	int l = 0;
321	int r = LT->num - 1;
322	int m, c;
323	while (l <= r)
324	{
325	m = (l + r) / 2;
326	c = DN - LT->IDE[m].DocNum;
327	if (c < 0)
328	r = m - 1;
329	else if (c > 0)
330	l = m + 1;
331	else
332	return &LT->IDE[m];
333	}
334	return NULL;
335	}
336
337	static List_Table *
338	LT_add (query_data * qd, List_Table * LT,
339	mg_u_long DN, float Sum)
340	{
341	Invf_Doc_Entry *ide;
342
343	/* increase the size of the list if there is not enough */
344	/* room for this entry */
345	if (LT->num == LT->max)
346	{
347	int old, new, max_entries;
348	old = sizeof (List_Table) + sizeof (Invf_Doc_Entry) * LT->max;
349	max_entries = LT->max + (LT->max >> 1);
350	new = sizeof (List_Table) + sizeof (Invf_Doc_Entry) * max_entries;
351
352	LT = Xrealloc (LT, new);
353	if (!LT)
354	return NULL;
355
356	ChangeMemInUse (qd, new - old);
357	LT->max = max_entries;
358	}
359
360	/* add the entry to the end of the list */
361	ide = &LT->IDE[LT->num];
362	ide->DocNum = DN;
363	ide->Sum = Sum;
364	LT->num++;
365
366	return LT;
367	}
368
369
370	void
371	LT_free (query_data * qd, List_Table * LT)
372	{
373	unsigned es;
374	es = sizeof (List_Table) + sizeof (Invf_Doc_Entry) * LT->max;
375	Xfree (LT);
376	ChangeMemInUse (qd, -es);
377	}
378
379
380	static int
381	IDE_comp (const void a, const void b)
382	{
383	const Invf_Doc_Entry *A = a;
384	const Invf_Doc_Entry *B = b;
385	return A->DocNum - B->DocNum;
386	}
387
388	static void
389	LT_sort (List_Table * LT)
390	{
391	qsort (LT->IDE, LT->num, sizeof (Invf_Doc_Entry), IDE_comp);
392	}
393
394
395	/* [RJM 07/97: Ranked Required Terms] */
396	/* accumulators with a sum less than or equal to 0.0 are to be deleted */
397	static void
398	LT_pack (List_Table * LT)
399	{
400	int s, d = 0;
401	for (s = 0; s < LT->num; s++) {
402	if (LT->IDE[s].Sum <= 0.0) {
403	/* ignore this accumulator */
404
405	} else if (d > 0 && LT->IDE[s].DocNum == LT->IDE[d-1].DocNum) {
406	LT->IDE[d-1].Sum += LT->IDE[s].Sum;
407
408	} else {
409	if (d != s) LT->IDE[d] = LT->IDE[s];
410	d++;
411	}
412	}
413
414	if (LT->num) LT->num = d;
415	}
416
417
418	/**************************************************************************/
419
420
421
422	invf_data *
423	InitInvfFile (File * InvfFile, stemmed_dict * sd)
424	{
425	invf_data *id;
426	if (!(id = Xmalloc (sizeof (invf_data))))
427	{
428	mg_errno = MG_NOMEM;
429	return NULL;
430	}
431
432	id->InvfFile = InvfFile;
433
434	Fread (&id->ifh, sizeof (id->ifh), 1, InvfFile);
435	/* [RPAP - Jan 97: Endian Ordering] */
436	{
437	int i;
438
439	NTOHUL(id->ifh.no_of_words);
440	NTOHUL(id->ifh.no_of_ptrs);
441	NTOHUL(id->ifh.skip_mode);
442	for (i = 0; i < 16; i++)
443	NTOHUL(id->ifh.params[i]);
444	NTOHUL(id->ifh.InvfLevel);
445	}
446
447	id->N = sd->sdh.num_of_docs;
448	id->Nstatic = sd->sdh.static_num_of_docs;
449
450	return (id);
451	}
452
453
454	void
455	FreeInvfData (invf_data * id)
456	{
457	Xfree (id);
458	}
459
460	/****************************************************************************/
461
462
463	static void
464	CalcBlks (query_data * qd, WordEntry * we, int *blk,
465	int dn_blk, int len_blk, int k, int p)
466	{
467	*p = we->doc_count;
468
469	blk = BIO_Bblock_Init (qd->id->Nstatic, p);
470	switch (qd->id->ifh.skip_mode)
471	{
472	case 1:
473	{
474	mg_u_long len;
475	if (*p <= qd->id->ifh.params[0])
476	*k = 0;
477	else
478	{
479	*k = qd->id->ifh.params[0];
480	len = BIO_Bblock_Bound (qd->id->N, *p);
481	if (qd->id->ifh.InvfLevel >= 2)
482	len += we->count;
483	dn_blk = BIO_Bblock_Init (qd->sd->sdh.num_of_docs, (p + k - 1) / k);
484	len_blk = BIO_Bblock_Init (len, (p + k - 1) / k);
485	}
486	break;
487	}
488	case 2:
489	{
490	mg_u_long len;
491	if (*p <= qd->id->ifh.params[1])
492	*k = 0;
493	else
494	{
495	k = (int) (2 sqrt (((double) (*p)) / qd->id->ifh.params[0]));
496	if (*k <= qd->id->ifh.params[1])
497	*k = qd->id->ifh.params[1];
498	len = BIO_Bblock_Bound (qd->id->N, *p);
499	if (qd->id->ifh.InvfLevel >= 2)
500	len += we->count;
501	*dn_blk = BIO_Bblock_Init (qd->sd->sdh.num_of_docs,
502	(p + k - 1) / *k);
503	len_blk = BIO_Bblock_Init (len, (p + k - 1) / k);
504	}
505	break;
506	}
507	default:
508	*k = 0;
509	}
510	}
511
512
513	/* =========================================================================
514	* Function: GetDocsOp
515	* Description:
516	* if op == op_term then
517	* look up term in invf and decode all its entries/documents
518	* if op == op_and_terms then
519	* go thru L, testing if it exists in invf using skips
520	* if it does exist then overwrite the entry nearer the start of the list
521	* if op == op_and_or_terms then
522	* go thru L, testing if it exists in invf using skips
523	* BUT only mark ones which are found - keep list intact
524	* if op == op_diff_terms then >>>> NOT tested <<<<
525	* Input:
526	* Output:
527	* ========================================================================= */
528
529	DocList *
530	GetDocsOp (query_data * qd, WordEntry * we, Op_types op, DocList * L)
531	{
532	u_char *buffer;
533	DocEntry src = NULL, dest = NULL;
534	int i; /* index counter into inverted list */
535	mg_u_long pos;
536	int next_start, next_doc_num, next_i;
537	int block_counter; /* within counter in block */
538	int dn_blk = 0, len_blk = 0;
539	int skip_size, have_skipping;
540	int blk; /* bblock control parameter */
541	int ft; /* The number of documents the word occurs in */
542	mg_u_long CurrDoc = 0; /* NOTE: Document numbers start at 1 */
543	float Weight;
544
545	/* ... AND empty */
546	if (op == op_and_terms && (!L \|\| !L->num))
547	{
548	if (L)
549	Xfree (L);
550	return MakeDocList (0);
551	}
552
553	if (op == op_term)
554	L = MakeDocList (we->doc_count);
555
556	CalcBlks (qd, we, &blk, &dn_blk, &len_blk, &skip_size, &ft);
557	have_skipping = (skip_size != 0);
558	if (!(buffer = Xmalloc (we->invf_len)))
559	return MakeDocList (0);
560
561	ChangeMemInUse (qd, we->invf_len);
562
563	qd->num_of_terms++;
564
565	/* --- Read an inverted file entry for the given word --- */
566	Fseek (qd->File_invf, we->invf_ptr, 0);
567	Fread (buffer, sizeof (char), we->invf_len, qd->File_invf);
568
569	/* Read from the buffer */
570	DECODE_START (buffer, we->invf_len);
571
572	Weight = qd->sd->sdh.num_of_words / (100 * log2 ((float) we->count + 1));
573
574	if (op != op_term)
575	dest = src = L->DE;
576	i = pos = 0;
577	next_i = next_start = next_doc_num = 0;
578	block_counter = skip_size - 1;
579
580	/* Go thru the elements of an inverted list for a term */
581	/* May not go sequentially if skipping is done */
582	while (i < ft && ((op == op_term) \|\| src - L->DE < L->num))
583	{
584	mg_u_long diff, Temp;
585
586	if (have_skipping)
587	{
588
589	/* --- Skip to next block --- */
590	if (op != op_term && i + skip_size < ft &&
591	src->DocNum > next_doc_num && next_doc_num >= 0)
592	{
593	ShortCut:
594	i = next_i;
595	DECODE_SEEK (next_start);
596	pos = next_start;
597	CurrDoc = next_doc_num;
598	qd->hops_taken++;
599	block_counter = skip_size - 1;
600	}
601
602	/* --- Decode the skip block header --- */
603	if (block_counter == skip_size - 1)
604	{
605	if (i + skip_size < ft)
606	{
607	/* --- Decode the skip block header --- */
608	int doc_diff, pos_diff;
609	BBLOCK_DECODE_L (doc_diff, dn_blk, pos);
610	next_doc_num += doc_diff;
611	next_i += skip_size;
612	BBLOCK_DECODE_L (pos_diff, len_blk, pos);
613	next_start = pos + pos_diff;
614	}
615	else
616	next_doc_num = -1;
617	}
618
619	/* --- Do a skip if c-set doc is higher than block header doc --- */
620	if (op != op_term && i + skip_size < ft &&
621	src->DocNum > next_doc_num && next_doc_num >= 0)
622	goto ShortCut;
623
624
625	} /if skipping /
626
627	/* --- increment CurrDoc --- */
628	if (have_skipping && block_counter == 0 && i != ft - 1)
629	CurrDoc = next_doc_num;
630	else
631	{
632	BBLOCK_DECODE_L (diff, blk, pos);
633	CurrDoc += diff;
634	}
635
636	/* --- If have f_dt's then eat them up --- */
637	if (qd->id->ifh.InvfLevel >= 2)
638	GAMMA_DECODE_L (Temp, pos); /* discard word count */
639
640	/* --- Increment counters --- */
641	qd->num_of_ptrs++;
642	i++;
643	block_counter--;
644	if (block_counter < 0) /* ??? Why would this happen ? */
645	block_counter = skip_size - 1;
646
647	switch (op)
648	{
649	case op_term:
650	/* --- Add the doc from invf onto the c-set list --- */
651	L->DE[L->num].DocNum = CurrDoc;
652	L->DE[L->num].Weight = Weight;
653	L->DE[L->num].SeekPos = 0;
654	L->DE[L->num].Len = 0;
655	L->DE[L->num].CompTextBuffer = NULL;
656	L->DE[L->num].Next = NULL;
657	L->DE[L->num].or_included = 0;
658	L->num++;
659	break;
660
661	case op_and_terms:
662	case op_and_or_terms:
663	/* --- see if have a match and save it --- */
664
665	while (src < (L->DE + L->num) && CurrDoc > src->DocNum)
666	{
667	src++;
668	}
669	/* Unless at end of list, assert(CurrDoc <= src->DocNum) */
670
671	/* --- Accept the doc as a candidate --- */
672	if (src < (L->DE + L->num) && CurrDoc == src->DocNum)
673	{
674	if (op == op_and_terms)
675	{
676	dest++ = src;
677	}
678	else
679	{
680	src->or_included = 1;
681	}
682	src++; /* Move onto next one */
683	}
684
685	/* If the next one to try has already been tried then move on again */
686	if (op == op_and_or_terms)
687	{
688	while (src < (L->DE + L->num) && src->or_included)
689	src++;
690	}
691
692	break;
693
694	case op_diff_terms:
695	while (src < (L->DE + L->num) && src->DocNum < CurrDoc)
696	dest++ = src++;
697	if (src < (L->DE + L->num) && src->DocNum == CurrDoc)
698	src++;
699	break;
700
701	default:
702	break;
703	} /switch /
704
705	} /while traversing invf /
706
707	if (op == op_and_terms \|\| op == op_diff_terms)
708	{
709	if (op == op_diff_terms)
710	while (src - L->DE < L->num)
711	dest++ = src++;
712
713	/* update size of c-set */
714	L->num = dest - L->DE;
715	}
716	DECODE_DONE
717
718	Xfree (buffer);
719	ChangeMemInUse (qd, -we->invf_len);
720
721	return (L);
722	}
723
724
725
726
727
728
729
730	float *
731	CosineDecode (query_data * qd, TermList * Terms, RankedQueryInfo * rqi)
732	{
733	float *AccumulatedWeights;
734	int j, num_docs, num_terms;
735	int seen_must_match = 0; /* [RJM 07/97: Ranked Required Terms] says whether to 'and' queries or not */
736
737	num_docs = qd->sd->sdh.num_of_docs;
738	/* Create the array for the documents */
739	if (!(AccumulatedWeights = Xmalloc (num_docs * sizeof (float))))
740	return (NULL);
741
742	bzero ((char ) AccumulatedWeights, sizeof (float) num_docs);
743	ChangeMemInUse (qd, sizeof (float) * num_docs);
744
745	if (rqi->MaxTerms == -1)
746	num_terms = Terms->num;
747	else
748	num_terms = rqi->MaxTerms < Terms->num ? rqi->MaxTerms : Terms->num;
749
750
751	/* For each word in the list . . . */
752	for (j = 0; j < num_terms; j++)
753	{
754	u_char *buffer;
755	int i, kd;
756	mg_u_long next_mjr_dn = 0;
757	mg_u_long CurrDocNum = 0; /* NOTE: Document numbers start at 1 */
758	mg_u_long LastDocNum = 0; /* [RJM 07/97: Ranked Required Terms] */
759	mg_u_long TempDocNum = 0; /* [RJM 07/97: Ranked Required Terms] */
760	double Wqt, WordLog;
761	int dn_blk = 0, len_blk = 0, k;
762	int blk; /* bblock control parameter */
763	int p; /* The number of documents the word occurs in */
764	int require_match = Terms->TE[j].require_match; /* [RJM 07/97: Ranked Required Terms] */
765	WordEntry *we = &Terms->TE[j].WE;
766	CalcBlks (qd, we, &blk, &dn_blk, &len_blk, &k, &p);
767	if (!(buffer = Xmalloc (we->invf_len)))
768	break;
769
770	ChangeMemInUse (qd, we->invf_len);
771
772	Fseek (qd->File_invf, we->invf_ptr, 0);
773	Fread (buffer, sizeof (char), we->invf_len, qd->File_invf);
774
775	/* Read from the buffer */
776	DECODE_START (buffer, we->invf_len);
777
778	WordLog = log ((double) num_docs / (double) we->max_doc_count); /* [RPAP - Jan 97: Stem Index Change] */
779
780	Wqt = (rqi->QueryFreqs ? Terms->TE[j].Count : 1) * WordLog;
781
782	qd->num_of_terms++;
783
784	kd = 0;
785	for (i = 0; i < p; i++, kd++)
786	{
787	int Count; /* The number of times the word occurs in a document */
788	double Wdt;
789	mg_u_long diff;
790	if (kd == k)
791	{
792	kd = 0;
793	}
794	if (k && kd == 0)
795	if (i + k < p)
796	{
797	int temp;
798	BBLOCK_DECODE (next_mjr_dn, dn_blk);
799	next_mjr_dn += CurrDocNum;
800	BBLOCK_DECODE (temp, len_blk);
801	}
802	if (k && kd == k - 1 && i != p - 1)
803	CurrDocNum = next_mjr_dn;
804	else
805	{
806	BBLOCK_DECODE (diff, blk);
807	CurrDocNum += diff;
808	}
809	if (qd->id->ifh.InvfLevel >= 2)
810	GAMMA_DECODE (Count);
811	else
812	Count = (double) we->count / p;
813	Wdt = Count * WordLog;
814
815	/* [RJM 07/97: Ranked Required Terms] */
816	/* zero the weights of intermediate documents if this term is required to match */
817	if (require_match == 1) {
818	for (TempDocNum = LastDocNum+1; TempDocNum < CurrDocNum; TempDocNum++)
819	AccumulatedWeights[TempDocNum - 1] = 0.0;
820	}
821
822	/* [RJM 07/97: Ranked Required Terms] */
823	/* if a term has been required to match before, the weight we are */
824	/* adding to must be non-zero */
825	if (!seen_must_match \|\| (AccumulatedWeights[CurrDocNum - 1] > 0.0)) {
826	AccumulatedWeights[CurrDocNum - 1] += Wqt * Wdt;
827	}
828
829	qd->num_of_ptrs++;
830	LastDocNum = CurrDocNum; /* [RJM 07/97: Ranked Required Terms] */
831	}
832
833	DECODE_DONE
834
835	Xfree (buffer);
836	ChangeMemInUse (qd, -we->invf_len);
837
838	/* [RJM 07/97: Ranked Required Terms] */
839	/* zero the remaining weights if this term is required to match */
840	if (require_match == 1) {
841	for (TempDocNum = LastDocNum+1; TempDocNum <= num_docs; TempDocNum++)
842	AccumulatedWeights[TempDocNum - 1] = 0.0;
843	seen_must_match = 1;
844	}
845	}
846	return AccumulatedWeights;
847	}
848
849
850
851	static int
852	doc_comp (void a, void b)
853	{
854	return ((Invf_Doc_Entry ) a)->DocNum - ((Invf_Doc_Entry ) b)->DocNum;
855	}
856
857
858	static Invf_Doc_Entry *
859	get_ide (query_data * qd, Invf_Doc_Entry_Pool * pool)
860	{
861	struct pool *p = pool->pool;
862	if (!p \|\| p->pos == p->length)
863	{
864	register unsigned size = sizeof (struct pool) +
865	(pool->pool_chunks - 1) * sizeof (Invf_Doc_Entry);
866	p = malloc (size);
867	if (!p)
868	return NULL;
869	ChangeMemInUse (qd, size);
870	p->length = pool->pool_chunks;
871	p->pos = 0;
872	p->next = pool->pool;
873	pool->pool = p;
874	}
875	return &(p->ide[p->pos++]);
876	}
877
878
879	void
880	free_ide_pool (query_data * qd, Invf_Doc_Entry_Pool * pool)
881	{
882	while (pool->pool)
883	{
884	struct pool *p = pool->pool;
885	pool->pool = p->next;
886	ChangeMemInUse (qd, -(sizeof (struct pool) + (p->length - 1) *
887	sizeof (Invf_Doc_Entry)));
888	free (p);
889	}
890	pool->pool = NULL;
891	}
892
893
894	FILE *
895	open_skip (char *sd)
896	{
897	char buf[256];
898	if (sd == NULL \|\| *sd == '\0')
899	return NULL;
900	sprintf (buf, sd, getpid ());
901	return fopen (buf, "a");
902	}
903
904
905
906	Splay_Tree *
907	CosineDecodeSplay (query_data * qd, TermList * Terms,
908	RankedQueryInfo * rqi, Invf_Doc_Entry_Pool * pool)
909	{
910	Splay_Tree *Doc_Set;
911	int j, num_docs, num_terms, MoreAccum;
912	int Anding = 0;
913	FILE *sk = open_skip (rqi->skip_dump);
914	u_char indent = 0;
915	if (sk)
916	{
917	if (!skip_header)
918	{
919	skip_header = 1;
920	fprintf (sk, "%s", skip_header_text[qd->id->ifh.skip_mode]);
921	fprintf (sk, "\nSkipping method %d\n", qd->id->ifh.skip_mode);
922	switch (qd->id->ifh.skip_mode)
923	{
924	case 1:
925	fprintf (sk, "Skipping is every %d docnums\n",
926	qd->id->ifh.params[0]);
927	break;
928	case 2:
929	fprintf (sk, "Max nodes = %d\nNo skips smaller or equal to %d\n",
930	qd->id->ifh.params[0], qd->id->ifh.params[1]);
931	break;
932	}
933	}
934	fprintf (sk, "\n\t\t\t-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-\n\n");
935	for (j = 0; j < Terms->num; j++)
936	if (Terms->TE[j].Word[0] > indent)
937	indent = Terms->TE[j].Word[0];
938	}
939
940	num_docs = qd->sd->sdh.num_of_docs;
941	/* Create the array for the documents */
942	Doc_Set = SP_createset (doc_comp);
943
944	pool->pool = NULL;
945	pool->pool_chunks = 1024;
946
947	if (rqi->MaxTerms == -1)
948	num_terms = Terms->num;
949	else
950	num_terms = rqi->MaxTerms < Terms->num ? rqi->MaxTerms : Terms->num;
951
952	MoreAccum = 1;
953
954	/* For each word in the list . . . */
955	for (j = 0; j < num_terms; j++)
956	{
957	int Abort = 0;
958	u_char *buffer;
959	int pos, i;
960	mg_u_long CurrDocNum = 0; /* NOTE: Document numbers start at 1 */
961	double Wqt, WordLog;
962	int dn_blk = 0, len_blk = 0, k, kd;
963	int blk; /* bblock control parameter */
964	int p; /* The number of documents the word occurs in */
965	WordEntry *we = &Terms->TE[j].WE;
966	Invf_Doc_Entry *next = NULL;
967	int next_i, next_doc_num, next_start;
968	int skips_taken = 0, ptrs_dec = 0;
969	double total_added = 0.0;
970	int hits = 0;
971	if (sk)
972	{
973	fprintf (sk, "%3d : %8d %6d \"%.s\"%s", j, we->count,
974	we->doc_count, Terms->TE[j].Word[0], Terms->TE[j].Word + 1,
975	indent - Terms->TE[j].Word[0], "");
976	}
977
978	CalcBlks (qd, we, &blk, &dn_blk, &len_blk, &k, &p);
979	if (!(buffer = Xmalloc (we->invf_len)))
980	break;
981
982	ChangeMemInUse (qd, we->invf_len);
983
984	if (sk && qd->id->ifh.skip_mode == 2)
985	fprintf (sk, "%4d", k);
986
987	Fseek (qd->File_invf, we->invf_ptr, 0);
988	Fread (buffer, sizeof (char), we->invf_len, qd->File_invf);
989
990	/* Read from the buffer */
991	DECODE_START (buffer, we->invf_len);
992
993	WordLog = log ((double) num_docs / (double) we->max_doc_count); /* [RPAP - Jan 97: Stem Index Change] */
994
995	Wqt = (rqi->QueryFreqs ? Terms->TE[j].Count : 1) * WordLog;
996
997	qd->num_of_terms++;
998
999	if (rqi->MaxAccums != -1 && Doc_Set->no_of_items >= rqi->MaxAccums)
1000	{
1001	MoreAccum = 0;
1002	Anding = 1;
1003	next = SP_get_first (Doc_Set);
1004	}
1005	i = pos = 0;
1006	next_i = next_start = next_doc_num = 0;
1007	kd = k - 1;
1008	while (i < p && (!Anding \|\| next))
1009	{
1010	int Count; /* The number of times the occurs in a document */
1011	double Wdt;
1012	mg_u_long diff;
1013	Invf_Doc_Entry ent, *mem;
1014	if (k)
1015	{
1016	if (Anding && i + k < p && next->DocNum > next_doc_num &&
1017	next_doc_num >= 0)
1018	{
1019	i = next_i;
1020	DECODE_SEEK (next_start);
1021	pos = next_start;
1022	CurrDocNum = next_doc_num;
1023	qd->hops_taken++;
1024	skips_taken++;
1025	kd = k - 1;
1026	}
1027	if (kd == k - 1)
1028	{
1029	if (i + k < p)
1030	{
1031	int doc_diff, pos_diff;
1032	BBLOCK_DECODE_L (doc_diff, dn_blk, pos);
1033	next_doc_num += doc_diff;
1034	next_i += k;
1035	BBLOCK_DECODE_L (pos_diff, len_blk, pos);
1036	next_start = pos + pos_diff;
1037	}
1038	else
1039	{
1040	next_doc_num = -1;
1041	}
1042	}
1043	if (Anding && i + k < p && next->DocNum > next_doc_num &&
1044	next_doc_num >= 0)
1045	continue;
1046	}
1047	if (k && kd == 0 && i != p - 1)
1048	CurrDocNum = next_doc_num;
1049	else
1050	{
1051	BBLOCK_DECODE_L (diff, blk, pos);
1052	CurrDocNum += diff;
1053	}
1054	qd->num_of_ptrs++;
1055	ptrs_dec++;
1056	if (qd->id->ifh.InvfLevel >= 2)
1057	GAMMA_DECODE_L (Count, pos);
1058	else
1059	Count = (double) we->count / p;
1060	i++;
1061	kd--;
1062	if (kd < 0)
1063	kd = k - 1;
1064	Wdt = Count * WordLog;
1065
1066
1067	if (Anding)
1068	{
1069	while (next && next->DocNum < CurrDocNum - 1)
1070	next = SP_get_next (Doc_Set);
1071	if (next && next->DocNum == CurrDocNum - 1)
1072	{
1073	next->Sum += Wqt * Wdt;
1074	total_added += Wqt * Wdt;
1075	hits++;
1076	}
1077	if (next && next->DocNum <= CurrDocNum - 1)
1078	next = SP_get_next (Doc_Set);
1079	goto NextDocNum;
1080	}
1081	ent.DocNum = CurrDocNum - 1;
1082	if ((mem = SP_member (&ent, Doc_Set)) == NULL)
1083	{
1084	hits++;
1085	if (rqi->MaxAccums != -1 &&
1086	Doc_Set->no_of_items >= rqi->MaxAccums &&
1087	!MoreAccum)
1088	goto NextDocNum;
1089	if ((mem = get_ide (qd, pool)) == NULL)
1090	return NULL;
1091	ChangeMemInUse (qd, sizeof (*mem));
1092	mem->DocNum = CurrDocNum - 1;
1093	mem->Sum = 0;
1094	ChangeMemInUse (qd, -Doc_Set->mem_in_use);
1095	if (SP_insert (mem, Doc_Set) == NULL)
1096	{
1097	Message ("Unable to allocate memory for a splay node");
1098	Abort = 1;
1099	goto FastAbort;
1100	}
1101	ChangeMemInUse (qd, Doc_Set->mem_in_use);
1102	if (rqi->MaxAccums != -1 &&
1103	Doc_Set->no_of_items >= rqi->MaxAccums &&
1104	rqi->StopAtMaxAccum)
1105	Abort = 1;
1106	}
1107
1108	mem->Sum += Wqt * Wdt;
1109	total_added += Wqt * Wdt;
1110
1111	NextDocNum:
1112	;
1113	}
1114
1115	FastAbort:
1116
1117	DECODE_DONE
1118
1119	Xfree (buffer);
1120	ChangeMemInUse (qd, -we->invf_len);
1121	if (sk)
1122	{
1123	if (qd->id->ifh.skip_mode != 0)
1124	fprintf (sk, " %6d", skips_taken);
1125	fprintf (sk, " %6d %6d %6d%12.2f\n",
1126	ptrs_dec, hits, Doc_Set->no_of_items, total_added);
1127	}
1128	if (Abort)
1129	break;
1130	}
1131	if (sk)
1132	fclose (sk);
1133	return Doc_Set;
1134	}
1135
1136
1137
1138
1139	Hash_Table *
1140	CosineDecodeHash (query_data * qd, TermList * Terms,
1141	RankedQueryInfo * rqi)
1142	{
1143	int j, num_docs, num_terms, MoreAccum;
1144	Hash_Table *HT;
1145	int Anding = 0;
1146	int Sorted = 0;
1147	FILE *sk = open_skip (rqi->skip_dump);
1148	u_char indent = 0;
1149	if (sk)
1150	{
1151	if (!skip_header)
1152	{
1153	skip_header = 1;
1154	fprintf (sk, "%s", skip_header_text[qd->id->ifh.skip_mode]);
1155	fprintf (sk, "\nSkipping method %d\n", qd->id->ifh.skip_mode);
1156	switch (qd->id->ifh.skip_mode)
1157	{
1158	case 1:
1159	fprintf (sk, "Skipping is every %d docnums\n",
1160	qd->id->ifh.params[0]);
1161	break;
1162	case 2:
1163	fprintf (sk, "Max nodes = %d\nNo skips smaller or equal to %d\n",
1164	qd->id->ifh.params[0], qd->id->ifh.params[1]);
1165	break;
1166	}
1167	}
1168	fprintf (sk, "\n\t\t\t-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-\n\n");
1169	for (j = 0; j < Terms->num; j++)
1170	if (Terms->TE[j].Word[0] > indent)
1171	indent = Terms->TE[j].Word[0];
1172	}
1173
1174	num_docs = qd->sd->sdh.num_of_docs;
1175	/* Create the array for the documents */
1176	HT = HT_create (qd, rqi->MaxAccums, rqi->HashTblSize);
1177	if (!HT)
1178	return NULL;
1179
1180	if (rqi->MaxTerms == -1)
1181	num_terms = Terms->num;
1182	else
1183	num_terms = rqi->MaxTerms < Terms->num ? rqi->MaxTerms : Terms->num;
1184
1185	MoreAccum = 1;
1186
1187	/* For each word in the list . . . */
1188	for (j = 0; j < num_terms; j++)
1189	{
1190	int Abort = 0;
1191	u_char *buffer;
1192	int pos;
1193	int i;
1194	mg_u_long CurrDocNum = 0; /* NOTE: Document numbers start at 1 */
1195	double Wqt, WordLog;
1196	int dn_blk = 0, len_blk = 0, k, kd;
1197	int blk; /* bblock control parameter */
1198	int p; /* The number of documents the word occurs in */
1199	WordEntry *we = &Terms->TE[j].WE;
1200	Invf_Doc_EntryH *next = NULL;
1201	int next_i, next_doc_num, next_start;
1202	int skips_taken = 0, ptrs_dec = 0;
1203	double total_added = 0.0;
1204	int hits = 0;
1205	if (sk)
1206	{
1207	fprintf (sk, "%3d : %8d %6d \"%.s\"%s", j, we->count,
1208	we->doc_count, Terms->TE[j].Word[0], Terms->TE[j].Word + 1,
1209	indent - Terms->TE[j].Word[0], "");
1210	}
1211
1212	CalcBlks (qd, we, &blk, &dn_blk, &len_blk, &k, &p);
1213	if (!(buffer = Xmalloc (we->invf_len)))
1214	break;
1215
1216	ChangeMemInUse (qd, we->invf_len);
1217
1218	if (sk && qd->id->ifh.skip_mode == 2)
1219	fprintf (sk, "%4d", k);
1220
1221	Fseek (qd->File_invf, we->invf_ptr, 0);
1222	Fread (buffer, sizeof (char), we->invf_len, qd->File_invf);
1223
1224	/* Read from the buffer */
1225	DECODE_START (buffer, we->invf_len);
1226
1227	WordLog = log ((double) num_docs / (double) we->max_doc_count); /* [RPAP - Jan 97: Stem Index Change] */
1228
1229	Wqt = (rqi->QueryFreqs ? Terms->TE[j].Count : 1) * WordLog;
1230
1231	qd->num_of_terms++;
1232
1233	if (rqi->MaxAccums != -1 && HT->num >= rqi->MaxAccums)
1234	{
1235	MoreAccum = 0;
1236	Anding = 1;
1237	if (!Sorted)
1238	{
1239	HT_sort (qd, HT);
1240	Sorted = 1;
1241	}
1242	next = HT->Head;
1243	}
1244	i = pos = 0;
1245	next_i = next_start = next_doc_num = 0;
1246	kd = k - 1;
1247	while (i < p && (!Anding \|\| next))
1248	{
1249	int Count; /* The number of times the occurs in a document */
1250	double Wdt;
1251	mg_u_long diff;
1252	Invf_Doc_Entry *mem;
1253	if (k)
1254	{
1255	if (Anding && i + k < p && next->IDE.DocNum > next_doc_num &&
1256	next_doc_num >= 0)
1257	{
1258	#ifdef SHORT_CUT
1259	ShortCut:
1260	#endif
1261	i = next_i;
1262	DECODE_SEEK (next_start);
1263	pos = next_start;
1264	CurrDocNum = next_doc_num;
1265	qd->hops_taken++;
1266	skips_taken++;
1267	kd = k - 1;
1268	}
1269	if (kd == k - 1)
1270	{
1271	if (i + k < p)
1272	{
1273	int doc_diff, pos_diff;
1274	BBLOCK_DECODE_L (doc_diff, dn_blk, pos);
1275	next_doc_num += doc_diff;
1276	next_i += k;
1277	BBLOCK_DECODE_L (pos_diff, len_blk, pos);
1278	next_start = pos + pos_diff;
1279	}
1280	else
1281	next_doc_num = -1;
1282	}
1283	#ifdef SHORT_CUT
1284	if (Anding && i + k < p && next->IDE.DocNum > next_doc_num &&
1285	next_doc_num >= 0)
1286	goto ShortCut;
1287	#else
1288	if (Anding && i + k < p && next->IDE.DocNum > next_doc_num &&
1289	next_doc_num >= 0)
1290	continue;
1291	#endif
1292	}
1293	if (k && kd == 0 && i != p - 1)
1294	{
1295	CurrDocNum = next_doc_num;
1296	}
1297	else
1298	{
1299	BBLOCK_DECODE_L (diff, blk, pos);
1300	CurrDocNum += diff;
1301	}
1302	qd->num_of_ptrs++;
1303	ptrs_dec++;
1304	if (qd->id->ifh.InvfLevel >= 2)
1305	GAMMA_DECODE_L (Count, pos);
1306	else
1307	Count = (double) we->count / p;
1308	i++;
1309	kd--;
1310	if (kd < 0)
1311	kd = k - 1;
1312
1313
1314	if (Anding)
1315	{
1316	while (next && next->IDE.DocNum < CurrDocNum - 1)
1317	next = next->next;
1318	if (next && next->IDE.DocNum == CurrDocNum - 1)
1319	{
1320	Wdt = Count * WordLog;
1321	next->IDE.Sum += Wqt * Wdt;
1322	total_added += Wqt * Wdt;
1323	hits++;
1324	}
1325	if (next && next->IDE.DocNum <= CurrDocNum - 1)
1326	next = next->next;
1327	goto NextDocNum;
1328	}
1329	if ((mem = HT_find (HT, CurrDocNum - 1)) == NULL)
1330	{
1331	if (rqi->MaxAccums != -1 && HT->num >= rqi->MaxAccums &&
1332	!MoreAccum)
1333	goto NextDocNum;
1334	Wdt = Count * WordLog;
1335	if (!HT_insert (HT, CurrDocNum - 1, Wqt * Wdt))
1336	{
1337	if (!HT->Suplimentary_Entries)
1338	HT_Sup_create (qd, HT, we->doc_count - i + 1);
1339	HT_Sup_Add (HT, CurrDocNum - 1, Wqt * Wdt);
1340	}
1341	if (rqi->MaxAccums != -1 && HT->num >= rqi->MaxAccums &&
1342	rqi->StopAtMaxAccum)
1343	Abort = 1;
1344	}
1345	else
1346	{
1347	Wdt = Count * WordLog;
1348	mem->Sum += Wqt * Wdt;
1349	total_added += Wqt * Wdt;
1350	hits++;
1351	}
1352
1353	NextDocNum:
1354	;
1355	}
1356
1357	DECODE_DONE
1358
1359	Xfree (buffer);
1360	ChangeMemInUse (qd, -we->invf_len);
1361	if (sk)
1362	{
1363	if (qd->id->ifh.skip_mode != 0)
1364	fprintf (sk, " %6d", skips_taken);
1365	fprintf (sk, " %6d %6d %6d%12.2f\n",
1366	ptrs_dec, hits, HT->num + HT->Suplimentary_Num, total_added);
1367	}
1368	if (Abort)
1369	break;
1370	}
1371	if (sk)
1372	fclose (sk);
1373	if (!HT->Head)
1374	HT_sort (qd, HT);
1375	return HT;
1376	}
1377
1378
1379
1380	List_Table *
1381	CosineDecodeList (query_data * qd, TermList * Terms,
1382	RankedQueryInfo * rqi)
1383	{
1384	int j, num_docs, num_terms, MoreAccum;
1385	List_Table *LT;
1386	int Anding = 0;
1387	int Sorted = 0;
1388	int has_grown = 0; /* [RJM 07/97: Ranked Required Match] whether elements have been added lately */
1389	FILE *sk = open_skip (rqi->skip_dump);
1390	u_char indent = 0;
1391	if (sk)
1392	{
1393	if (!skip_header)
1394	{
1395	skip_header = 1;
1396	fprintf (sk, "%s", skip_header_text[qd->id->ifh.skip_mode]);
1397	fprintf (sk, "\nSkipping method %d\n", qd->id->ifh.skip_mode);
1398	switch (qd->id->ifh.skip_mode)
1399	{
1400	case 1:
1401	fprintf (sk, "Skipping is every %d docnums\n",
1402	qd->id->ifh.params[0]);
1403	break;
1404	case 2:
1405	fprintf (sk, "Max nodes = %d\nNo skips smaller or equal to %d\n",
1406	qd->id->ifh.params[0], qd->id->ifh.params[1]);
1407	break;
1408	}
1409	}
1410	fprintf (sk, "\n\t\t\t-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-\n\n");
1411	for (j = 0; j < Terms->num; j++)
1412	if (Terms->TE[j].Word[0] > indent)
1413	indent = Terms->TE[j].Word[0];
1414	}
1415
1416	num_docs = qd->sd->sdh.num_of_docs;
1417
1418	/* Create the list for the documents */
1419	LT = LT_create (qd, rqi->MaxAccums);
1420	if (!LT)
1421	return NULL;
1422
1423	if (rqi->MaxTerms == -1)
1424	num_terms = Terms->num;
1425	else
1426	num_terms = rqi->MaxTerms < Terms->num ? rqi->MaxTerms : Terms->num;
1427
1428	MoreAccum = 1;
1429
1430	/* For each word in the list . . . */
1431	for (j = 0; j < num_terms; j++)
1432	{
1433	int Abort = 0;
1434	u_char *buffer;
1435	int pos;
1436	int i, n = 0;
1437	mg_u_long CurrDocNum = 0; /* NOTE: Document numbers start at 1 */
1438	mg_u_long LastDocNum = 0; /* [RJM 07/97: Ranked Required Match] */
1439	int this_item = 0; /* [RJM 07/97: Ranked Required Match] points to the next possible LT to delete */
1440	double Wqt, WordLog;
1441	int dn_blk = 0, len_blk = 0, k, kd;
1442	int blk; /* bblock control parameter */
1443	int p; /* The number of documents the word occurs in */
1444	int require_match = Terms->TE[j].require_match; /* [RJM 07/97: Ranked Required Match] */
1445	WordEntry *we = &Terms->TE[j].WE;
1446	Invf_Doc_Entry *next = NULL;
1447	int next_i, next_doc_num, next_start;
1448	int skips_taken = 0, ptrs_dec = 0;
1449	double total_added = 0.0;
1450	int hits = 0;
1451
1452	if (sk)
1453	{
1454	fprintf (sk, "%3d : %8d %6d \"%.s\"%s", j, we->count,
1455	we->doc_count, Terms->TE[j].Word[0], Terms->TE[j].Word + 1,
1456	indent - Terms->TE[j].Word[0], "");
1457	}
1458
1459	CalcBlks (qd, we, &blk, &dn_blk, &len_blk, &k, &p);
1460	if (!(buffer = Xmalloc (we->invf_len)))
1461	break;
1462
1463	ChangeMemInUse (qd, we->invf_len);
1464
1465	if (sk && qd->id->ifh.skip_mode == 2)
1466	fprintf (sk, "%4d", k);
1467
1468	Fseek (qd->File_invf, we->invf_ptr, 0);
1469	Fread (buffer, sizeof (char), we->invf_len, qd->File_invf);
1470
1471	/* Read from the buffer */
1472	DECODE_START (buffer, we->invf_len);
1473
1474	WordLog = log ((double) num_docs / (double) we->max_doc_count); /* [RPAP - Jan 97: Stem Index Change] */
1475
1476	Wqt = (rqi->QueryFreqs ? Terms->TE[j].Count : 1) * WordLog;
1477
1478	qd->num_of_terms++;
1479
1480	if (rqi->MaxAccums != -1 && LT->num >= rqi->MaxAccums)
1481	MoreAccum = 0;
1482
1483	if ((rqi->MaxAccums != -1 && LT->num >= rqi->MaxAccums) \|\| Anding)
1484	{
1485	Anding = 1;
1486	if (!Sorted \|\| has_grown)
1487	{
1488	LT_sort (LT);
1489	LT_pack (LT);
1490	Sorted = 1;
1491	has_grown = 0;
1492	}
1493	next = LT->IDE;
1494	n = 0;
1495	}
1496
1497	/* [RJM 07/97: Ranked Required Terms] */
1498	/* make sure that the document list is set up correctly for */
1499	/* requiring a term to match */
1500	if (require_match == 1) {
1501	if (!Sorted \|\| has_grown) {
1502	LT_sort (LT);
1503	LT_pack (LT);
1504	Sorted = 1;
1505	has_grown = 0;
1506	}
1507	/* this_item is already set to zero */
1508	}
1509
1510	i = pos = 0;
1511	next_i = next_start = next_doc_num = 0;
1512	kd = k - 1;
1513	while (i < p && (!Anding \|\| n < LT->num))
1514	{
1515	int Count; /* The number of times the occurs in a document */
1516	double Wdt;
1517	mg_u_long diff;
1518	Invf_Doc_Entry *mem;
1519
1520	/* deal with skipping */
1521	if (k)
1522	{
1523	/* see if we should take a skip */
1524	if (Anding && i + k < p && next->DocNum > next_doc_num &&
1525	next_doc_num >= 0)
1526	{
1527	i = next_i;
1528	DECODE_SEEK (next_start);
1529	pos = next_start;
1530	CurrDocNum = next_doc_num;
1531	qd->hops_taken++;
1532	skips_taken++;
1533	kd = k - 1;
1534	}
1535	if (kd == k - 1)
1536	{
1537	if (i + k < p)
1538	{
1539	int doc_diff, pos_diff;
1540	BBLOCK_DECODE_L (doc_diff, dn_blk, pos);
1541	next_doc_num += doc_diff;
1542	next_i += k;
1543	BBLOCK_DECODE_L (pos_diff, len_blk, pos);
1544	next_start = pos + pos_diff;
1545	}
1546	else
1547	{
1548	next_doc_num = -1;
1549	}
1550	}
1551	/* should we take another skip? */
1552	if (Anding && i + k < p && next->DocNum > next_doc_num &&
1553	next_doc_num >= 0)
1554	continue;
1555	}
1556	if (k && kd == 0 && i != p - 1)
1557	CurrDocNum = next_doc_num;
1558	else
1559	{
1560	BBLOCK_DECODE_L (diff, blk, pos);
1561	CurrDocNum += diff;
1562	}
1563	qd->num_of_ptrs++;
1564	ptrs_dec++;
1565	if (qd->id->ifh.InvfLevel >= 2)
1566	GAMMA_DECODE_L (Count, pos);
1567	else
1568	Count = (double) we->count / p;
1569	i++;
1570	kd--;
1571	if (kd < 0)
1572	kd = k - 1;
1573	Wdt = Count * WordLog;
1574
1575	/* [RJM 07/97: Ranked Required Terms] */
1576	/* if this is a required match we must remove all accumulators */
1577	/* of intermediate documents */
1578	if (require_match == 1) {
1579	while (this_item < LT->num && LT->IDE[this_item].DocNum < CurrDocNum - 1) {
1580	if (LastDocNum==0 \|\|(LT->IDE[this_item].DocNum > LastDocNum - 1))
1581	LT->IDE[this_item].Sum = -1000.0; /* mark for deletion */
1582	this_item++;
1583	}
1584	}
1585
1586	if (Anding)
1587	{
1588	while (n < LT->num && next->DocNum < CurrDocNum - 1)
1589	{
1590	next++;
1591	n++;
1592	}
1593	if (n < LT->num && next->DocNum == CurrDocNum - 1)
1594	{
1595	next->Sum += Wqt * Wdt;
1596	total_added += Wqt * Wdt;
1597	hits++;
1598	}
1599	if (n < LT->num && next->DocNum <= CurrDocNum - 1)
1600	{
1601	next++;
1602	n++;
1603	}
1604	goto NextDocNum;
1605	}
1606	if (!Anding)
1607	{
1608	if (rqi->MaxAccums != -1 && LT->num >= rqi->MaxAccums &&
1609	!MoreAccum)
1610	goto NextDocNum;
1611	LT = LT_add (qd, LT, CurrDocNum - 1, Wqt * Wdt);
1612	if (!LT)
1613	{
1614	Abort = 1;
1615	goto FastAbort;
1616	}
1617
1618	/* the list of accumulators just got bigger */
1619	has_grown = 1;
1620
1621	if (rqi->MaxAccums != -1 && LT->num >= rqi->MaxAccums &&
1622	rqi->StopAtMaxAccum)
1623	Abort = 1;
1624
1625
1626	}
1627	else if ((mem = LT_find (LT, CurrDocNum - 1)))
1628	{
1629	mem->Sum += Wqt * Wdt;
1630	total_added += Wqt * Wdt;
1631	hits++;
1632	}
1633
1634	NextDocNum:
1635
1636	LastDocNum = CurrDocNum;
1637	}
1638
1639	FastAbort:
1640
1641	DECODE_DONE
1642
1643	Xfree (buffer);
1644	ChangeMemInUse (qd, -we->invf_len);
1645	if (sk)
1646	{
1647	if (qd->id->ifh.skip_mode != 0)
1648	fprintf (sk, " %6d", skips_taken);
1649	fprintf (sk, " %6d %6d %6d%12.2f\n",
1650	ptrs_dec, hits, LT->num, total_added);
1651	}
1652	if (Abort)
1653	break;
1654
1655	/* [RJM 07/97: Ranked Required Terms] */
1656	/* remove the remaining accumulators if this term is required to match */
1657	if (require_match == 1) {
1658	while (this_item < LT->num) {
1659	if (LT->IDE[this_item].DocNum > LastDocNum - 1)
1660	LT->IDE[this_item].Sum = -1000.0; /* mark for deletion */
1661	this_item++;
1662	}
1663	Anding = 1; /* no more documents should be added */
1664
1665	/* Get rid of any documents which have been deleted */
1666	LT_pack (LT);
1667	}
1668	}
1669
1670	if (sk)
1671	fclose (sk);
1672	return LT;
1673	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/trunk/greenstone2/common-src/indexers/mg/src/text/invf_get.c@ 25147

Download in other formats: