Context Navigation

query.ranked.c@ 23508

Last change on this file since 23508 was 23508, checked in by sjm84, 13 years ago
Committing 64 bit changes into the branch
Property svn:executable set to `*`
Property svn:keywords set to `Author Date Id Revision`
File size: 23.5 KB

Rev	Line
[3745]	1	/**************************************************************************
	2	*
	3	* query.ranked.c -- Ranked query evaluation
	4	* Copyright (C) 1994 Neil Sharman
	5	*
	6	* This program is free software; you can redistribute it and/or modify
	7	* it under the terms of the GNU General Public License as published by
	8	* the Free Software Foundation; either version 2 of the License, or
	9	* (at your option) any later version.
	10	*
	11	* This program is distributed in the hope that it will be useful,
	12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	14	* GNU General Public License for more details.
	15	*
	16	* You should have received a copy of the GNU General Public License
	17	* along with this program; if not, write to the Free Software
	18	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	19	*
	20	* $Id: query.ranked.c 23508 2010-12-17 01:04:10Z sjm84 $
	21	*
	22	**************************************************************************/
	23
	24	#include "sysfuncs.h"
	25
	26	#include "memlib.h"
	27	#include "filestats.h"
	28	#include "messages.h"
	29	#include "timing.h"
	30	#include "sptree.h"
	31
	32	#include "mg.h"
	33	#include "invf.h"
	34	#include "text.h"
	35	#include "lists.h"
	36	#include "backend.h"
	37	#include "stem_search.h"
	38	#include "weights.h"
	39	#include "text_get.h"
	40	#include "invf_get.h"
	41	#include "words.h"
	42	#include "stemmer.h"
	43	#include "locallib.h"
	44	#include "environment.h"
	45	#include "term_lists.h"
	46	#include "local_strings.h"
	47	#include "query_term_list.h" /* [RPAP - Feb 97: Term Frequency] */
	48
	49	/*
	50	$Log$
	51	Revision 1.1 2003/02/20 21:18:24 mdewsnip
	52	Addition of MG package for search and retrieval
	53
	54	Revision 1.1 1999/08/10 21:18:20 sjboddie
	55	renamed mg-1.3d directory mg
	56
	57	Revision 1.2 1998/11/25 07:55:50 rjmcnab
	58
	59	Modified mg to that you can specify the stemmer you want
	60	to use via a command line option. You specify it to
	61	mg_passes during the build process. The number of the
	62	stemmer that you used is stored within the inverted
	63	dictionary header and the stemmed dictionary header so
	64	the correct stemmer is used in later stages of building
	65	and querying.
	66
	67	Revision 1.1 1998/11/17 09:35:34 rjmcnab
	68	* empty log message *
	69
	70	* Revision 1.4 1994/11/25 03:47:46 tes
	71	* Committing files before adding the merge stuff.
	72	*
	73	* Revision 1.3 1994/10/20 03:57:03 tes
	74	* I have rewritten the boolean query optimiser and abstracted out the
	75	* components of the boolean query.
	76	*
	77	* Revision 1.2 1994/09/20 04:42:04 tes
	78	* For version 1.1
	79	*
	80	*/
	81
	82	static char *RCSID = "$Id: query.ranked.c 23508 2010-12-17 01:04:10Z sjm84 $";
	83
	84	/*************************************************************************/
	85
	86	typedef struct HeapEntry
	87	{
	88	float Weight;
	89	float *OrgWeight;
	90	int DocNum;
[23508]	91	mg_u_long SeekPos; /* position in the text file in bytes */
	92	mg_u_long Len; /* length of the document in bytes */
[3745]	93	}
	94	HeapEntry;
	95
	96	typedef int (HeapComp) (HeapEntry , HeapEntry *);
	97
	98	typedef struct Heap
	99	{
	100	int NumItems;
	101	int MaxSize;
	102	HeapComp HC;
	103	HeapEntry HE[1];
	104	}
	105	Heap;
	106
	107
	108	Heap *
	109	Heap_Make (int size, HeapComp hc)
	110	{
	111	Heap *H;
	112	H = Xmalloc (sizeof (Heap) + (size - 1) * sizeof (HeapEntry));
	113	if (!H)
	114	return NULL;
	115	H->NumItems = 0;
	116	H->MaxSize = size;
	117	H->HC = hc;
	118	return H;
	119	}
	120
	121	int
	122	Heap_Size (Heap * H)
	123	{
	124	return sizeof (Heap) + H->MaxSize * sizeof (HeapEntry);
	125	}
	126
	127	HeapEntry *
	128	Heap_GetHead (Heap * H)
	129	{
	130	if (H && H->NumItems)
	131	return &H->HE[0];
	132	else
	133	return NULL;
	134	}
	135
	136
	137
	138	void
	139	Heap_Heapify (Heap * H, int i)
	140	{
	141	register int curr, child;
	142	curr = i;
	143	child = curr * 2;
	144	while (child <= H->NumItems)
	145	{
	146	if (child < H->NumItems && H->HC (&H->HE[child], &H->HE[child - 1]) > 0)
	147	child++;
	148	if (H->HC (&H->HE[curr - 1], &H->HE[child - 1]) < 0)
	149	{
	150	HeapEntry temp = H->HE[child - 1];
	151	H->HE[child - 1] = H->HE[curr - 1];
	152	H->HE[curr - 1] = temp;
	153	curr = child;
	154	child = 2 * child;
	155	}
	156	else
	157	break;
	158	}
	159	}
	160
	161
	162	void
	163	Heap_Build (Heap * H)
	164	{
	165	register int i;
	166	for (i = H->NumItems / 2; i > 0; i--)
	167	Heap_Heapify (H, i);
	168	}
	169
	170
	171	void
	172	Heap_Sort (Heap * H)
	173	{
	174	register int i;
	175	for (i = H->NumItems; i > 1; i--)
	176	{
	177	HeapEntry temp = H->HE[0];
	178	H->HE[0] = H->HE[i - 1];
	179	H->HE[i - 1] = temp;
	180	H->NumItems--;
	181	Heap_Heapify (H, 1);
	182	}
	183	}
	184
	185
	186	void
	187	Heap_DeleteHead (Heap * H)
	188	{
	189	H->HE[0] = H->HE[--H->NumItems];
	190	Heap_Heapify (H, 1);
	191	}
	192
	193	int
	194	Heap_Lesser (HeapEntry * a, HeapEntry * b)
	195	{
	196	return (a->Weight > b->Weight ? -1 :
	197	(a->Weight == b->Weight ? 0 : 1));
	198	}
	199
	200
	201	int
	202	Heap_Greater (HeapEntry * a, HeapEntry * b)
	203	{
	204	return (a->Weight > b->Weight ? 1 :
	205	(a->Weight == b->Weight ? 0 : -1));
	206	}
	207
	208	int
	209	Make_Exact_Root (query_data * qd, Heap * H)
	210	{
	211	int num = 0;
	212	HeapEntry *he = H->HE;
	213	while (he->SeekPos == 0)
	214	{
	215	he->Weight = he->Weight *
	216	GetLowerApproxDocWeight (qd->awd, he->DocNum - 1) /
	217	FetchDocStart (qd, he->DocNum, &he->SeekPos, &he->Len);
	218	Heap_Heapify (H, 1);
	219	num++;
	220	}
	221	return num;
	222	}
	223
	224	void
	225	Heap_Dump (Heap * H, int num)
	226	{
	227	int i, l, lines, r;
	228	if (num > H->NumItems)
	229	num = H->NumItems;
	230	lines = (num + 3) / 4;
	231	for (l = 0; l < lines; l++)
	232	for (r = 0; r < 4; r++)
	233	{
	234	i = lines * r + l;
	235	if (i < num)
	236	fprintf (stderr, "[%2d] %7.4f", i, H->HE[i].Weight);
	237	fprintf (stderr, r == 3 ? "\n" : " ");
	238	}
	239	fprintf (stderr, "\n");
	240	}
	241
	242	/*************************************************************************/
	243
	244
	245
	246
	247
	248
	249	int
	250	doc_count_comp (const void A, const void B)
	251	{
	252	const TermEntry *a = A;
	253	const TermEntry *b = B;
	254	return (a->WE.doc_count - b->WE.doc_count);
	255	}
	256
	257	/* =========================================================================
	258	* Function: ParseRankedQuery
	259	* Description:
	260	* Takes a string query line and extracts the terms in the query
	261	* which exist in the stemmed dictionary.
	262	* Optionally sorts the terms into ascending order by doc. number.
	263	* Input:
	264	* stemmed dictionary, query line, sort-flag
	265	* Output:
	266	* A list of terms.
	267	* ========================================================================= */
	268
	269	/* [RPAP - Jan 97: Stem Index Change] */
	270	static TermList *
	271	ParseRankedQuery (stemmed_dict * sd, char *QueryLine, int Sort, int indexed,
	272	QueryTermList *query_term_list) / [RPAP - Feb 97: Term Frequency] */
	273	{
	274	u_char Word[MAXSTEMLEN + 1];
	275	u_char sWord[MAXSTEMLEN + 1];
	276	u_char end, s_in;
	277	int default_stem_method = 0;
	278	TermList *Terms = MakeTermList(0);
	279
	280	s_in = (u_char *) QueryLine;
	281	end = s_in + strlen ((char *) s_in) - 1;
	282	query_term_list = MakeQueryTermList(0); / [RPAP - Feb 97: Term Frequency] */
	283
	284	if (indexed)
	285	default_stem_method = BooleanEnv (GetEnv ("casefold"), 0) \| (BooleanEnv (GetEnv ("stem"), 0) << 1);
	286	else
	287	default_stem_method = sd->sdh.stem_method;
	288
	289	while (s_in <= end)
	290	{
	291	int j;
[23508]	292	mg_s_long num_entries, word_num;
	293	mg_u_long count, doc_count, invf_ptr, invf_len;
[3745]	294	int weight_to_apply, stem_to_apply;
	295	int method_using = -1;
	296
	297	/* 0=optional, 1=mustmatch */
	298	int require_match = 0; /* [RJM 07/97: Ranked Required Terms] */
	299
	300	/* Skip over the non word separator taking note of any parameters */
	301	PARSE_RANKED_NON_STEM_WORD (require_match, s_in, end); /* [RJM 07/97: Ranked Required Terms] */
	302	if (s_in > end) break;
	303
	304	/* Get a word and stem it */
	305	PARSE_STEM_WORD (Word, s_in, end);
	306
	307	/* Extract any parameters */
	308	weight_to_apply = 1;
	309	stem_to_apply = default_stem_method;
	310	while (s_in <= end)
	311	{
	312	int stem_param, weight_param, param_type;
	313	char param[MAXPARAMLEN + 1];
	314
	315	param_type = 0;
	316	PARSE_OPT_TERM_PARAM (param, param_type, s_in, end);
	317	if (!param_type)
	318	break;
	319
	320	switch (param_type)
	321	{
	322	case (WEIGHTPARAM):
	323	weight_param = atoi (param);
	324	if (errno != ERANGE && weight_param > 0)
	325	weight_to_apply = weight_param;
	326	break;
	327
	328	case (STEMPARAM):
	329	stem_param = atoi (param);
	330	if (errno != ERANGE && indexed && stem_param >= 0 && stem_param <= 3)
	331	method_using = stem_to_apply = stem_param;
	332	break;
	333	}
	334	}
	335
	336	bcopy ((char ) Word, (char ) sWord, *Word + 1);
	337	stemmer (stem_to_apply, sd->sdh.stemmer_num, sWord);
	338
	339	if (!indexed \|\| stem_to_apply == 0)
	340	{
	341	/* Look for the word in the already identified terms */
	342	for (j = 0; j < Terms->num; j++)
	343	if (compare (Terms->TE[j].Word, Word) == 0)
	344	break;
	345
	346	/* Increment the weight if the word is in the list */
	347	/* Update the require match attribute */
	348	if (j < Terms->num)
	349	{
	350	Terms->TE[j].Count = ((Terms->TE[j].Count + weight_to_apply > INT_MAX) ?
	351	INT_MAX : (Terms->TE[j].Count + weight_to_apply));
	352	Terms->TE[j].require_match = require_match; /* [RJM 07/97: Ranked Require match] */
	353	AddQueryTerm (query_term_list, Word, Terms->TE[j].WE.count, method_using); /* [RPAP - Feb 97: Term Frequency] */
	354	}
	355	else
	356	{
	357	/* Look for it in the stemmed dictionary */
	358	if ((word_num = FindWord (sd, sWord, &count, &doc_count,
	359	&invf_ptr, &invf_len)) != -1)
	360	{
	361	/* Search the list for the word */
	362	for (j = 0; j < Terms->num; j++)
	363	if (Terms->TE[j].WE.word_num == word_num)
	364	break;
	365
	366	/* Increment the weight if the word is in the list */
	367	if (j < Terms->num)
	368	{
	369	Terms->TE[j].Count = ((Terms->TE[j].Count + weight_to_apply > INT_MAX) ?
	370	INT_MAX : (Terms->TE[j].Count + weight_to_apply));
	371	Terms->TE[j].require_match = require_match; /* [RJM 07/97: Ranked Require match] */
	372	AddQueryTerm (query_term_list, Word, Terms->TE[j].WE.count, method_using); /* [RPAP - Feb 97: Term Frequency] */
	373	}
	374	else
	375	{
	376	/* Create a new entry in the list for the new word */
	377	TermEntry te;
	378
	379	te.WE.word_num = word_num;
	380	te.WE.count = count;
	381	te.WE.doc_count = doc_count;
	382	te.WE.max_doc_count = doc_count;
	383	te.WE.invf_ptr = invf_ptr;
	384	te.WE.invf_len = invf_len;
	385	te.Count = weight_to_apply;
	386	te.Word = copy_string (Word);
	387	if (!te.Word)
	388	FatalError (1, "Could NOT create memory to add term");
	389	te.Stem = NULL;
	390	te.require_match = require_match;
	391
	392	AddTermEntry (&Terms, &te);
	393
	394	/* [RPAP - Feb 97: Term Frequency] */
	395	AddQueryTerm (query_term_list, Word, count, method_using);
	396	}
	397	}
	398	/* [RPAP - Feb 97: Term Frequency] */
	399	else
	400	AddQueryTerm (query_term_list, Word, 0, method_using);
	401	}
	402	}
	403	else
	404	{
	405	int total_count = 0; /* [RPAP - Feb 97: Term Frequency] */
	406	TermList *tempList = MakeTermList (0);
	407	if ((num_entries = FindWords (sd, sWord, stem_to_apply, &tempList)) > 0)
	408	{
	409	int i;
[23508]	410	mg_u_long max_doc_count = 0;
[3745]	411
	412	/* get the maximum doc count */
	413	for (i = 0; i < tempList->num; i++)
	414	{
	415	if (tempList->TE[i].WE.doc_count > max_doc_count)
	416	max_doc_count = tempList->TE[i].WE.doc_count;
	417	total_count += tempList->TE[i].WE.count; /* [RPAP - Feb 97: Term Frequency] */
	418	}
	419
	420	for (i = 0; i < tempList->num; i++)
	421	{
	422	/* Look for the word(s) in the already identified terms */
	423	for (j = 0; j < Terms->num; j++)
	424	{
	425	if (compare (Terms->TE[j].Word, tempList->TE[i].Word) == 0)
	426	{
	427	/* found the word */
	428	/* Modify weight */
	429	Terms->TE[j].Count = ((Terms->TE[j].Count + weight_to_apply > INT_MAX) ?
	430	INT_MAX : (Terms->TE[j].Count + weight_to_apply));
	431	if (Terms->TE[j].WE.max_doc_count < max_doc_count)
	432	Terms->TE[j].WE.max_doc_count = max_doc_count;
	433	break;
	434	}
	435	}
	436
	437	if (j == Terms->num)
	438	{
	439	/* word was not found */
	440	tempList->TE[i].WE.max_doc_count = max_doc_count;
	441	tempList->TE[i].Count = weight_to_apply;
	442
	443	/* We cannot require a term to match if it is expanded */
	444	/* into multiple terms :-( */
	445	tempList->TE[i].require_match = 0; /* [RJM 07/97: Ranked Required Terms] */
	446
	447	AddTermEntry (&Terms, &tempList->TE[i]);
	448	}
	449	}
	450	}
	451	/* [RPAP - Feb 97: Term Frequency] */
	452	AddQueryTerm (query_term_list, Word, total_count, method_using);
	453
	454	if (tempList != NULL) Xfree(tempList); /* [RJM 07/98: Memory Leak] */
	455	} /* end indexed */
	456	} /* end while */
	457
	458	if (Sort)
	459	/* Sort the terms in ascending order by doc_count */
	460	qsort (Terms->TE, Terms->num, sizeof (TermEntry), doc_count_comp);
	461	return (Terms);
	462	}
	463
	464
	465
	466
	467	static int
	468	DE_comp (void a, void b)
	469	{
	470	return ((DocEntry ) a)->SeekPos - ((DocEntry ) b)->SeekPos;
	471	}
	472
	473
	474
	475
	476
	477	/*
	478	* This function is given a list of term numbers and it returns a list
	479	* of document numbers based on the cosine document weighting system.
	480	* This puts the entries in an array.
	481	* inverted file.
	482	* If MaxDocs == -1 then it means all
	483	*/
	484	static DocList *
	485	CosineGet (query_data * qd, TermList * Terms, RankedQueryInfo * rqi) {
	486	DocList *Docs;
	487	float *AccumulatedWeights = NULL;
	488	Splay_Tree *ST = NULL;
	489	Splay_Tree *Para_ST = NULL;
	490	Hash_Table *HT = NULL;
	491	List_Table *LT = NULL;
	492	Heap *H;
	493	HeapEntry *he;
	494	register float *fptr = NULL;
	495	register Invf_Doc_Entry *ide = NULL;
	496	register Invf_Doc_EntryH *ideh = NULL;
	497	int BackEnd, NumExact, MaxExact, NumParas;
	498	int MaxDocs = 0, MaxParas = 0;
	499	int i;
	500	Invf_Doc_Entry_Pool ide_pool;
	501	ide_pool.pool = NULL;
	502
	503	qd->hops_taken = qd->num_of_ptrs = qd->num_of_accum = 0;
	504
	505	switch (rqi->AccumMethod)
	506	{
	507	case 'S':
	508	ST = CosineDecodeSplay (qd, Terms, rqi, &ide_pool);
	509	if (!ST)
	510	return NULL;
	511	break;
	512	case 'A':
	513	AccumulatedWeights = CosineDecode (qd, Terms, rqi);
	514	if (!AccumulatedWeights)
	515	return NULL;
	516	break;
	517	case 'H':
	518	HT = CosineDecodeHash (qd, Terms, rqi);
	519	if (!HT)
	520	return NULL;
	521	break;
	522	case 'L':
	523	LT = CosineDecodeList (qd, Terms, rqi);
	524	if (!LT)
	525	return NULL;
	526	break;
	527	}
	528
	529	#if 0
	530	if (rqi->UseSplayTree)
	531	{
	532
	533	AccumulatedWeights = CosineDecode (qd, Terms, rqi);
	534	fptr = AccumulatedWeights;
	535	ide = SP_get_first (ST);
	536	for (i = 0; i < qd->sd->sdh.num_of_docs; i++)
	537	{
	538	if (AccumulatedWeights[i] != 0)
	539	{
	540	if (i != ide->DocNum)
	541	fprintf (stderr, "Sum mismatch for %d %f %d %f\n", i + 1,
	542	AccumulatedWeights[i], ide->DocNum + 1, ide->Sum);
	543	ide = SP_get_next (ST);
	544	}
	545	}
	546	}
	547	#endif
	548
	549	switch (rqi->AccumMethod)
	550	{
	551	case 'S':
	552	MaxParas = ST->no_of_items;
	553	break;
	554	case 'A':
	555	{ /* count the number of non-zero document weights */
	556	register int i = qd->sd->sdh.num_of_docs;
	557	register float *d;
	558	MaxParas = 0;
	559	for (d = AccumulatedWeights; i; i--, d++)
	560	if (*d)
	561	MaxParas++;
	562	}
	563	break;
	564	case 'H':
	565	MaxParas = HT->num + HT->Suplimentary_Num;
	566	break;
	567	case 'L':
	568	MaxParas = LT->num;
	569	break;
	570	}
	571
	572	if (rqi->MaxParasToRetrieve != -1 && MaxParas > rqi->MaxParasToRetrieve)
	573	MaxParas = rqi->MaxParasToRetrieve;
	574	MaxDocs = MaxParas;
	575
	576	/* Allocate memory for the heap */
	577	Docs = MakeDocList (MaxDocs);
	578	ChangeMemInUse (qd, sizeof (DocEntry) * MaxDocs);
	579
	580	H = Heap_Make (MaxDocs, Heap_Lesser);
	581
	582
	583	/* Get the sums from the array divide the sums by the
	584	document weights which we retrieve from the ".idx.wgt" file and put
	585	the resulting data into a heap */
	586
	587
	588	he = H->HE;
	589	H->NumItems = MaxDocs;
	590	switch (rqi->AccumMethod)
	591	{
	592	case 'S':
	593	{
	594	ide = SP_get_first (ST);
	595	for (i = 0; i < H->NumItems; i++, ide = SP_get_next (ST), he++)
	596	{
	597	he->DocNum = ide->DocNum + 1;
	598	he->OrgWeight = &ide->Sum;
	599	qd->num_of_accum++;
	600	}
	601	}
	602	break;
	603	case 'A':
	604	{
	605	fptr = AccumulatedWeights;
	606	for (i = 0; i < H->NumItems; i++, fptr++, he++)
	607	{
	608	he->DocNum = i + 1;
	609	he->OrgWeight = fptr;
	610	if (*fptr)
	611	qd->num_of_accum++;
	612	}
	613	}
	614	break;
	615	case 'H':
	616	{
	617	ideh = HT->Head;
	618	for (i = 0; i < H->NumItems; i++, ideh = ideh->next, he++)
	619	{
	620	he->DocNum = ideh->IDE.DocNum + 1;
	621	he->OrgWeight = &ideh->IDE.Sum;
	622	qd->num_of_accum++;
	623	}
	624	}
	625	break;
	626	case 'L':
	627	{
	628	ide = LT->IDE;
	629	for (i = 0; i < H->NumItems; i++, ide++, he++)
	630	{
	631	he->DocNum = ide->DocNum + 1;
	632	he->OrgWeight = &ide->Sum;
	633	qd->num_of_accum++;
	634	}
	635	}
	636	break;
	637	}
	638
	639	he = H->HE;
	640	for (i = 0; i < H->NumItems; i++, he++)
	641	{
	642	*he->OrgWeight /= GetLowerApproxDocWeight (qd->awd, he->DocNum - 1);
	643	he->Weight = *he->OrgWeight;
	644	*he->OrgWeight = 0;
	645	he->SeekPos = he->Len = 0;
	646	}
	647
	648	Heap_Build (H);
	649
	650	he = H->HE;
	651	switch (rqi->AccumMethod)
	652	{
	653	case 'S':
	654	{
	655	for (i = MaxDocs; i < ST->no_of_items; i++, ide = SP_get_next (ST))
	656	{
	657	ide->Sum /= GetLowerApproxDocWeight (qd->awd, ide->DocNum);
	658	qd->num_of_accum++;
	659	if (ide->Sum <= he->Weight)
	660	continue;
	661	*he->OrgWeight = he->Weight;
	662	he->DocNum = ide->DocNum + 1;
	663	he->Weight = ide->Sum;
	664	he->OrgWeight = &ide->Sum;
	665	*he->OrgWeight = 0;
	666	Heap_Heapify (H, 1);
	667	}
	668	}
	669	break;
	670	case 'A':
	671	{
	672	for (i = MaxDocs; i < qd->sd->sdh.num_of_docs; i++, fptr++)
	673	{
	674	if (!*fptr)
	675	continue;
	676	qd->num_of_accum++;
	677	*fptr /= GetLowerApproxDocWeight (qd->awd, i);
	678	if (*fptr <= he->Weight)
	679	continue;
	680	*he->OrgWeight = he->Weight;
	681	he->DocNum = i + 1;
	682	he->Weight = *fptr;
	683	he->OrgWeight = fptr;
	684	*he->OrgWeight = 0;
	685	Heap_Heapify (H, 1);
	686	}
	687	}
	688	break;
	689	case 'H':
	690	{
	691	for (; ideh; ideh = ideh->next)
	692	{
	693	qd->num_of_accum++;
	694	ideh->IDE.Sum /=
	695	GetLowerApproxDocWeight (qd->awd, ideh->IDE.DocNum);
	696	if (ideh->IDE.Sum <= he->Weight)
	697	continue;
	698	*he->OrgWeight = he->Weight;
	699	he->DocNum = ideh->IDE.DocNum + 1;
	700	he->Weight = ideh->IDE.Sum;
	701	he->OrgWeight = &ideh->IDE.Sum;
	702	*he->OrgWeight = 0;
	703	Heap_Heapify (H, 1);
	704	}
	705	}
	706	break;
	707	case 'L':
	708	{
	709	for (i = MaxDocs; i < LT->num; i++, ide++)
	710	{
	711	qd->num_of_accum++;
	712	ide->Sum /=
	713	GetLowerApproxDocWeight (qd->awd, ide->DocNum);
	714	if (ide->Sum <= he->Weight)
	715	continue;
	716	*he->OrgWeight = he->Weight;
	717	he->DocNum = ide->DocNum + 1;
	718	he->Weight = ide->Sum;
	719	he->OrgWeight = &ide->Sum;
	720	*he->OrgWeight = 0;
	721	Heap_Heapify (H, 1);
	722	}
	723	}
	724	break;
	725	}
	726
	727
	728	if (rqi->Exact && qd->id->ifh.InvfLevel != 3)
	729	{
	730	HeapEntry *he = H->HE;
	731
	732	for (i = 0; i < H->NumItems; i++, he++)
	733	{
	734	he->Weight = he->Weight *
	735	GetLowerApproxDocWeight (qd->awd, he->DocNum - 1) /
	736	FetchDocStart (qd, he->DocNum, &he->SeekPos, &he->Len);
	737	}
	738
	739	Heap_Build (H);
	740
	741	he = H->HE;
	742
	743	switch (rqi->AccumMethod)
	744	{
	745	case 'S':
	746	{
	747	ide = SP_get_first (ST);
	748	for (i = 0; i < ST->no_of_items; i++, ide = SP_get_next (ST))
	749	{
[23508]	750	mg_u_long SeekPos, Len;
[3745]	751	float Weight;
	752	if (!ide->Sum)
	753	continue;
	754	if (ide->Sum <= he->Weight)
	755	continue;
	756	Weight = ide->Sum *
	757	GetLowerApproxDocWeight (qd->awd, ide->DocNum) /
	758	FetchDocStart (qd, ide->DocNum + 1, &SeekPos, &Len);
	759	if (Weight <= he->Weight)
	760	continue;
	761	he->DocNum = ide->DocNum + 1;
	762	he->OrgWeight = &ide->Sum;
	763	he->Weight = Weight;
	764	he->SeekPos = SeekPos;
	765	he->Len = Len;
	766	ide->Sum = 0;
	767	Heap_Heapify (H, 1);
	768	}
	769	}
	770	break;
	771
	772	/* up to here */
	773
	774	case 'A':
	775	{
	776	fptr = AccumulatedWeights;
	777	for (i = 0; i < qd->sd->sdh.num_of_docs; i++, fptr++)
	778	{
[23508]	779	mg_u_long SeekPos, Len;
[3745]	780	float Weight;
	781	if (!*fptr)
	782	continue;
	783	if (*fptr <= he->Weight)
	784	continue;
	785	Weight = fptr
	786	GetLowerApproxDocWeight (qd->awd, i) /
	787	FetchDocStart (qd, i + 1, &SeekPos, &Len);
	788	if (Weight <= he->Weight)
	789	continue;
	790	he->DocNum = i + 1;
	791	he->OrgWeight = fptr;
	792	he->Weight = Weight;
	793	he->SeekPos = SeekPos;
	794	he->Len = Len;
	795	*fptr = 0;
	796	Heap_Heapify (H, 1);
	797	}
	798	}
	799	break;
	800	case 'H':
	801	{
	802	ideh = HT->Head;
	803	for (ideh = HT->Head; ideh; ideh = ideh->next)
	804	{
[23508]	805	mg_u_long SeekPos, Len;
[3745]	806	float Weight;
	807	if (!ideh->IDE.Sum)
	808	continue;
	809	if (ideh->IDE.Sum <= he->Weight)
	810	continue;
	811	Weight = ideh->IDE.Sum *
	812	GetLowerApproxDocWeight (qd->awd, ideh->IDE.DocNum) /
	813	FetchDocStart (qd, ideh->IDE.DocNum + 1, &SeekPos, &Len);
	814	if (Weight <= he->Weight)
	815	continue;
	816	he->DocNum = ideh->IDE.DocNum + 1;
	817	he->OrgWeight = &ideh->IDE.Sum;
	818	he->Weight = Weight;
	819	he->SeekPos = SeekPos;
	820	he->Len = Len;
	821	ideh->IDE.Sum = 0;
	822	Heap_Heapify (H, 1);
	823	}
	824	}
	825	break;
	826	case 'L':
	827	{
	828	ide = LT->IDE;
	829	for (i = 0; i < LT->num; i++, ide++)
	830	{
[23508]	831	mg_u_long SeekPos, Len;
[3745]	832	float Weight;
	833	if (!ide->Sum)
	834	continue;
	835	if (ide->Sum <= he->Weight)
	836	continue;
	837	Weight = ide->Sum *
	838	GetLowerApproxDocWeight (qd->awd, ide->DocNum) /
	839	FetchDocStart (qd, ide->DocNum + 1, &SeekPos, &Len);
	840	if (Weight <= he->Weight)
	841	continue;
	842	he->DocNum = ide->DocNum + 1;
	843	he->OrgWeight = &ide->Sum;
	844	he->Weight = Weight;
	845	he->SeekPos = SeekPos;
	846	he->Len = Len;
	847	ide->Sum = 0;
	848	Heap_Heapify (H, 1);
	849	}
	850	}
	851	break;
	852	}
	853	}
	854
	855
	856
	857	H->HC = Heap_Greater;
	858	Heap_Build (H);
	859
	860
	861	MaxDocs = H->NumItems;
	862	if (rqi->MaxDocsToRetrieve != -1 && MaxDocs > rqi->MaxDocsToRetrieve)
	863	MaxDocs = rqi->MaxDocsToRetrieve;
	864
	865	/* Alarm */
	866
	867	he = H->HE;
	868	BackEnd = H->NumItems - 1;
	869	NumExact = 0;
	870	MaxExact = H->NumItems;
	871	NumParas = 0;
	872	Para_ST = SP_createset (DE_comp);
	873	while (H->NumItems && Docs->num < MaxDocs)
	874	{
	875	DocEntry DocEnt;
	876	DocEntry *mem;
	877
	878	if (rqi->Exact)
	879	{
	880	if (H->HE[0].SeekPos == 0)
	881	NumExact += Make_Exact_Root (qd, H);
	882	}
	883	else
	884	FetchDocStart (qd, he->DocNum, &he->SeekPos, &he->Len);
	885
	886	NumParas++;
	887
	888	DocEnt.DocNum = he->DocNum;
	889	DocEnt.Weight = he->Weight;
	890	DocEnt.Len = he->Len;
	891	DocEnt.SeekPos = he->SeekPos;
	892	DocEnt.CompTextBuffer = NULL;
	893	DocEnt.Next = NULL;
	894
	895	Heap_DeleteHead (H);
	896
	897	if (!(mem = SP_member (&DocEnt, Para_ST)))
	898	{
	899	Docs->DE[Docs->num] = DocEnt;
	900	SP_insert (&Docs->DE[Docs->num], Para_ST);
	901	Docs->num++;
	902	}
	903	else
	904	{
	905	DocEnt.Next = mem->Next;
	906	Docs->DE[BackEnd] = DocEnt;
	907	mem->Next = &Docs->DE[BackEnd--];
	908	}
	909	}
	910	SP_freeset (Para_ST);
	911
	912	if (qd->id->ifh.InvfLevel == 3)
	913	{
	914	Message ("%d Paragraphs were required to get %d documents",
	915	NumParas, Docs->num);
	916	if (NumExact == MaxExact)
	917	{
	918	Message ("The exact weights of all %d paragraphs had to be calculated", MaxExact);
	919	Message ("to obtain %d paragraphs. This may mean that the the documents", NumParas);
	920	Message ("returned do not necessarly represent an exact cosine ranking.");
	921	Message ("This problem may be corrected by increasing \'maxparas\'.");
	922	}
	923	}
	924	#if 0
	925	{
	926	int i;
	927	FILE *f = fopen ("top.paras", "w");
	928	fprintf (f, "=========================\nTop Paragraphs\n");
	929	for (i = 0; i < Docs->num; i++)
	930	{
	931	DocEntry *d;
	932	fprintf (f, "<%d(%f)> ", Heap[i].DocNum, Heap[i].Weight);
	933	for (d = Heap[i].Next; d; d = d->Next)
	934	fprintf (f, "%d(%f) ", d->DocNum, d->Weight);
	935	fprintf (f, "\n");
	936	}
	937	fprintf (f, "=========================\n");
	938	fclose (f);
	939	}
	940	#endif
	941
	942	if (AccumulatedWeights)
	943	{
	944	Xfree (AccumulatedWeights);
	945	ChangeMemInUse (qd, -sizeof (float) * qd->sd->sdh.num_of_docs);
	946	}
	947	if (ST)
	948	{
	949	int mem = ST->mem_in_use;
	950	SP_freeset (ST);
	951	ChangeMemInUse (qd, -mem);
	952	free_ide_pool (qd, &ide_pool);
	953	}
	954	if (HT)
	955	HT_free (qd, HT);
	956
	957	if (LT)
	958	LT_free (qd, LT);
	959
	960	if (H) Xfree (H); /* [RJM 07/98: Memory Leak] */
	961
	962	return (Docs);
	963	}
	964
	965
	966
	967
	968	/* if MaxDocs == -1 it means all */
	969	void
	970	RankedQuery (query_data qd, char Query, RankedQueryInfo *rqi)
	971	{
	972	DocList *dl;
	973
	974	if (qd->TL)
	975	FreeTermList (&(qd->TL));
	976
	977	/* [RPAP - Feb 97: Term Frequency] */
	978	if (qd->QTL)
	979	FreeQueryTermList (&(qd->QTL));
	980
	981	qd->TL = ParseRankedQuery (qd->sd, Query, rqi->Sort, qd->sd->sdh.indexed, /* [RPAP - Jan 97: Stem Index Change] */
	982	&(qd->QTL)); /* [RPAP - Feb 97: Term Frequency] */
	983
	984	/* PrintTermList (qd->TL, stderr); */
	985
	986	dl = CosineGet (qd, qd->TL, rqi);
	987
	988	if (!dl)
	989	FatalError (1, "Out of memory\n");
	990
	991	FreeQueryDocs (qd);
	992
	993	qd->DL = dl;
	994	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/branches/64_bit_Greenstone/greenstone2/common-src/indexers/mg/src/text/query.ranked.c@ 23508

Download in other formats: