source: trunk/gsdl/src/mgpp/text/MGQuery.cpp@879

Last change on this file since 879 was 879, checked in by rjmcnab, 24 years ago

fixed phrase searching bug.

  • Property svn:keywords set to Author Date Id Revision
File size: 17.1 KB
/**************************************************************************
 *
 * MGQuery.cpp -- Query related functions
 * Copyright (C) 1999  Rodger McNab
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * $Id: MGQuery.cpp 879 2000-01-31 03:01:25Z rjmcnab $
 *
 **************************************************************************/

#include "MGQuery.h"
#include "bitio_m_stdio.h"
#include "bitio_gen.h"
#include "Terms.h"
#include "QueryResultsSort.h"
#include <assert.h>
#include <limits.h>  // ULONG_MAX is used as a sentinel in the Or merge below


void PrintIndent (ostream &s, int indent) {
  while (indent-- > 0) s << " ";
}


// text is only read, so take it as const char * to accept string literals
void PrintIndentText (ostream &s, const char *text, int indent) {
  PrintIndent (s, indent);
  s << text;
}

void PrintNode (ostream &s, QueryNode *node, int indent) {
  if (node == NULL) {
    PrintIndentText (s, "NULL\n", indent);
  } else {
    node->Print (s, indent+2);
  }
}



QueryNode::QueryNode () {
}

QueryNode::~QueryNode () {
}

void QueryNode::Calculate (IndexData &/*indexData*/,
                           const QueryInfo &/*queryInfo*/,
                           QueryResult &result) const {
  result.Clear();
}

void QueryNode::Free () {
}

void QueryNode::Print (ostream &/*s*/, int /*indent*/) const {
}



AndQueryNode::AndQueryNode () {
  leftNode = NULL;
  rightNode = NULL;
}

AndQueryNode::~AndQueryNode () {
  Free ();
}

void AndQueryNode::Calculate (IndexData &indexData, const QueryInfo &queryInfo,
                              QueryResult &result) const {
  result.Clear();

  // an And between nothing and something is nothing...
  if (leftNode == NULL || rightNode == NULL) return;

  // calculate the result from the left tree and the result
  // from the right tree
  QueryResult rightResult;
  leftNode->Calculate (indexData, queryInfo, result);
  rightNode->Calculate (indexData, queryInfo, rightResult);

  // merge the results, this can be done in place as the results
  // will always shrink with an And

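  // The merge below is a classic sorted-list intersection: both docs
  // arrays are in ascending document order, so two cursors advance past
  // mismatches and keep only documents present in both lists.  For
  // example, docs {2,5,9} merged with {5,9,12} yields {5,9}, with the
  // rank of each kept document summed from both sides.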
  bool haveAccum = (queryInfo.sortByRank || queryInfo.needRankInfo);
  if (haveAccum && (result.ranks.size() != result.docs.size() ||
                    rightResult.ranks.size() != rightResult.docs.size())) {
    // shouldn't ever get here
    haveAccum = false;
    assert (0);
  }

  // combine document numbers and corresponding ranks
  unsigned long leftI = 0;
  unsigned long rightI = 0;
  unsigned long outI = 0;
  while (leftI < result.docs.size() &&
         rightI < rightResult.docs.size()) {
    if (result.docs[leftI] < rightResult.docs[rightI]) {
      leftI++;
    } else if (result.docs[leftI] > rightResult.docs[rightI]) {
      rightI++;
    } else {
      // the documents are equal
      result.docs[outI] = result.docs[leftI];
      if (haveAccum)
        result.ranks[outI] = result.ranks[leftI] + rightResult.ranks[rightI];
      leftI++;
      rightI++;
      outI++;
    }
  }

  // erase unused document numbers and ranks
  result.docs.erase(result.docs.begin()+outI, result.docs.end());
  if (haveAccum)
    result.ranks.erase(result.ranks.begin()+outI, result.ranks.end());

  // combine term frequency information
  if (queryInfo.needTermFreqs)
    result.termFreqs.insert (result.termFreqs.end(),
                             rightResult.termFreqs.begin(),
                             rightResult.termFreqs.end());
}

void AndQueryNode::Free () {
  if (leftNode != NULL) {
    delete leftNode;
    leftNode = NULL;
  }
  if (rightNode != NULL) {
    delete rightNode;
    rightNode = NULL;
  }
}

void AndQueryNode::Print (ostream &s, int indent) const {
  PrintIndentText (s, "leftNode:\n", indent);
  PrintNode (s, leftNode, indent+2);
  PrintIndentText (s, "AND\n", indent);
  PrintIndentText (s, "rightNode:\n", indent);
  PrintNode (s, rightNode, indent+2);
}


OrQueryNode::OrQueryNode () {
  leftNode = NULL;
  rightNode = NULL;
}

OrQueryNode::~OrQueryNode () {
  Free ();
}

void OrQueryNode::Calculate (IndexData &indexData, const QueryInfo &queryInfo,
                             QueryResult &result) const {
  result.Clear();

  // calculate the result from the left tree and the result
  // from the right tree
  QueryResult leftResult;
  QueryResult rightResult;
  if (leftNode != NULL)
    leftNode->Calculate (indexData, queryInfo, leftResult);
  if (rightNode != NULL)
    rightNode->Calculate (indexData, queryInfo, rightResult);

  // merge the results

  bool haveAccum = (queryInfo.sortByRank || queryInfo.needRankInfo);
  if (haveAccum && (leftResult.ranks.size() != leftResult.docs.size() ||
                    rightResult.ranks.size() != rightResult.docs.size())) {
    // shouldn't ever get here
    haveAccum = false;
    assert (0);
  }

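  // Sorted-list union: unlike the And merge this cannot be done in
  // place, so documents are appended to a fresh result.  ULONG_MAX acts
  // as a sentinel once either list is exhausted, letting a single loop
  // drain both lists.  E.g. {2,5} merged with {5,9} yields {2,5,9},
  // with the rank for document 5 summed from both sides.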
  // combine document numbers and corresponding ranks
  unsigned long leftSize = leftResult.docs.size();
  unsigned long rightSize = rightResult.docs.size();
  unsigned long leftI = 0;
  unsigned long rightI = 0;
  unsigned long leftDocNum = 0;
  unsigned long rightDocNum = 0;
  while (leftI < leftSize || rightI < rightSize) {
    // check leftI
    if (leftI < leftResult.docs.size())
      leftDocNum = leftResult.docs[leftI];
    else leftDocNum = ULONG_MAX;

    // check rightI
    if (rightI < rightResult.docs.size())
      rightDocNum = rightResult.docs[rightI];
    else rightDocNum = ULONG_MAX;

    // combine
    if (leftDocNum < rightDocNum) {
      result.docs.push_back (leftDocNum);
      if (haveAccum)
        result.ranks.push_back (leftResult.ranks[leftI]);
      leftI++;

    } else if (leftDocNum > rightDocNum) {
      result.docs.push_back (rightDocNum);
      if (haveAccum)
        result.ranks.push_back (rightResult.ranks[rightI]);
      rightI++;

    } else { // equal
      result.docs.push_back (leftDocNum);
      if (haveAccum)
        result.ranks.push_back (leftResult.ranks[leftI] +
                                rightResult.ranks[rightI]);
      leftI++;
      rightI++;
    }
  }

  // combine term frequency information
  if (queryInfo.needTermFreqs) {
    result.termFreqs.insert (result.termFreqs.end(),
                             leftResult.termFreqs.begin(),
                             leftResult.termFreqs.end());
    result.termFreqs.insert (result.termFreqs.end(),
                             rightResult.termFreqs.begin(),
                             rightResult.termFreqs.end());
  }
}

void OrQueryNode::Free () {
  if (leftNode != NULL) {
    delete leftNode;
    leftNode = NULL;
  }
  if (rightNode != NULL) {
    delete rightNode;
    rightNode = NULL;
  }
}

void OrQueryNode::Print (ostream &s, int indent) const {
  PrintIndentText (s, "leftNode:\n", indent);
  PrintNode (s, leftNode, indent+2);
  PrintIndentText (s, "OR\n", indent);
  PrintIndentText (s, "rightNode:\n", indent);
  PrintNode (s, rightNode, indent+2);
}



NotQueryNode::NotQueryNode () {
  queryNode = NULL;
  notNode = NULL;
}

NotQueryNode::~NotQueryNode () {
  Free ();
}

void NotQueryNode::Calculate (IndexData &indexData, const QueryInfo &queryInfo,
                              QueryResult &result) const {
  result.Clear();

  // check for nothing
  if (queryNode == NULL) return;
  if (notNode == NULL) {
    queryNode->Calculate (indexData, queryInfo, result);
    return;
  }

  // calculate the result from the query tree and the result
  // from the not tree
  QueryResult notResult;
  queryNode->Calculate (indexData, queryInfo, result);
  notNode->Calculate (indexData, queryInfo, notResult);

  // merge the results, this can be done in place as the results
  // will always shrink with a Not

  bool haveAccum = (queryInfo.sortByRank || queryInfo.needRankInfo);
  if (haveAccum && (result.ranks.size() != result.docs.size() ||
                    notResult.ranks.size() != notResult.docs.size())) {
    // shouldn't ever get here
    haveAccum = false;
    assert (0);
  }

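  // Sorted-list difference (AND-NOT): keep documents from the query
  // list that do not appear in the not list.  E.g. {2,5,9} minus {5}
  // yields {2,9}.  Query documents remaining after the not list is
  // exhausted all survive, so they are copied across after the main
  // loop.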
  // combine document numbers and corresponding ranks
  unsigned long queryI = 0;
  unsigned long notI = 0;
  unsigned long outI = 0;
  while (queryI < result.docs.size() &&
         notI < notResult.docs.size()) {
    if (result.docs[queryI] < notResult.docs[notI]) {
      // found a document not in the notResults
      result.docs[outI] = result.docs[queryI];
      if (haveAccum)
        result.ranks[outI] = result.ranks[queryI];
      queryI++;
      outI++;
    } else if (result.docs[queryI] > notResult.docs[notI]) {
      notI++;
    } else {
      // the documents are equal, ignore both
      queryI++;
      notI++;
    }
  }

  // copy over any query documents left after the not list was
  // exhausted; without this they would be wrongly erased below
  while (queryI < result.docs.size()) {
    result.docs[outI] = result.docs[queryI];
    if (haveAccum)
      result.ranks[outI] = result.ranks[queryI];
    queryI++;
    outI++;
  }

  // erase unused document numbers and ranks
  result.docs.erase(result.docs.begin()+outI, result.docs.end());
  if (haveAccum)
    result.ranks.erase(result.ranks.begin()+outI, result.ranks.end());

  // combine term frequency information
  if (queryInfo.needTermFreqs)
    result.termFreqs.insert (result.termFreqs.end(),
                             notResult.termFreqs.begin(),
                             notResult.termFreqs.end());
}

void NotQueryNode::Free () {
  if (queryNode != NULL) {
    delete queryNode;
    queryNode = NULL;
  }
  if (notNode != NULL) {
    delete notNode;
    notNode = NULL;
  }
}

void NotQueryNode::Print (ostream &s, int indent) const {
  PrintIndentText (s, "queryNode:\n", indent);
  PrintNode (s, queryNode, indent+2);
  PrintIndentText (s, "NOT\n", indent);
  PrintIndentText (s, "notNode:\n", indent);
  PrintNode (s, notNode, indent+2);
}


void TagNode::Clear () {
  UCArrayClear (tagName);
}

void TagNode::Calculate (IndexData &indexData,
                         FragRangeArray &fragRange) const {
  fragRange.erase (fragRange.begin(), fragRange.end());
  if (tagName.empty()) return;

  // get information about this tag
  block_dict_el tagEl;
  unsigned long tagElNum;
  if (!SearchBlockDictEl (indexData.dictFile, indexData.biTags,
                          indexData.bdh.entries_per_tblk,
                          indexData.bdh.tag_dict_size,
                          tagName, tagEl, tagElNum))
    return;

  // seek to the appropriate place in the inverted file
  fseek (indexData.invfFile, tagEl.invf_ptr, SEEK_SET);

  stdio_bitio_buffer buffer(indexData.invfFile);

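  // Each tag occurrence is stored as a (start, end) fragment pair, so
  // there are frag_occur*2 gap-encoded numbers to read.  B is the
  // block-coding parameter (mg's Bernoulli-model Bblock code) derived
  // from the expected gap size; each decoded delta is relative to the
  // previous fragment number, which is why fragNum accumulates across
  // the loop.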
  unsigned long pTag = tagEl.frag_occur*2;
  unsigned long B = BIO_Bblock_Init (indexData.bdh.num_frags+pTag, pTag);
  unsigned long fragNum = 0;
  unsigned long i;
  FragRange thisFrag;
  for (i=0; i<tagEl.frag_occur; i++) {
    // get start
    unsigned long delta = buffer.bblock_decode (B, NULL)-1;
    fragNum += delta;

    thisFrag.rangeStart = fragNum;

    // get end
    delta = buffer.bblock_decode (B, NULL)-1;
    fragNum += delta;

    thisFrag.rangeEnd = fragNum;
    fragRange.push_back (thisFrag);
  }

  buffer.done();
}

void TagNode::Free () {
  Clear ();
}

void TagNode::Print (ostream &s, int indent) const {
  PrintIndent (s, indent);
  s << "TAG: \"" << tagName << "\"\n";
}


void TermNode::Clear () {
  UCArrayClear (term);
  termWeight = 1;
  stemMethod = 0;
  startRange = NO_TERM_RANGE_START;
  endRange = NO_TERM_RANGE_END;
}

TermNode::TermNode () {
  Clear ();
}

void TermNode::Calculate (IndexData &indexData,
                          bool needFragFreqs,
                          FragRangeArray *fragLimits,
                          FragData &fragData) const {
  fragData.Clear ();

  // get a list of term numbers
  vector<unsigned long> equivWords;
  FindWordNumbers (indexData, term, stemMethod, equivWords);

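  // stemMethod controls case-folding and stemming, so one query term
  // may map to several distinct words in the dictionary; the postings
  // of all of them are read and merged below into a single frag list.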
  // get the information for each word and merge it with
  // previous results
  FragData tempFragData1;
  FragData tempFragData2;
  vector<unsigned long>::iterator here = equivWords.begin();
  vector<unsigned long>::iterator end = equivWords.end();
  while (here != end) {
    // get the information for this word
    ReadTermFragData (indexData, needFragFreqs, *here,
                      tempFragData1, fragLimits);

    // combine with last results
    tempFragData2 = fragData;
    CombineFragData (needFragFreqs, tempFragData1, tempFragData2, fragData);

    here++;
  }
}

void TermNode::Free () {
  Clear ();
}

void TermNode::Print (ostream &s, int indent) const {
  PrintIndent (s, indent);
  s << "TERM: \"" << term << "\"\n";
  PrintIndent (s, indent+2);
  s << "termWeight: " << termWeight << "\n";
  PrintIndent (s, indent+2);
  s << "stemMethod: " << stemMethod << "\n";
  PrintIndent (s, indent+2);
  s << "startRange: " << startRange << "\n";
  PrintIndent (s, indent+2);
  s << "endRange: " << endRange << "\n";
}


ProxMatchQueryNode::ProxMatchQueryNode () {
  tagNodePtr = NULL;
}

ProxMatchQueryNode::~ProxMatchQueryNode () {
  Free ();
}

void ProxMatchQueryNode::Calculate (IndexData &indexData,
                                    const QueryInfo &queryInfo,
                                    QueryResult &result) const {
  result.Clear ();

  bool needFragFreqs = (queryInfo.sortByRank || queryInfo.needRankInfo);

  // read in the tag if needed
  FragRangeArray fragLimits;
  FragRangeArray *fragLimitsPtr = NULL;
  if (tagNodePtr == NULL && terms.size() > 1) {
    // multiple terms must be compared relative to some tag
    // otherwise phrase matches could span documents
    TagNode tempTagNode;
    tempTagNode.tagName = indexData.curLevel;
    tempTagNode.Calculate (indexData, fragLimits);
    fragLimitsPtr = &fragLimits;

  } else if (tagNodePtr != NULL) {
    (*tagNodePtr).Calculate (indexData, fragLimits);
    fragLimitsPtr = &fragLimits;
  }

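  // Without fragment limits a phrase could straddle a document
  // boundary, since fragment positions are global to the index;
  // restricting candidate fragments to ranges within the current
  // level's tag keeps every match inside a single document or tagged
  // region.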
  UCArray tagOrLevel = indexData.curLevel;
  if (tagNodePtr != NULL) tagOrLevel = (*tagNodePtr).tagName;

  // read in the first term
  FragData termData;
  TermNodeArray::const_iterator termHere=terms.begin(), termEnd = terms.end();
  if (termHere != termEnd) {
    (*termHere).Calculate (indexData, needFragFreqs, fragLimitsPtr, termData);

    // convert initial fragment information
    FragsToQueryResult (indexData,
                        queryInfo,
                        termData,
                        tagOrLevel,
                        (*termHere).term,
                        (*termHere).stemMethod,
                        (*termHere).termWeight,
                        result);

    termHere++;

    if (termHere == termEnd) return; // nothing more to do
  }

  // read and combine the rest of the terms
  FragData comTermData;
  while (termHere != termEnd) {
    (*termHere).Calculate (indexData, needFragFreqs,
                           fragLimitsPtr, comTermData);

    AndFragsToQueryResult (indexData,
                           queryInfo,
                           comTermData,
                           tagOrLevel,
                           (*termHere).term,
                           (*termHere).stemMethod,
                           (*termHere).termWeight,
                           result);

    AndCombineFragData (needFragFreqs, termData, comTermData,
                        (*termHere).startRange,
                        (*termHere).endRange,
                        fragLimitsPtr);
    termHere++;
  }

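  // termData now holds only the fragments where every term matched
  // within its startRange/endRange window of the preceding terms,
  // i.e. the surviving phrase/proximity matches.  The result so far
  // may still contain documents contributed by individual terms alone,
  // so it is filtered against termData below.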
  // remove unwanted document numbers
  RemoveUnwantedResults (indexData,
                         queryInfo,
                         termData,
                         result);
}

void ProxMatchQueryNode::Free () {
  if (tagNodePtr != NULL) {
    delete tagNodePtr;
    tagNodePtr = NULL;
  }
  terms.erase (terms.begin(), terms.end());
}

void ProxMatchQueryNode::Print (ostream &s, int indent) const {
  PrintIndentText (s, "PROXMATCH\n", indent);
  if (tagNodePtr != NULL) tagNodePtr->Print (s, indent+2);

  TermNodeArray::const_iterator here = terms.begin();
  TermNodeArray::const_iterator end = terms.end();
  while (here != end) {
    (*here).Print (s, indent+2);
    here++;
  }
}


void MGQuery (IndexData &indexData,
              const QueryInfo &queryInfo,
              const QueryNode *queryTree,
              QueryResult &result) {
  result.Clear ();

  // make sure level is current
  indexData.LoadLevel (queryInfo.docLevel);

  // do query
  if (queryTree != NULL)
    (*queryTree).Calculate (indexData, queryInfo, result);

  // make weights into ranks if needed
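  // Accumulated weights become ranks by dividing by an approximation
  // of each document's length (its "lower approximate weight"), a
  // cheap stand-in for full cosine normalisation; exact per-document
  // weights are only fetched later, for the top candidates, when
  // queryInfo.exactWeights is set.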
  unsigned long i;
  if (queryInfo.sortByRank || queryInfo.needRankInfo) {
    for (i=0; i<result.ranks.size(); i++) {
      result.ranks[i] /=
        indexData.weightData.GetLowerApproxDocWeight (result.docs[i]);
    }
  }

  unsigned long resultsSize = queryInfo.maxDocs;
  if (resultsSize == 0 || resultsSize > result.docs.size())
    resultsSize = result.docs.size();

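  // Top-k selection: heap the first resultsSize entries (ascending, as
  // SelectAddHeap requires), push each remaining candidate through the
  // heap so it always holds the best resultsSize documents seen, then
  // heap-sort those into descending rank order.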
  // sort results by rank if needed
  GTRank gtRank;
  if (queryInfo.sortByRank) {
    // need in ascending order for SelectAddHeap
    MakeHeap (result.docs.begin(), result.ranks.begin(),
              resultsSize, gtRank);
    SelectAddHeap (result.docs.begin(), result.ranks.begin(), resultsSize,
                   result.docs.begin()+resultsSize,
                   result.ranks.begin()+resultsSize,
                   result.ranks.size()-resultsSize, gtRank);

    // sort into descending order
    SortHeap (result.docs.begin(), result.ranks.begin(), resultsSize, gtRank);
  }

  // get exact weights if needed
  if (queryInfo.exactWeights &&
      (queryInfo.sortByRank || queryInfo.needRankInfo)) {
    unsigned long exactDiskPtr =
      indexData.levels.levelInfo[indexData.curLevel].exactWeightsDiskPtr;

    for (i=0; i<resultsSize; i++) {
      result.ranks[i] = result.ranks[i] *
        indexData.weightData.GetLowerApproxDocWeight (result.docs[i]) /
        GetExactDocWeight (indexData.exactWeightsFile, exactDiskPtr,
                           result.docs[i]);
    }

    // re-sort top candidates based on exact weights
    if (queryInfo.sortByRank) {
      MakeHeap (result.docs.begin(), result.ranks.begin(),
                resultsSize, gtRank);
      SortHeap (result.docs.begin(), result.ranks.begin(),
                resultsSize, gtRank);
    }
  }

  // get rid of unwanted ranking results
  if (result.ranks.empty()) {
    // do nothing
  } else if (!queryInfo.needRankInfo) {
    result.ranks.erase(result.ranks.begin(), result.ranks.end());
  } else {
    result.ranks.erase(result.ranks.begin()+resultsSize, result.ranks.end());
  }
  result.docs.erase(result.docs.begin()+resultsSize, result.docs.end());
}

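/* Usage sketch (illustrative only, not part of the original file):
 * a query tree is built from the node classes above and handed to
 * MGQuery.  Member names match those defined in this file; how
 * IndexData, QueryInfo, and each TermNode's UCArray term are
 * initialised depends on the surrounding mgpp code and is elided.
 *
 *   ProxMatchQueryNode *left  = new ProxMatchQueryNode;
 *   // ... fill left->terms with one TermNode per phrase word ...
 *   ProxMatchQueryNode *right = new ProxMatchQueryNode;
 *   AndQueryNode *root = new AndQueryNode;
 *   root->leftNode  = left;   // AndQueryNode deletes its children in Free()
 *   root->rightNode = right;
 *   QueryResult result;
 *   MGQuery (indexData, queryInfo, root, result);  // docs + ranks out
 *   delete root;
 */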