root/trunk/gsdl/src/mgpp/text/MGQuery.cpp @ 861

Revision 861, 16.7 KB (checked in by rjmcnab, 20 years ago)

fixed a few more bugs

  • Property svn:keywords set to Author Date Id Revision
/**************************************************************************
 *
 * MGQuery.cpp -- Query related functions
 * Copyright (C) 1999  Rodger McNab
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * $Id$
 *
 **************************************************************************/

#include "MGQuery.h"
#include "bitio_m_stdio.h"
#include "bitio_gen.h"
#include "Terms.h"
#include "QueryResultsSort.h"
#include <assert.h>
#include <limits.h>  // ULONG_MAX, used as a sentinel in OrQueryNode::Calculate


void PrintIndent (ostream &s, int indent) {
  while (indent-- > 0) s << " ";
}


void PrintIndentText (ostream &s, const char *text, int indent) {
  PrintIndent (s, indent);
  s << text;
}

void PrintNode (ostream &s, QueryNode *node, int indent) {
  if (node == NULL) {
    PrintIndentText (s, "NULL\n", indent);
  } else {
    node->Print (s, indent+2);
  }
}



QueryNode::QueryNode () {
}

QueryNode::~QueryNode () {
}

void QueryNode::Calculate (IndexData &/*indexData*/,
                           const QueryInfo &/*queryInfo*/,
                           QueryResult &result) const {
  result.Clear();
}

void QueryNode::Free () {
}

void QueryNode::Print (ostream &/*s*/, int /*indent*/) const {
}



AndQueryNode::AndQueryNode () {
  leftNode = NULL;
  rightNode = NULL;
}

AndQueryNode::~AndQueryNode () {
  Free ();
}

void AndQueryNode::Calculate (IndexData &indexData, const QueryInfo &queryInfo,
                              QueryResult &result) const {
  result.Clear();

  // an And between nothing and something is nothing...
  if (leftNode == NULL || rightNode == NULL) return;

  // calculate the result from the left tree and the result
  // from the right tree
  QueryResult rightResult;
  leftNode->Calculate (indexData, queryInfo, result);
  rightNode->Calculate (indexData, queryInfo, rightResult);

  // merge the results; this can be done in place as the results
  // will always shrink with an And

  bool haveAccum = (queryInfo.sortByRank || queryInfo.needRankInfo);
  if (haveAccum && (result.ranks.size() != result.docs.size() ||
                    rightResult.ranks.size() != rightResult.docs.size())) {
    // shouldn't ever get here
    haveAccum = false;
    assert (0);
  }

  // combine document numbers and corresponding ranks
  unsigned long leftI = 0;
  unsigned long rightI = 0;
  unsigned long outI = 0;
  while (leftI < result.docs.size() &&
         rightI < rightResult.docs.size()) {
    if (result.docs[leftI] < rightResult.docs[rightI]) {
      leftI++;
    } else if (result.docs[leftI] > rightResult.docs[rightI]) {
      rightI++;
    } else {
      // the documents are equal
      result.docs[outI] = result.docs[leftI];
      if (haveAccum)
        result.ranks[outI] = result.ranks[leftI] + rightResult.ranks[rightI];
      leftI++;
      rightI++;
      outI++;
    }
  }

  // erase unused document numbers and ranks
  result.docs.erase(result.docs.begin()+outI, result.docs.end());
  if (haveAccum)
    result.ranks.erase(result.ranks.begin()+outI, result.ranks.end());

  // combine term frequency information
  if (queryInfo.needTermFreqs)
    result.termFreqs.insert (result.termFreqs.end(),
                             rightResult.termFreqs.begin(),
                             rightResult.termFreqs.end());
}
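
// Example trace of the in-place And merge: given sorted doc lists
// left = [2, 5, 9] (already held in result) and right = [5, 9, 12],
// the cursors skip 2 (left only), emit 5 and 9 with summed ranks
// (when rank accumulators are kept), and stop once the left list is
// exhausted, never touching 12. The survivors [5, 9] overwrite the
// front of result, which is what makes the in-place merge safe.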

void AndQueryNode::Free () {
  if (leftNode != NULL) {
    delete leftNode;
    leftNode = NULL;
  }
  if (rightNode != NULL) {
    delete rightNode;
    rightNode = NULL;
  }
}

void AndQueryNode::Print (ostream &s, int indent) const {
  PrintIndentText (s, "leftNode:\n", indent);
  PrintNode (s, leftNode, indent+2);
  PrintIndentText (s, "AND\n", indent);
  PrintIndentText (s, "rightNode:\n", indent);
  PrintNode (s, rightNode, indent+2);
}


OrQueryNode::OrQueryNode () {
  leftNode = NULL;
  rightNode = NULL;
}

OrQueryNode::~OrQueryNode () {
  Free ();
}

void OrQueryNode::Calculate (IndexData &indexData, const QueryInfo &queryInfo,
                             QueryResult &result) const {
  result.Clear();

  // calculate the result from the left tree and the result
  // from the right tree
  QueryResult leftResult;
  QueryResult rightResult;
  if (leftNode != NULL)
    leftNode->Calculate (indexData, queryInfo, leftResult);
  if (rightNode != NULL)
    rightNode->Calculate (indexData, queryInfo, rightResult);

  // merge the results

  bool haveAccum = (queryInfo.sortByRank || queryInfo.needRankInfo);
  if (haveAccum && (leftResult.ranks.size() != leftResult.docs.size() ||
                    rightResult.ranks.size() != rightResult.docs.size())) {
    // shouldn't ever get here
    haveAccum = false;
    assert (0);
  }

  // combine document numbers and corresponding ranks
  unsigned long leftSize = leftResult.docs.size();
  unsigned long rightSize = rightResult.docs.size();
  unsigned long leftI = 0;
  unsigned long rightI = 0;
  unsigned long leftDocNum = 0;
  unsigned long rightDocNum = 0;
  while (leftI < leftSize || rightI < rightSize) {
    // check leftI
    if (leftI < leftResult.docs.size())
      leftDocNum = leftResult.docs[leftI];
    else leftDocNum = ULONG_MAX;

    // check rightI
    if (rightI < rightResult.docs.size())
      rightDocNum = rightResult.docs[rightI];
    else rightDocNum = ULONG_MAX;

    // combine
    if (leftDocNum < rightDocNum) {
      result.docs.push_back (leftDocNum);
      if (haveAccum)
        result.ranks.push_back (leftResult.ranks[leftI]);
      leftI++;

    } else if (leftDocNum > rightDocNum) {
      result.docs.push_back (rightDocNum);
      if (haveAccum)
        result.ranks.push_back (rightResult.ranks[rightI]);
      rightI++;

    } else { // equal
      result.docs.push_back (leftDocNum);
      if (haveAccum)
        result.ranks.push_back (leftResult.ranks[leftI] +
                                rightResult.ranks[rightI]);
      leftI++;
      rightI++;
    }
  }

  // combine term frequency information
  if (queryInfo.needTermFreqs) {
    result.termFreqs.insert (result.termFreqs.end(),
                             leftResult.termFreqs.begin(),
                             leftResult.termFreqs.end());
    result.termFreqs.insert (result.termFreqs.end(),
                             rightResult.termFreqs.begin(),
                             rightResult.termFreqs.end());
  }
}
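
// Example trace of the Or merge: given left = [2, 5] and right = [5, 12],
// the output is [2, 5, 12], with document 5 receiving the sum of its two
// ranks. An exhausted list reads as ULONG_MAX, so every remaining document
// in the other list compares smaller and is copied through; this is why
// the loop condition uses || rather than &&.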

void OrQueryNode::Free () {
  if (leftNode != NULL) {
    delete leftNode;
    leftNode = NULL;
  }
  if (rightNode != NULL) {
    delete rightNode;
    rightNode = NULL;
  }
}

void OrQueryNode::Print (ostream &s, int indent) const {
  PrintIndentText (s, "leftNode:\n", indent);
  PrintNode (s, leftNode, indent+2);
  PrintIndentText (s, "OR\n", indent);
  PrintIndentText (s, "rightNode:\n", indent);
  PrintNode (s, rightNode, indent+2);
}



NotQueryNode::NotQueryNode () {
  queryNode = NULL;
  notNode = NULL;
}

NotQueryNode::~NotQueryNode () {
  Free ();
}

void NotQueryNode::Calculate (IndexData &indexData, const QueryInfo &queryInfo,
                              QueryResult &result) const {
  result.Clear();

  // check for nothing
  if (queryNode == NULL) return;
  if (notNode == NULL) {
    queryNode->Calculate (indexData, queryInfo, result);
    return;
  }

  // calculate the result from the query tree and the result
  // from the not tree
  QueryResult notResult;
  queryNode->Calculate (indexData, queryInfo, result);
  notNode->Calculate (indexData, queryInfo, notResult);

  // merge the results; this can be done in place as the results
  // will always shrink with a Not

  bool haveAccum = (queryInfo.sortByRank || queryInfo.needRankInfo);
  if (haveAccum && (result.ranks.size() != result.docs.size() ||
                    notResult.ranks.size() != notResult.docs.size())) {
    // shouldn't ever get here
    haveAccum = false;
    assert (0);
  }

  // combine document numbers and corresponding ranks
  unsigned long queryI = 0;
  unsigned long notI = 0;
  unsigned long outI = 0;
  while (queryI < result.docs.size() &&
         notI < notResult.docs.size()) {
    if (result.docs[queryI] < notResult.docs[notI]) {
      // found a document not in the notResults
      result.docs[outI] = result.docs[queryI];
      if (haveAccum)
        result.ranks[outI] = result.ranks[queryI];
      queryI++;
      outI++;
    } else if (result.docs[queryI] > notResult.docs[notI]) {
      notI++;
    } else {
      // the documents are equal, ignore both
      queryI++;
      notI++;
    }
  }

  // copy over any documents remaining after the not list is exhausted;
  // they match the query and appear in none of the not results
  while (queryI < result.docs.size()) {
    result.docs[outI] = result.docs[queryI];
    if (haveAccum)
      result.ranks[outI] = result.ranks[queryI];
    queryI++;
    outI++;
  }

  // erase unused document numbers and ranks
  result.docs.erase(result.docs.begin()+outI, result.docs.end());
  if (haveAccum)
    result.ranks.erase(result.ranks.begin()+outI, result.ranks.end());

  // combine term frequency information
  if (queryInfo.needTermFreqs)
    result.termFreqs.insert (result.termFreqs.end(),
                             notResult.termFreqs.begin(),
                             notResult.termFreqs.end());
}
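
// Example trace of the And-Not merge: given query docs [2, 5, 9] and
// not docs [5, 7], document 2 is kept (absent from the not list), 5 is
// dropped (present in both), 7 merely advances the not cursor, and 9 is
// kept by the final copy loop, leaving [2, 9]. Note that the not branch's
// term frequencies are still appended when needTermFreqs is set, even
// though its documents are excluded from the result.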

void NotQueryNode::Free () {
  if (queryNode != NULL) {
    delete queryNode;
    queryNode = NULL;
  }
  if (notNode != NULL) {
    delete notNode;
    notNode = NULL;
  }
}

void NotQueryNode::Print (ostream &s, int indent) const {
  PrintIndentText (s, "queryNode:\n", indent);
  PrintNode (s, queryNode, indent+2);
  PrintIndentText (s, "NOT\n", indent);
  PrintIndentText (s, "notNode:\n", indent);
  PrintNode (s, notNode, indent+2);
}


void TagNode::Clear () {
  UCArrayClear (tagName);
}

void TagNode::Calculate (IndexData &indexData,
                         FragRangeArray &fragRange) const {
  fragRange.erase (fragRange.begin(), fragRange.end());
  if (tagName.empty()) return;

  // get information about this tag
  block_dict_el tagEl;
  unsigned long tagElNum;
  if (!SearchBlockDictEl (indexData.dictFile, indexData.biTags,
                          indexData.bdh.entries_per_tblk,
                          indexData.bdh.tag_dict_size,
                          tagName, tagEl, tagElNum))
    return;

  // seek to the appropriate place in the inverted file
  fseek (indexData.invfFile, tagEl.invf_ptr, SEEK_SET);

  stdio_bitio_buffer buffer(indexData.invfFile);

  unsigned long pTag = tagEl.frag_occur*2;
  unsigned long B = BIO_Bblock_Init (indexData.bdh.num_frags+pTag, pTag);
  unsigned long fragNum = 0;
  unsigned long i;
  FragRange thisFrag;
  for (i=0; i<tagEl.frag_occur; i++) {
    // get start
    unsigned long delta = buffer.bblock_decode (B, NULL)-1;
    fragNum += delta;

    thisFrag.rangeStart = fragNum;

    // get end
    delta = buffer.bblock_decode (B, NULL)-1;
    fragNum += delta;

    thisFrag.rangeEnd = fragNum;
    fragRange.push_back (thisFrag);
  }

  buffer.done();
}
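
// The inverted-file entry decoded above stores each tag occurrence as two
// Bblock-coded gaps, one to the start and one to the end of the fragment
// range, each relative to the previous value accumulated in fragNum. For
// example, decoded gaps of 3, 2, 4, 1 accumulate to the fragment ranges
// [3, 5] and [9, 10].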

void TagNode::Free () {
  Clear ();
}

void TagNode::Print (ostream &s, int indent) const {
  PrintIndent (s, indent);
  s << "TAG: \"" << tagName << "\"\n";
}


void TermNode::Clear () {
  UCArrayClear (term);
  termWeight = 1;
  stemMethod = 0;
  startRange = NO_TERM_RANGE_START;
  endRange = NO_TERM_RANGE_END;
}

TermNode::TermNode () {
  Clear ();
}

void TermNode::Calculate (IndexData &indexData,
                          bool needFragFreqs,
                          FragRangeArray *fragLimits,
                          FragData &fragData) const {
  fragData.Clear ();

  // get a list of term numbers
  vector<unsigned long> equivWords;
  FindWordNumbers (indexData, term, stemMethod, equivWords);

  // get the information for each word and merge it with
  // previous results
  FragData tempFragData1;
  FragData tempFragData2;
  vector<unsigned long>::iterator here = equivWords.begin();
  vector<unsigned long>::iterator end = equivWords.end();
  while (here != end) {
    // get the information for this word
    ReadTermFragData (indexData, needFragFreqs, *here,
                      tempFragData1, fragLimits);

    // combine with last results
    tempFragData2 = fragData;
    CombineFragData (needFragFreqs, tempFragData1, tempFragData2, fragData);

    here++;
  }
}
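
// With a non-zero stemMethod a single query term can expand to several
// word numbers (for example, a stem matching both "computer" and
// "computing"), so the postings for each equivalent word are fetched and
// merged into fragData one word at a time via the tempFragData buffers.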

void TermNode::Free () {
  Clear ();
}

void TermNode::Print (ostream &s, int indent) const {
  PrintIndent (s, indent);
  s << "TERM: \"" << term << "\"\n";
  PrintIndent (s, indent+2);
  s << "termWeight: " << termWeight << "\n";
  PrintIndent (s, indent+2);
  s << "stemMethod: " << stemMethod << "\n";
  PrintIndent (s, indent+2);
  s << "startRange: " << startRange << "\n";
  PrintIndent (s, indent+2);
  s << "endRange: " << endRange << "\n";
}


ProxMatchQueryNode::ProxMatchQueryNode () {
  tagNodePtr = NULL;
}

ProxMatchQueryNode::~ProxMatchQueryNode () {
  Free ();
}

void ProxMatchQueryNode::Calculate (IndexData &indexData,
                                    const QueryInfo &queryInfo,
                                    QueryResult &result) const {
  result.Clear ();

  bool needFragFreqs = (queryInfo.sortByRank || queryInfo.needRankInfo);

  // read in the tag if needed
  FragRangeArray fragLimits;
  FragRangeArray *fragLimitsPtr = NULL;
  if (tagNodePtr != NULL) {
    (*tagNodePtr).Calculate (indexData, fragLimits);
    fragLimitsPtr = &fragLimits;
  }

  UCArray tagOrLevel = indexData.curLevel;
  if (tagNodePtr != NULL) tagOrLevel = (*tagNodePtr).tagName;

  // read in the first term
  FragData termData;
  TermNodeArray::const_iterator termHere = terms.begin(), termEnd = terms.end();
  if (termHere != termEnd) {
    (*termHere).Calculate (indexData, needFragFreqs, fragLimitsPtr, termData);

    // convert initial fragment information
    FragsToQueryResult (indexData,
                        queryInfo,
                        termData,
                        tagOrLevel,
                        (*termHere).term,
                        (*termHere).stemMethod,
                        (*termHere).termWeight,
                        result);

    termHere++;

    if (termHere == termEnd) return; // nothing more to do
  }

  // read and combine the rest of the terms
  FragData comTermData;
  while (termHere != termEnd) {
    (*termHere).Calculate (indexData, needFragFreqs,
                           fragLimitsPtr, comTermData);

    AndFragsToQueryResult (indexData,
                           queryInfo,
                           comTermData,
                           tagOrLevel,
                           (*termHere).term,
                           (*termHere).stemMethod,
                           (*termHere).termWeight,
                           result);

    AndCombineFragData (needFragFreqs, termData, comTermData,
                        (*termHere).startRange,
                        (*termHere).endRange,
                        fragLimitsPtr);
    termHere++;
  }

  // remove unwanted document numbers
  RemoveUnwantedResults (indexData,
                         queryInfo,
                         termData,
                         result);
}
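
// The first term seeds both the candidate fragments (termData) and the
// document-level result; each subsequent term is intersected in through
// AndFragsToQueryResult and AndCombineFragData, with startRange/endRange
// limiting how far apart the terms may occur. A phrase query therefore
// only keeps fragments where all its terms fall within the allowed
// word-position window.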

void ProxMatchQueryNode::Free () {
  if (tagNodePtr != NULL) {
    delete tagNodePtr;
    tagNodePtr = NULL;
  }
  terms.erase (terms.begin(), terms.end());
}

void ProxMatchQueryNode::Print (ostream &s, int indent) const {
  PrintIndentText (s, "PROXMATCH\n", indent);
  if (tagNodePtr != NULL) tagNodePtr->Print (s, indent+2);

  TermNodeArray::const_iterator here = terms.begin();
  TermNodeArray::const_iterator end = terms.end();
  while (here != end) {
    (*here).Print (s, indent+2);
    here++;
  }
}


void MGQuery (IndexData &indexData,
              const QueryInfo &queryInfo,
              const QueryNode *queryTree,
              QueryResult &result) {
  result.Clear ();

  // make sure level is current
  indexData.LoadLevel (queryInfo.docLevel);

  // do query
  if (queryTree != NULL)
    (*queryTree).Calculate (indexData, queryInfo, result);

  // make weights into ranks if needed
  unsigned long i;
  if (queryInfo.sortByRank || queryInfo.needRankInfo) {
    for (i=0; i<result.ranks.size(); i++) {
      result.ranks[i] /=
        indexData.weightData.GetLowerApproxDocWeight (result.docs[i]);
    }
  }

  unsigned long resultsSize = queryInfo.maxDocs;
  if (resultsSize == 0 || resultsSize > result.docs.size())
    resultsSize = result.docs.size();

  // sort results by rank if needed
  GTRank gtRank;
  if (queryInfo.sortByRank) {
    // need in ascending order for SelectAddHeap
    MakeHeap (result.docs.begin(), result.ranks.begin(),
              resultsSize, gtRank);
    SelectAddHeap (result.docs.begin(), result.ranks.begin(), resultsSize,
                   result.docs.begin()+resultsSize,
                   result.ranks.begin()+resultsSize,
                   result.ranks.size()-resultsSize, gtRank);

    // sort into descending order
    SortHeap (result.docs.begin(), result.ranks.begin(), resultsSize, gtRank);
  }
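
  // Rather than sorting every candidate, the three calls above perform a
  // top-k selection for k = resultsSize: MakeHeap builds a heap over the
  // first k entries, SelectAddHeap pushes in any later candidate that
  // outranks the current heap root, and SortHeap then orders the surviving
  // k entries by descending rank, costing roughly O(n log k) overall.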

  // get exact weights if needed
  if (queryInfo.exactWeights &&
      (queryInfo.sortByRank || queryInfo.needRankInfo)) {
    unsigned long exactDiskPtr =
      indexData.levels.levelInfo[indexData.curLevel].exactWeightsDiskPtr;

    for (i=0; i<resultsSize; i++) {
      result.ranks[i] = result.ranks[i] *
        indexData.weightData.GetLowerApproxDocWeight (result.docs[i]) /
        GetExactDocWeight (indexData.exactWeightsFile, exactDiskPtr,
                           result.docs[i]);
    }

    // re-sort top candidates based on exact weights
    if (queryInfo.sortByRank) {
      MakeHeap (result.docs.begin(), result.ranks.begin(),
                resultsSize, gtRank);
      SortHeap (result.docs.begin(), result.ranks.begin(),
                resultsSize, gtRank);
    }
  }

  // get rid of unwanted ranking results
  if (result.ranks.empty()) {
    // do nothing
  } else if (!queryInfo.needRankInfo) {
    result.ranks.erase(result.ranks.begin(), result.ranks.end());
  } else {
    result.ranks.erase(result.ranks.begin()+resultsSize, result.ranks.end());
  }
  result.docs.erase(result.docs.begin()+resultsSize, result.docs.end());
}
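
// Sketch of a typical call sequence (illustrative only; the QueryInfo
// fields are those used above, but opening indexData on an mgpp index is
// elided and assumed to happen elsewhere, and docLevelName is a
// hypothetical placeholder):
//
//   QueryInfo queryInfo;
//   queryInfo.docLevel = docLevelName;
//   queryInfo.maxDocs = 50;            // 0 would return all matches
//   queryInfo.sortByRank = true;
//
//   ProxMatchQueryNode *root = new ProxMatchQueryNode;
//   // ... fill root->terms with the query's TermNodes ...
//
//   QueryResult result;
//   MGQuery (indexData, queryInfo, root, result);
//   delete root; // the destructor Free()s the node's contents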