source: trunk/gsdl/src/mgpp/text/MGQuery.cpp@879

Last change on this file since 879 was 879, checked in by rjmcnab, 24 years ago

fixed phrase searching bug.

  • Property svn:keywords set to Author Date Id Revision
File size: 17.1 KB
/**************************************************************************
 *
 * MGQuery.cpp -- Query related functions
 * Copyright (C) 1999  Rodger McNab
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * $Id: MGQuery.cpp 879 2000-01-31 03:01:25Z rjmcnab $
 *
 **************************************************************************/

#include "MGQuery.h"
#include "bitio_m_stdio.h"
#include "bitio_gen.h"
#include "Terms.h"
#include "QueryResultsSort.h"
#include <assert.h>
#include <limits.h>  // ULONG_MAX is used as a sentinel in the Or merge below


void PrintIndent (ostream &s, int indent) {
  while (indent-- > 0) s << " ";
}


// text is only read, so take it as const char * to accept string literals
void PrintIndentText (ostream &s, const char *text, int indent) {
  PrintIndent (s, indent);
  s << text;
}

void PrintNode (ostream &s, QueryNode *node, int indent) {
  if (node == NULL) {
    PrintIndentText (s, "NULL\n", indent);
  } else {
    node->Print (s, indent+2);
  }
}



QueryNode::QueryNode () {
}

QueryNode::~QueryNode () {
}

void QueryNode::Calculate (IndexData &/*indexData*/,
                           const QueryInfo &/*queryInfo*/,
                           QueryResult &result) const {
  result.Clear();
}

void QueryNode::Free () {
}

void QueryNode::Print (ostream &/*s*/, int /*indent*/) const {
}



AndQueryNode::AndQueryNode () {
  leftNode = NULL;
  rightNode = NULL;
}

AndQueryNode::~AndQueryNode () {
  Free ();
}

void AndQueryNode::Calculate (IndexData &indexData, const QueryInfo &queryInfo,
                              QueryResult &result) const {
  result.Clear();

  // an And between nothing and something is nothing...
  if (leftNode == NULL || rightNode == NULL) return;

  // calculate the result from the left tree and the result
  // from the right tree
  QueryResult rightResult;
  leftNode->Calculate (indexData, queryInfo, result);
  rightNode->Calculate (indexData, queryInfo, rightResult);

  // merge the results, this can be done in place as the results
  // will always shrink with an And

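  // The merge below is a classic sorted-list intersection: both docs
  // arrays are in ascending document order, so two cursors advance past
  // mismatches and keep only documents present in both lists.  For
  // example, docs {2,5,9} merged with {5,9,12} yields {5,9}, with the
  // rank of each kept document summed from both sides.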
  bool haveAccum = (queryInfo.sortByRank || queryInfo.needRankInfo);
  if (haveAccum && (result.ranks.size() != result.docs.size() ||
                    rightResult.ranks.size() != rightResult.docs.size())) {
    // shouldn't ever get here
    haveAccum = false;
    assert (0);
  }

  // combine document numbers and corresponding ranks
  unsigned long leftI = 0;
  unsigned long rightI = 0;
  unsigned long outI = 0;
  while (leftI < result.docs.size() &&
         rightI < rightResult.docs.size()) {
    if (result.docs[leftI] < rightResult.docs[rightI]) {
      leftI++;
    } else if (result.docs[leftI] > rightResult.docs[rightI]) {
      rightI++;
    } else {
      // the documents are equal
      result.docs[outI] = result.docs[leftI];
      if (haveAccum)
        result.ranks[outI] = result.ranks[leftI] + rightResult.ranks[rightI];
      leftI++;
      rightI++;
      outI++;
    }
  }

  // erase unused document numbers and ranks
  result.docs.erase(result.docs.begin()+outI, result.docs.end());
  if (haveAccum)
    result.ranks.erase(result.ranks.begin()+outI, result.ranks.end());

  // combine term frequency information
  if (queryInfo.needTermFreqs)
    result.termFreqs.insert (result.termFreqs.end(),
                             rightResult.termFreqs.begin(),
                             rightResult.termFreqs.end());
}

void AndQueryNode::Free () {
  if (leftNode != NULL) {
    delete leftNode;
    leftNode = NULL;
  }
  if (rightNode != NULL) {
    delete rightNode;
    rightNode = NULL;
  }
}

void AndQueryNode::Print (ostream &s, int indent) const {
  PrintIndentText (s, "leftNode:\n", indent);
  PrintNode (s, leftNode, indent+2);
  PrintIndentText (s, "AND\n", indent);
  PrintIndentText (s, "rightNode:\n", indent);
  PrintNode (s, rightNode, indent+2);
}


OrQueryNode::OrQueryNode () {
  leftNode = NULL;
  rightNode = NULL;
}

OrQueryNode::~OrQueryNode () {
  Free ();
}

void OrQueryNode::Calculate (IndexData &indexData, const QueryInfo &queryInfo,
                             QueryResult &result) const {
  result.Clear();

  // calculate the result from the left tree and the result
  // from the right tree
  QueryResult leftResult;
  QueryResult rightResult;
  if (leftNode != NULL)
    leftNode->Calculate (indexData, queryInfo, leftResult);
  if (rightNode != NULL)
    rightNode->Calculate (indexData, queryInfo, rightResult);

  // merge the results

  bool haveAccum = (queryInfo.sortByRank || queryInfo.needRankInfo);
  if (haveAccum && (leftResult.ranks.size() != leftResult.docs.size() ||
                    rightResult.ranks.size() != rightResult.docs.size())) {
    // shouldn't ever get here
    haveAccum = false;
    assert (0);
  }

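  // Sorted-list union: unlike the And merge this cannot be done in
  // place, so documents are appended to a fresh result.  ULONG_MAX acts
  // as a sentinel once either list is exhausted, letting a single loop
  // drain both lists.  E.g. {2,5} merged with {5,9} yields {2,5,9},
  // with the rank for document 5 summed from both sides.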
  // combine document numbers and corresponding ranks
  unsigned long leftSize = leftResult.docs.size();
  unsigned long rightSize = rightResult.docs.size();
  unsigned long leftI = 0;
  unsigned long rightI = 0;
  unsigned long leftDocNum = 0;
  unsigned long rightDocNum = 0;
  while (leftI < leftSize || rightI < rightSize) {
    // check leftI
    if (leftI < leftResult.docs.size())
      leftDocNum = leftResult.docs[leftI];
    else leftDocNum = ULONG_MAX;

    // check rightI
    if (rightI < rightResult.docs.size())
      rightDocNum = rightResult.docs[rightI];
    else rightDocNum = ULONG_MAX;

    // combine
    if (leftDocNum < rightDocNum) {
      result.docs.push_back (leftDocNum);
      if (haveAccum)
        result.ranks.push_back (leftResult.ranks[leftI]);
      leftI++;

    } else if (leftDocNum > rightDocNum) {
      result.docs.push_back (rightDocNum);
      if (haveAccum)
        result.ranks.push_back (rightResult.ranks[rightI]);
      rightI++;

    } else { // equal
      result.docs.push_back (leftDocNum);
      if (haveAccum)
        result.ranks.push_back (leftResult.ranks[leftI] +
                                rightResult.ranks[rightI]);
      leftI++;
      rightI++;
    }
  }

  // combine term frequency information
  if (queryInfo.needTermFreqs) {
    result.termFreqs.insert (result.termFreqs.end(),
                             leftResult.termFreqs.begin(),
                             leftResult.termFreqs.end());
    result.termFreqs.insert (result.termFreqs.end(),
                             rightResult.termFreqs.begin(),
                             rightResult.termFreqs.end());
  }
}

void OrQueryNode::Free () {
  if (leftNode != NULL) {
    delete leftNode;
    leftNode = NULL;
  }
  if (rightNode != NULL) {
    delete rightNode;
    rightNode = NULL;
  }
}

void OrQueryNode::Print (ostream &s, int indent) const {
  PrintIndentText (s, "leftNode:\n", indent);
  PrintNode (s, leftNode, indent+2);
  PrintIndentText (s, "OR\n", indent);
  PrintIndentText (s, "rightNode:\n", indent);
  PrintNode (s, rightNode, indent+2);
}



NotQueryNode::NotQueryNode () {
  queryNode = NULL;
  notNode = NULL;
}

NotQueryNode::~NotQueryNode () {
  Free ();
}

void NotQueryNode::Calculate (IndexData &indexData, const QueryInfo &queryInfo,
                              QueryResult &result) const {
  result.Clear();

  // check for nothing
  if (queryNode == NULL) return;
  if (notNode == NULL) {
    queryNode->Calculate (indexData, queryInfo, result);
    return;
  }

  // calculate the result from the query tree and the result
  // from the not tree
  QueryResult notResult;
  queryNode->Calculate (indexData, queryInfo, result);
  notNode->Calculate (indexData, queryInfo, notResult);

  // merge the results, this can be done in place as the results
  // will always shrink with a Not

  bool haveAccum = (queryInfo.sortByRank || queryInfo.needRankInfo);
  if (haveAccum && (result.ranks.size() != result.docs.size() ||
                    notResult.ranks.size() != notResult.docs.size())) {
    // shouldn't ever get here
    haveAccum = false;
    assert (0);
  }

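  // Sorted-list difference (AND-NOT): keep documents from the query
  // list that do not appear in the not list.  E.g. {2,5,9} minus {5}
  // yields {2,9}.  Query documents remaining after the not list is
  // exhausted all survive, so they are copied across after the main
  // loop.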
  // combine document numbers and corresponding ranks
  unsigned long queryI = 0;
  unsigned long notI = 0;
  unsigned long outI = 0;
  while (queryI < result.docs.size() &&
         notI < notResult.docs.size()) {
    if (result.docs[queryI] < notResult.docs[notI]) {
      // found a document not in the notResults
      result.docs[outI] = result.docs[queryI];
      if (haveAccum)
        result.ranks[outI] = result.ranks[queryI];
      queryI++;
      outI++;
    } else if (result.docs[queryI] > notResult.docs[notI]) {
      notI++;
    } else {
      // the documents are equal, ignore both
      queryI++;
      notI++;
    }
  }

  // copy over any query documents left after the not list was
  // exhausted; without this they would be wrongly erased below
  while (queryI < result.docs.size()) {
    result.docs[outI] = result.docs[queryI];
    if (haveAccum)
      result.ranks[outI] = result.ranks[queryI];
    queryI++;
    outI++;
  }

  // erase unused document numbers and ranks
  result.docs.erase(result.docs.begin()+outI, result.docs.end());
  if (haveAccum)
    result.ranks.erase(result.ranks.begin()+outI, result.ranks.end());

  // combine term frequency information
  if (queryInfo.needTermFreqs)
    result.termFreqs.insert (result.termFreqs.end(),
                             notResult.termFreqs.begin(),
                             notResult.termFreqs.end());
}

void NotQueryNode::Free () {
  if (queryNode != NULL) {
    delete queryNode;
    queryNode = NULL;
  }
  if (notNode != NULL) {
    delete notNode;
    notNode = NULL;
  }
}

void NotQueryNode::Print (ostream &s, int indent) const {
  PrintIndentText (s, "queryNode:\n", indent);
  PrintNode (s, queryNode, indent+2);
  PrintIndentText (s, "NOT\n", indent);
  PrintIndentText (s, "notNode:\n", indent);
  PrintNode (s, notNode, indent+2);
}


void TagNode::Clear () {
  UCArrayClear (tagName);
}

void TagNode::Calculate (IndexData &indexData,
                         FragRangeArray &fragRange) const {
  fragRange.erase (fragRange.begin(), fragRange.end());
  if (tagName.empty()) return;

  // get information about this tag
  block_dict_el tagEl;
  unsigned long tagElNum;
  if (!SearchBlockDictEl (indexData.dictFile, indexData.biTags,
                          indexData.bdh.entries_per_tblk,
                          indexData.bdh.tag_dict_size,
                          tagName, tagEl, tagElNum))
    return;

  // seek to the appropriate place in the inverted file
  fseek (indexData.invfFile, tagEl.invf_ptr, SEEK_SET);

  stdio_bitio_buffer buffer(indexData.invfFile);

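  // Each tag occurrence is stored as a (start, end) fragment pair, so
  // there are frag_occur*2 gap-encoded numbers to read.  B is the
  // block-coding parameter (mg's Bernoulli-model Bblock code) derived
  // from the expected gap size; each decoded delta is relative to the
  // previous fragment number, which is why fragNum accumulates across
  // the loop.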
  unsigned long pTag = tagEl.frag_occur*2;
  unsigned long B = BIO_Bblock_Init (indexData.bdh.num_frags+pTag, pTag);
  unsigned long fragNum = 0;
  unsigned long i;
  FragRange thisFrag;
  for (i=0; i<tagEl.frag_occur; i++) {
    // get start
    unsigned long delta = buffer.bblock_decode (B, NULL)-1;
    fragNum += delta;

    thisFrag.rangeStart = fragNum;

    // get end
    delta = buffer.bblock_decode (B, NULL)-1;
    fragNum += delta;

    thisFrag.rangeEnd = fragNum;
    fragRange.push_back (thisFrag);
  }

  buffer.done();
}

void TagNode::Free () {
  Clear ();
}

void TagNode::Print (ostream &s, int indent) const {
  PrintIndent (s, indent);
  s << "TAG: \"" << tagName << "\"\n";
}


void TermNode::Clear () {
  UCArrayClear (term);
  termWeight = 1;
  stemMethod = 0;
  startRange = NO_TERM_RANGE_START;
  endRange = NO_TERM_RANGE_END;
}

TermNode::TermNode () {
  Clear ();
}

void TermNode::Calculate (IndexData &indexData,
                          bool needFragFreqs,
                          FragRangeArray *fragLimits,
                          FragData &fragData) const {
  fragData.Clear ();

  // get a list of term numbers
  vector<unsigned long> equivWords;
  FindWordNumbers (indexData, term, stemMethod, equivWords);

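  // stemMethod controls case-folding and stemming, so one query term
  // may map to several distinct words in the dictionary; the postings
  // of all of them are read and merged below into a single frag list.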
  // get the information for each word and merge it with
  // previous results
  FragData tempFragData1;
  FragData tempFragData2;
  vector<unsigned long>::iterator here = equivWords.begin();
  vector<unsigned long>::iterator end = equivWords.end();
  while (here != end) {
    // get the information for this word
    ReadTermFragData (indexData, needFragFreqs, *here,
                      tempFragData1, fragLimits);

    // combine with last results
    tempFragData2 = fragData;
    CombineFragData (needFragFreqs, tempFragData1, tempFragData2, fragData);

    here++;
  }
}

void TermNode::Free () {
  Clear ();
}

void TermNode::Print (ostream &s, int indent) const {
  PrintIndent (s, indent);
  s << "TERM: \"" << term << "\"\n";
  PrintIndent (s, indent+2);
  s << "termWeight: " << termWeight << "\n";
  PrintIndent (s, indent+2);
  s << "stemMethod: " << stemMethod << "\n";
  PrintIndent (s, indent+2);
  s << "startRange: " << startRange << "\n";
  PrintIndent (s, indent+2);
  s << "endRange: " << endRange << "\n";
}


ProxMatchQueryNode::ProxMatchQueryNode () {
  tagNodePtr = NULL;
}

ProxMatchQueryNode::~ProxMatchQueryNode () {
  Free ();
}

void ProxMatchQueryNode::Calculate (IndexData &indexData,
                                    const QueryInfo &queryInfo,
                                    QueryResult &result) const {
  result.Clear ();

  bool needFragFreqs = (queryInfo.sortByRank || queryInfo.needRankInfo);

  // read in the tag if needed
  FragRangeArray fragLimits;
  FragRangeArray *fragLimitsPtr = NULL;
  if (tagNodePtr == NULL && terms.size() > 1) {
    // multiple terms must be compared relative to some tag
    // otherwise phrase matches could span documents
    TagNode tempTagNode;
    tempTagNode.tagName = indexData.curLevel;
    tempTagNode.Calculate (indexData, fragLimits);
    fragLimitsPtr = &fragLimits;

  } else if (tagNodePtr != NULL) {
    (*tagNodePtr).Calculate (indexData, fragLimits);
    fragLimitsPtr = &fragLimits;
  }

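  // Without fragment limits a phrase could straddle a document
  // boundary, since fragment positions are global to the index;
  // restricting candidate fragments to ranges within the current
  // level's tag keeps every match inside a single document or tagged
  // region.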
  UCArray tagOrLevel = indexData.curLevel;
  if (tagNodePtr != NULL) tagOrLevel = (*tagNodePtr).tagName;

  // read in the first term
  FragData termData;
  TermNodeArray::const_iterator termHere=terms.begin(), termEnd = terms.end();
  if (termHere != termEnd) {
    (*termHere).Calculate (indexData, needFragFreqs, fragLimitsPtr, termData);

    // convert initial fragment information
    FragsToQueryResult (indexData,
                        queryInfo,
                        termData,
                        tagOrLevel,
                        (*termHere).term,
                        (*termHere).stemMethod,
                        (*termHere).termWeight,
                        result);

    termHere++;

    if (termHere == termEnd) return; // nothing more to do
  }

  // read and combine the rest of the terms
  FragData comTermData;
  while (termHere != termEnd) {
    (*termHere).Calculate (indexData, needFragFreqs,
                           fragLimitsPtr, comTermData);

    AndFragsToQueryResult (indexData,
                           queryInfo,
                           comTermData,
                           tagOrLevel,
                           (*termHere).term,
                           (*termHere).stemMethod,
                           (*termHere).termWeight,
                           result);

    AndCombineFragData (needFragFreqs, termData, comTermData,
                        (*termHere).startRange,
                        (*termHere).endRange,
                        fragLimitsPtr);
    termHere++;
  }

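  // termData now holds only the fragments where every term matched
  // within its startRange/endRange window of the preceding terms,
  // i.e. the surviving phrase/proximity matches.  The result so far
  // may still contain documents contributed by individual terms alone,
  // so it is filtered against termData below.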
  // remove unwanted document numbers
  RemoveUnwantedResults (indexData,
                         queryInfo,
                         termData,
                         result);
}

void ProxMatchQueryNode::Free () {
  if (tagNodePtr != NULL) {
    delete tagNodePtr;
    tagNodePtr = NULL;
  }
  terms.erase (terms.begin(), terms.end());
}

void ProxMatchQueryNode::Print (ostream &s, int indent) const {
  PrintIndentText (s, "PROXMATCH\n", indent);
  if (tagNodePtr != NULL) tagNodePtr->Print (s, indent+2);

  TermNodeArray::const_iterator here = terms.begin();
  TermNodeArray::const_iterator end = terms.end();
  while (here != end) {
    (*here).Print (s, indent+2);
    here++;
  }
}


void MGQuery (IndexData &indexData,
              const QueryInfo &queryInfo,
              const QueryNode *queryTree,
              QueryResult &result) {
  result.Clear ();

  // make sure level is current
  indexData.LoadLevel (queryInfo.docLevel);

  // do query
  if (queryTree != NULL)
    (*queryTree).Calculate (indexData, queryInfo, result);

  // make weights into ranks if needed
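  // Accumulated weights become ranks by dividing by an approximation
  // of each document's length (its "lower approximate weight"), a
  // cheap stand-in for full cosine normalisation; exact per-document
  // weights are only fetched later, for the top candidates, when
  // queryInfo.exactWeights is set.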
  unsigned long i;
  if (queryInfo.sortByRank || queryInfo.needRankInfo) {
    for (i=0; i<result.ranks.size(); i++) {
      result.ranks[i] /=
        indexData.weightData.GetLowerApproxDocWeight (result.docs[i]);
    }
  }

  unsigned long resultsSize = queryInfo.maxDocs;
  if (resultsSize == 0 || resultsSize > result.docs.size())
    resultsSize = result.docs.size();

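  // Top-k selection: heap the first resultsSize entries (ascending, as
  // SelectAddHeap requires), push each remaining candidate through the
  // heap so it always holds the best resultsSize documents seen, then
  // heap-sort those into descending rank order.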
  // sort results by rank if needed
  GTRank gtRank;
  if (queryInfo.sortByRank) {
    // need in ascending order for SelectAddHeap
    MakeHeap (result.docs.begin(), result.ranks.begin(),
              resultsSize, gtRank);
    SelectAddHeap (result.docs.begin(), result.ranks.begin(), resultsSize,
                   result.docs.begin()+resultsSize,
                   result.ranks.begin()+resultsSize,
                   result.ranks.size()-resultsSize, gtRank);

    // sort into descending order
    SortHeap (result.docs.begin(), result.ranks.begin(), resultsSize, gtRank);
  }

  // get exact weights if needed
  if (queryInfo.exactWeights &&
      (queryInfo.sortByRank || queryInfo.needRankInfo)) {
    unsigned long exactDiskPtr =
      indexData.levels.levelInfo[indexData.curLevel].exactWeightsDiskPtr;

    for (i=0; i<resultsSize; i++) {
      result.ranks[i] = result.ranks[i] *
        indexData.weightData.GetLowerApproxDocWeight (result.docs[i]) /
        GetExactDocWeight (indexData.exactWeightsFile, exactDiskPtr,
                           result.docs[i]);
    }

    // re-sort top candidates based on exact weights
    if (queryInfo.sortByRank) {
      MakeHeap (result.docs.begin(), result.ranks.begin(),
                resultsSize, gtRank);
      SortHeap (result.docs.begin(), result.ranks.begin(),
                resultsSize, gtRank);
    }
  }

  // get rid of unwanted ranking results
  if (result.ranks.empty()) {
    // do nothing
  } else if (!queryInfo.needRankInfo) {
    result.ranks.erase(result.ranks.begin(), result.ranks.end());
  } else {
    result.ranks.erase(result.ranks.begin()+resultsSize, result.ranks.end());
  }
  result.docs.erase(result.docs.begin()+resultsSize, result.docs.end());
}

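/* Usage sketch (illustrative only, not part of the original file):
 * a query tree is built from the node classes above and handed to
 * MGQuery.  Member names match those defined in this file; how
 * IndexData, QueryInfo, and each TermNode's UCArray term are
 * initialised depends on the surrounding mgpp code and is elided.
 *
 *   ProxMatchQueryNode *left  = new ProxMatchQueryNode;
 *   // ... fill left->terms with one TermNode per phrase word ...
 *   ProxMatchQueryNode *right = new ProxMatchQueryNode;
 *   AndQueryNode *root = new AndQueryNode;
 *   root->leftNode  = left;   // AndQueryNode deletes its children in Free()
 *   root->rightNode = right;
 *   QueryResult result;
 *   MGQuery (indexData, queryInfo, root, result);  // docs + ranks out
 *   delete root;
 */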