source: trunk/gsdl/src/mgpp/text/MGQuery.cpp@855

Last change on this file since 855 was 855, checked in by sjboddie, 24 years ago

Rodger's new C++ mg

  • Property svn:keywords set to Author Date Id Revision
File size: 16.6 KB
/**************************************************************************
 *
 * MGQuery.cpp -- Query related functions
 * Copyright (C) 1999  Rodger McNab
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * $Id: MGQuery.cpp 855 2000-01-14 02:17:52Z sjboddie $
 *
 **************************************************************************/

#include "MGQuery.h"
#include "bitio_m_stdio.h"
#include "bitio_gen.h"
#include "Terms.h"
#include "QueryResultsSort.h"
#include <assert.h>
#include <limits.h>   // for ULONG_MAX, used as a merge sentinel below


void PrintIndent (ostream &s, int indent) {
  while (indent-- > 0) s << " ";
}


void PrintIndentText (ostream &s, char *text, int indent) {
  PrintIndent (s, indent);
  s << text;
}

void PrintNode (ostream &s, QueryNode *node, int indent) {
  if (node == NULL) {
    PrintIndentText (s, "NULL\n", indent);
  } else {
    node->Print (s, indent+2);
  }
}



QueryNode::QueryNode () {
}

QueryNode::~QueryNode () {
}

void QueryNode::Calculate (IndexData &/*indexData*/,
                           const QueryInfo &/*queryInfo*/,
                           QueryResult &result) const {
  result.Clear();
}

void QueryNode::Free () {
}

void QueryNode::Print (ostream &/*s*/, int /*indent*/) const {
}



AndQueryNode::AndQueryNode () {
  leftNode = NULL;
  rightNode = NULL;
}

AndQueryNode::~AndQueryNode () {
  Free ();
}

void AndQueryNode::Calculate (IndexData &indexData, const QueryInfo &queryInfo,
                              QueryResult &result) const {
  result.Clear();

  // an And between nothing and something is nothing...
  if (leftNode == NULL || rightNode == NULL) return;

  // calculate the result from the left tree and the result
  // from the right tree
  QueryResult rightResult;
  leftNode->Calculate (indexData, queryInfo, result);
  rightNode->Calculate (indexData, queryInfo, rightResult);

  // merge the results, this can be done in place as the results
  // will always shrink with an And

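  // both input doc lists arrive sorted by ascending document number, so
  // the intersection below is a single linear two-pointer pass; a
  // document kept by the And gets the sum of its two rank accumulators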
  bool haveAccum = (queryInfo.sortByRank || queryInfo.needRankInfo);
  if (haveAccum && (result.ranks.size() != result.docs.size() ||
                    rightResult.ranks.size() != rightResult.docs.size())) {
    // shouldn't ever get here
    haveAccum = false;
    assert (0);
  }

  // combine document numbers and corresponding ranks
  unsigned long leftI = 0;
  unsigned long rightI = 0;
  unsigned long outI = 0;
  while (leftI < result.docs.size() &&
         rightI < rightResult.docs.size()) {
    if (result.docs[leftI] < rightResult.docs[rightI]) {
      leftI++;
    } else if (result.docs[leftI] > rightResult.docs[rightI]) {
      rightI++;
    } else {
      // the documents are equal
      result.docs[outI] = result.docs[leftI];
      if (haveAccum)
        result.ranks[outI] = result.ranks[leftI] + rightResult.ranks[rightI];
      leftI++;
      rightI++;
      outI++;
    }
  }

  // erase unused document numbers and ranks
  result.docs.erase(result.docs.begin()+outI, result.docs.end());
  if (haveAccum)
    result.ranks.erase(result.ranks.begin()+outI, result.ranks.end());

  // combine term frequency information
  if (queryInfo.needTermFreqs)
    result.termFreqs.insert (result.termFreqs.end(),
                             rightResult.termFreqs.begin(),
                             rightResult.termFreqs.end());
}

void AndQueryNode::Free () {
  if (leftNode != NULL) {
    delete leftNode;
    leftNode = NULL;
  }
  if (rightNode != NULL) {
    delete rightNode;
    rightNode = NULL;
  }
}

void AndQueryNode::Print (ostream &s, int indent) const {
  PrintIndentText (s, "leftNode:\n", indent);
  PrintNode (s, leftNode, indent+2);
  PrintIndentText (s, "AND\n", indent);
  PrintIndentText (s, "rightNode:\n", indent);
  PrintNode (s, rightNode, indent+2);
}


OrQueryNode::OrQueryNode () {
  leftNode = NULL;
  rightNode = NULL;
}

OrQueryNode::~OrQueryNode () {
  Free ();
}

void OrQueryNode::Calculate (IndexData &indexData, const QueryInfo &queryInfo,
                             QueryResult &result) const {
  result.Clear();

  // calculate the result from the left tree and the result
  // from the right tree
  QueryResult leftResult;
  QueryResult rightResult;
  if (leftNode != NULL)
    leftNode->Calculate (indexData, queryInfo, leftResult);
  if (rightNode != NULL)
    rightNode->Calculate (indexData, queryInfo, rightResult);

  // merge the results

  bool haveAccum = (queryInfo.sortByRank || queryInfo.needRankInfo);
  if (haveAccum && (leftResult.ranks.size() != leftResult.docs.size() ||
                    rightResult.ranks.size() != rightResult.docs.size())) {
    // shouldn't ever get here
    haveAccum = false;
    assert (0);
  }

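  // union merge of two sorted lists: ULONG_MAX serves as a sentinel once
  // a list is exhausted, letting a single loop drain both lists and their
  // tails; documents present on both sides get their ranks summed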
  // combine document numbers and corresponding ranks
  unsigned long leftSize = leftResult.docs.size();
  unsigned long rightSize = rightResult.docs.size();
  unsigned long leftI = 0;
  unsigned long rightI = 0;
  unsigned long leftDocNum = 0;
  unsigned long rightDocNum = 0;
  while (leftI < leftSize || rightI < rightSize) {
    // check leftI
    if (leftI < leftResult.docs.size())
      leftDocNum = leftResult.docs[leftI];
    else leftDocNum = ULONG_MAX;

    // check rightI
    if (rightI < rightResult.docs.size())
      rightDocNum = rightResult.docs[rightI];
    else rightDocNum = ULONG_MAX;

    // combine
    if (leftDocNum < rightDocNum) {
      result.docs.push_back (leftDocNum);
      if (haveAccum)
        result.ranks.push_back (leftResult.ranks[leftI]);
      leftI++;

    } else if (leftDocNum > rightDocNum) {
      result.docs.push_back (rightDocNum);
      if (haveAccum)
        result.ranks.push_back (rightResult.ranks[rightI]);
      rightI++;

    } else { // equal
      result.docs.push_back (leftDocNum);
      if (haveAccum)
        result.ranks.push_back (leftResult.ranks[leftI] +
                                rightResult.ranks[rightI]);
      leftI++;
      rightI++;
    }
  }

  // combine term frequency information
  if (queryInfo.needTermFreqs) {
    result.termFreqs.insert (result.termFreqs.end(),
                             leftResult.termFreqs.begin(),
                             leftResult.termFreqs.end());
    result.termFreqs.insert (result.termFreqs.end(),
                             rightResult.termFreqs.begin(),
                             rightResult.termFreqs.end());
  }
}

void OrQueryNode::Free () {
  if (leftNode != NULL) {
    delete leftNode;
    leftNode = NULL;
  }
  if (rightNode != NULL) {
    delete rightNode;
    rightNode = NULL;
  }
}

void OrQueryNode::Print (ostream &s, int indent) const {
  PrintIndentText (s, "leftNode:\n", indent);
  PrintNode (s, leftNode, indent+2);
  PrintIndentText (s, "OR\n", indent);
  PrintIndentText (s, "rightNode:\n", indent);
  PrintNode (s, rightNode, indent+2);
}



NotQueryNode::NotQueryNode () {
  queryNode = NULL;
  notNode = NULL;
}

NotQueryNode::~NotQueryNode () {
  Free ();
}

void NotQueryNode::Calculate (IndexData &indexData, const QueryInfo &queryInfo,
                              QueryResult &result) const {
  result.Clear();

  // check for nothing
  if (queryNode == NULL) return;
  if (notNode == NULL) {
    queryNode->Calculate (indexData, queryInfo, result);
    return;
  }

  // calculate the result from the query tree and the result
  // from the not tree
  QueryResult notResult;
  queryNode->Calculate (indexData, queryInfo, result);
  notNode->Calculate (indexData, queryInfo, notResult);

  // merge the results, this can be done in place as the results
  // will always shrink with a Not

  bool haveAccum = (queryInfo.sortByRank || queryInfo.needRankInfo);
  if (haveAccum && (result.ranks.size() != result.docs.size() ||
                    notResult.ranks.size() != notResult.docs.size())) {
    // shouldn't ever get here
    haveAccum = false;
    assert (0);
  }

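  // set difference over two sorted lists: keep each query document that
  // is absent from notResult, drop those present in both, and advance
  // whichever side is behind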
  // combine document numbers and corresponding ranks
  unsigned long queryI = 0;
  unsigned long notI = 0;
  unsigned long outI = 0;
  while (queryI < result.docs.size() &&
         notI < notResult.docs.size()) {
    if (result.docs[queryI] < notResult.docs[notI]) {
      // found a document not in the notResults
      result.docs[outI] = result.docs[queryI];
      if (haveAccum)
        result.ranks[outI] = result.ranks[queryI];
      queryI++;
      outI++;
    } else if (result.docs[queryI] > notResult.docs[notI]) {
      notI++;
    } else {
      // the documents are equal, ignore both
      queryI++;
      notI++;
    }
  }

  // copy over any documents left after the not list is exhausted;
  // without this the tail of the query result would be erased below
  while (queryI < result.docs.size()) {
    result.docs[outI] = result.docs[queryI];
    if (haveAccum)
      result.ranks[outI] = result.ranks[queryI];
    queryI++;
    outI++;
  }

  // erase unused document numbers and ranks
  result.docs.erase(result.docs.begin()+outI, result.docs.end());
  if (haveAccum)
    result.ranks.erase(result.ranks.begin()+outI, result.ranks.end());

  // combine term frequency information
  if (queryInfo.needTermFreqs)
    result.termFreqs.insert (result.termFreqs.end(),
                             notResult.termFreqs.begin(),
                             notResult.termFreqs.end());
}

void NotQueryNode::Free () {
  if (queryNode != NULL) {
    delete queryNode;
    queryNode = NULL;
  }
  if (notNode != NULL) {
    delete notNode;
    notNode = NULL;
  }
}

void NotQueryNode::Print (ostream &s, int indent) const {
  PrintIndentText (s, "queryNode:\n", indent);
  PrintNode (s, queryNode, indent+2);
  PrintIndentText (s, "NOT\n", indent);
  PrintIndentText (s, "notNode:\n", indent);
  PrintNode (s, notNode, indent+2);
}


void TagNode::Clear () {
  UCArrayClear (tagName);
}

void TagNode::Calculate (IndexData &indexData,
                         FragRangeArray &fragRange) const {
  fragRange.erase (fragRange.begin(), fragRange.end());
  if (tagName.empty()) return;

  // get information about this tag
  block_dict_el tagEl;
  unsigned long tagElNum;
  if (!SearchBlockDictEl (indexData.dictFile, indexData.biTags,
                          indexData.bdh.entries_per_tblk,
                          indexData.bdh.tag_dict_size,
                          tagName, tagEl, tagElNum))
    return;

  // seek to the appropriate place in the inverted file
  fseek (indexData.invfFile, tagEl.invf_ptr, SEEK_SET);

  stdio_bitio_buffer buffer(indexData.invfFile);

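  // the tag's inverted-file entry holds frag_occur (start, end)
  // fragment-number pairs, each stored as a +1 delta against the running
  // fragNum; B is the Bblock coding parameter derived from the pointer
  // count (two per occurrence) and the total fragment count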
  unsigned long pTag = tagEl.frag_occur*2;
  unsigned long B = BIO_Bblock_Init (indexData.bdh.num_frags+pTag, pTag);
  unsigned long fragNum = 0;
  unsigned long i;
  FragRange thisFrag;
  for (i=0; i<tagEl.frag_occur; i++) {
    // get start
    unsigned long delta = buffer.bblock_decode (B, NULL)-1;
    fragNum += delta;

    thisFrag.rangeStart = fragNum;

    // get end
    delta = buffer.bblock_decode (B, NULL)-1;
    fragNum += delta;

    thisFrag.rangeEnd = fragNum;
    fragRange.push_back (thisFrag);
  }

  buffer.done();
}

void TagNode::Free () {
  Clear ();
}

void TagNode::Print (ostream &s, int indent) const {
  PrintIndent (s, indent);
  s << "TAG: \"" << tagName << "\"\n";
}


void TermNode::Clear () {
  UCArrayClear (term);
  termWeight = 1;
  stemMethod = 0;
  startRange = NO_TERM_RANGE_START;
  endRange = NO_TERM_RANGE_END;
}

TermNode::TermNode () {
  Clear ();
}

void TermNode::Calculate (IndexData &indexData,
                          bool needFragFreqs,
                          FragRangeArray *fragLimits,
                          FragData &fragData) const {
  fragData.Clear ();

  // get a list of term numbers
  vector<unsigned long> equivWords;
  FindWordNumbers (indexData, term, stemMethod, equivWords);

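  // FindWordNumbers expands the term to every word number equivalent
  // under the chosen stemming method, so the loop below reads one
  // postings list per matching surface form and merges them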
  // get the information for each word and merge it with
  // previous results
  FragData tempFragData1;
  FragData tempFragData2;
  vector<unsigned long>::iterator here = equivWords.begin();
  vector<unsigned long>::iterator end = equivWords.end();
  while (here != end) {
    // get the information for this word
    ReadTermFragData (indexData, needFragFreqs, *here,
                      tempFragData1, fragLimits);

    // combine with last results
    tempFragData2 = fragData;
    CombineFragData (needFragFreqs, tempFragData1, tempFragData2, fragData);

    here++;
  }
}

void TermNode::Free () {
  Clear ();
}

void TermNode::Print (ostream &s, int indent) const {
  PrintIndent (s, indent);
  s << "TERM: \"" << term << "\"\n";
  PrintIndent (s, indent+2);
  s << "termWeight: " << termWeight << "\n";
  PrintIndent (s, indent+2);
  s << "stemMethod: " << stemMethod << "\n";
  PrintIndent (s, indent+2);
  s << "startRange: " << startRange << "\n";
  PrintIndent (s, indent+2);
  s << "endRange: " << endRange << "\n";
}


ProxMatchQueryNode::ProxMatchQueryNode () {
  tagNodePtr = NULL;
}

ProxMatchQueryNode::~ProxMatchQueryNode () {
  Free ();
}

void ProxMatchQueryNode::Calculate (IndexData &indexData,
                                    const QueryInfo &queryInfo,
                                    QueryResult &result) const {
  result.Clear ();

  bool needFragFreqs = (queryInfo.sortByRank || queryInfo.needRankInfo);

  // read in the tag if needed
  FragRangeArray fragLimits;
  FragRangeArray *fragLimitsPtr = NULL;
  if (tagNodePtr != NULL) {
    (*tagNodePtr).Calculate (indexData, fragLimits);
    fragLimitsPtr = &fragLimits;
  }

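  // results are reported against the enclosing tag if one was given,
  // otherwise against the current document level; fragLimitsPtr (when
  // set) confines each term's occurrences to the tag's fragment ranges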
  UCArray tagOrLevel = indexData.curLevel;
  if (tagNodePtr != NULL) tagOrLevel = (*tagNodePtr).tagName;

  // read in the first term
  FragData termData;
  TermNodeArray::const_iterator termHere=terms.begin(), termEnd = terms.end();
  if (termHere != termEnd) {
    (*termHere).Calculate (indexData, needFragFreqs, fragLimitsPtr, termData);

    // convert initial fragment information
    FragsToQueryResult (indexData,
                        queryInfo,
                        termData,
                        tagOrLevel,
                        (*termHere).term,
                        (*termHere).stemMethod,
                        (*termHere).termWeight,
                        result);

    termHere++;

    if (termHere == termEnd) return; // nothing more to do
  }

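  // each subsequent term is folded in two ways: AndFragsToQueryResult
  // accumulates its contribution to the document ranks, while
  // AndCombineFragData intersects the fragment data, keeping occurrences
  // that fall within the term's [startRange, endRange] positional window
  // (this window is what implements phrase and proximity matching)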
  // read and combine the rest of the terms
  FragData comTermData;
  while (termHere != termEnd) {
    (*termHere).Calculate (indexData, needFragFreqs,
                           fragLimitsPtr, comTermData);

    AndFragsToQueryResult (indexData,
                           queryInfo,
                           comTermData,
                           tagOrLevel,
                           (*termHere).term,
                           (*termHere).stemMethod,
                           (*termHere).termWeight,
                           result);

    AndCombineFragData (needFragFreqs, termData, comTermData,
                        (*termHere).startRange,
                        (*termHere).endRange,
                        fragLimitsPtr);
    termHere++;
  }

  // remove unwanted document numbers
  RemoveUnwantedResults (indexData,
                         queryInfo,
                         termData,
                         result);
}

void ProxMatchQueryNode::Free () {
  if (tagNodePtr != NULL) {
    delete tagNodePtr;
    tagNodePtr = NULL;
  }
  terms.erase (terms.begin(), terms.end());
}

void ProxMatchQueryNode::Print (ostream &s, int indent) const {
  PrintIndentText (s, "PROXMATCH\n", indent);
  if (tagNodePtr != NULL) tagNodePtr->Print (s, indent+2);

  TermNodeArray::const_iterator here = terms.begin();
  TermNodeArray::const_iterator end = terms.end();
  while (here != end) {
    (*here).Print (s, indent+2);
    here++;
  }
}


void MGQuery (IndexData &indexData,
              const QueryInfo &queryInfo,
              const QueryNode *queryTree,
              QueryResult &result) {
  result.Clear ();

  // make sure level is current
  indexData.LoadLevel (queryInfo.docLevel);

  // do query
  if (queryTree != NULL)
    (*queryTree).Calculate (indexData, queryInfo, result);

  // make weights into ranks if needed
  unsigned long i;
  if (queryInfo.sortByRank || queryInfo.needRankInfo) {
    for (i=0; i<result.ranks.size(); i++) {
      result.ranks[i] /=
        indexData.weightData.GetLowerApproxDocWeight (result.docs[i]);
    }
  }

  unsigned long resultsSize = queryInfo.maxDocs;
  if (resultsSize == 0 || resultsSize > result.ranks.size())
    resultsSize = result.ranks.size();

  // sort results by rank if needed
  LTRank ltRank;
  if (queryInfo.sortByRank) {
    MakeHeap (result.docs.begin(), result.ranks.begin(),
              resultsSize, ltRank);
    SelectAddHeap (result.docs.begin(), result.ranks.begin(), resultsSize,
                   result.docs.begin()+resultsSize,
                   result.ranks.begin()+resultsSize,
                   result.ranks.size()-resultsSize, ltRank);
    SortHeap (result.docs.begin(), result.ranks.begin(), resultsSize, ltRank);
  }

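  // exact re-ranking: multiply the approximate weight back in and divide
  // by the true document weight fetched from the exact weights file, then
  // rebuild and sort the heap so the top documents reflect corrected ranks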
  // get exact weights if needed
  if (queryInfo.exactWeights &&
      (queryInfo.sortByRank || queryInfo.needRankInfo)) {
    unsigned long exactDiskPtr =
      indexData.levels.levelInfo[indexData.curLevel].exactWeightsDiskPtr;

    for (i=0; i<resultsSize; i++) {
      result.ranks[i] = result.ranks[i] *
        indexData.weightData.GetLowerApproxDocWeight (result.docs[i]) /
        GetExactDocWeight (indexData.exactWeightsFile, exactDiskPtr,
                           result.docs[i]);
    }

    // re-sort top candidates based on exact weights
    if (queryInfo.sortByRank) {
      MakeHeap (result.docs.begin(), result.ranks.begin(),
                resultsSize, ltRank);
      SortHeap (result.docs.begin(), result.ranks.begin(),
                resultsSize, ltRank);
    }
  }

  // get rid of unwanted ranking results
  if (!result.ranks.empty() && !queryInfo.needRankInfo) {
    result.ranks.erase(result.ranks.begin(), result.ranks.end());
  } else {
    result.ranks.erase(result.ranks.begin()+resultsSize, result.ranks.end());
  }
  result.docs.erase(result.docs.begin()+resultsSize, result.docs.end());
}
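

/* Example usage -- an illustrative sketch only, not part of the original
 * file.  It builds the query tree for `computer AND science', runs it and
 * prints the ranked document numbers.  The IndexData initialisation and
 * the UCArray helper SetCStr are assumed to be available from the
 * surrounding mgpp code, and the "Document" level name is hypothetical:
 *
 *   IndexData indexData;
 *   // ... open the collection's index files into indexData ...
 *
 *   QueryInfo queryInfo;
 *   SetCStr (queryInfo.docLevel, "Document");
 *   queryInfo.maxDocs = 10;            // 0 would mean "all matches"
 *   queryInfo.sortByRank = true;
 *   queryInfo.exactWeights = false;
 *   queryInfo.needRankInfo = true;
 *   queryInfo.needTermFreqs = false;
 *
 *   ProxMatchQueryNode *left = new ProxMatchQueryNode;
 *   TermNode term1;
 *   SetCStr (term1.term, "computer");
 *   left->terms.push_back (term1);
 *
 *   ProxMatchQueryNode *right = new ProxMatchQueryNode;
 *   TermNode term2;
 *   SetCStr (term2.term, "science");
 *   right->terms.push_back (term2);
 *
 *   AndQueryNode root;        // root's destructor frees both children
 *   root.leftNode = left;
 *   root.rightNode = right;
 *
 *   QueryResult result;
 *   MGQuery (indexData, queryInfo, &root, result);
 *   for (unsigned long d = 0; d < result.docs.size(); d++)
 *     cout << result.docs[d] << " " << result.ranks[d] << "\n";
 */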