root/trunk/gsdl/src/mgpp/text/MGQuery.cpp @ 861

Revision 861, 16.7 KB (checked in by rjmcnab, 20 years ago)

fixed a few more bugs

  • Property svn:keywords set to Author Date Id Revision
/**************************************************************************
 *
 * MGQuery.cpp -- Query related functions
 * Copyright (C) 1999  Rodger McNab
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * $Id$
 *
 **************************************************************************/

#include "MGQuery.h"
#include "bitio_m_stdio.h"
#include "bitio_gen.h"
#include "Terms.h"
#include "QueryResultsSort.h"
#include <assert.h>
#include <limits.h>  // ULONG_MAX, used as a sentinel in OrQueryNode::Calculate


void PrintIndent (ostream &s, int indent) {
  while (indent-- > 0) s << " ";
}


void PrintIndentText (ostream &s, const char *text, int indent) {
  PrintIndent (s, indent);
  s << text;
}

void PrintNode (ostream &s, QueryNode *node, int indent) {
  if (node == NULL) {
    PrintIndentText (s, "NULL\n", indent);
  } else {
    node->Print (s, indent+2);
  }
}



QueryNode::QueryNode () {
}

QueryNode::~QueryNode () {
}

void QueryNode::Calculate (IndexData &/*indexData*/,
                           const QueryInfo &/*queryInfo*/,
                           QueryResult &result) const {
  result.Clear();
}

void QueryNode::Free () {
}

void QueryNode::Print (ostream &/*s*/, int /*indent*/) const {
}



AndQueryNode::AndQueryNode () {
  leftNode = NULL;
  rightNode = NULL;
}

AndQueryNode::~AndQueryNode () {
  Free ();
}

void AndQueryNode::Calculate (IndexData &indexData, const QueryInfo &queryInfo,
                              QueryResult &result) const {
  result.Clear();

  // an And between nothing and something is nothing...
  if (leftNode == NULL || rightNode == NULL) return;

  // calculate the result from the left tree and the result
  // from the right tree
  QueryResult rightResult;
  leftNode->Calculate (indexData, queryInfo, result);
  rightNode->Calculate (indexData, queryInfo, rightResult);

  // merge the results; this can be done in place as the results
  // will always shrink with an And

  bool haveAccum = (queryInfo.sortByRank || queryInfo.needRankInfo);
  if (haveAccum && (result.ranks.size() != result.docs.size() ||
                    rightResult.ranks.size() != rightResult.docs.size())) {
    // shouldn't ever get here
    haveAccum = false;
    assert (0);
  }

  // combine document numbers and corresponding ranks
  unsigned long leftI = 0;
  unsigned long rightI = 0;
  unsigned long outI = 0;
  while (leftI < result.docs.size() &&
         rightI < rightResult.docs.size()) {
    if (result.docs[leftI] < rightResult.docs[rightI]) {
      leftI++;
    } else if (result.docs[leftI] > rightResult.docs[rightI]) {
      rightI++;
    } else {
      // the documents are equal
      result.docs[outI] = result.docs[leftI];
      if (haveAccum)
        result.ranks[outI] = result.ranks[leftI] + rightResult.ranks[rightI];
      leftI++;
      rightI++;
      outI++;
    }
  }

  // erase unused document numbers and ranks
  result.docs.erase(result.docs.begin()+outI, result.docs.end());
  if (haveAccum)
    result.ranks.erase(result.ranks.begin()+outI, result.ranks.end());

  // combine term frequency information
  if (queryInfo.needTermFreqs)
    result.termFreqs.insert (result.termFreqs.end(),
                             rightResult.termFreqs.begin(),
                             rightResult.termFreqs.end());
}
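
// Example trace of the in-place And merge: given sorted doc lists
// left = [2, 5, 9] (already held in result) and right = [5, 9, 12],
// the cursors skip 2 (left only), emit 5 and 9 with summed ranks
// (when rank accumulators are kept), and stop once the left list is
// exhausted, never touching 12. The survivors [5, 9] overwrite the
// front of result, which is what makes the in-place merge safe.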

void AndQueryNode::Free () {
  if (leftNode != NULL) {
    delete leftNode;
    leftNode = NULL;
  }
  if (rightNode != NULL) {
    delete rightNode;
    rightNode = NULL;
  }
}

void AndQueryNode::Print (ostream &s, int indent) const {
  PrintIndentText (s, "leftNode:\n", indent);
  PrintNode (s, leftNode, indent+2);
  PrintIndentText (s, "AND\n", indent);
  PrintIndentText (s, "rightNode:\n", indent);
  PrintNode (s, rightNode, indent+2);
}


OrQueryNode::OrQueryNode () {
  leftNode = NULL;
  rightNode = NULL;
}

OrQueryNode::~OrQueryNode () {
  Free ();
}

void OrQueryNode::Calculate (IndexData &indexData, const QueryInfo &queryInfo,
                             QueryResult &result) const {
  result.Clear();

  // calculate the result from the left tree and the result
  // from the right tree
  QueryResult leftResult;
  QueryResult rightResult;
  if (leftNode != NULL)
    leftNode->Calculate (indexData, queryInfo, leftResult);
  if (rightNode != NULL)
    rightNode->Calculate (indexData, queryInfo, rightResult);

  // merge the results

  bool haveAccum = (queryInfo.sortByRank || queryInfo.needRankInfo);
  if (haveAccum && (leftResult.ranks.size() != leftResult.docs.size() ||
                    rightResult.ranks.size() != rightResult.docs.size())) {
    // shouldn't ever get here
    haveAccum = false;
    assert (0);
  }

  // combine document numbers and corresponding ranks
  unsigned long leftSize = leftResult.docs.size();
  unsigned long rightSize = rightResult.docs.size();
  unsigned long leftI = 0;
  unsigned long rightI = 0;
  unsigned long leftDocNum = 0;
  unsigned long rightDocNum = 0;
  while (leftI < leftSize || rightI < rightSize) {
    // check leftI
    if (leftI < leftResult.docs.size())
      leftDocNum = leftResult.docs[leftI];
    else leftDocNum = ULONG_MAX;

    // check rightI
    if (rightI < rightResult.docs.size())
      rightDocNum = rightResult.docs[rightI];
    else rightDocNum = ULONG_MAX;

    // combine
    if (leftDocNum < rightDocNum) {
      result.docs.push_back (leftDocNum);
      if (haveAccum)
        result.ranks.push_back (leftResult.ranks[leftI]);
      leftI++;

    } else if (leftDocNum > rightDocNum) {
      result.docs.push_back (rightDocNum);
      if (haveAccum)
        result.ranks.push_back (rightResult.ranks[rightI]);
      rightI++;

    } else { // equal
      result.docs.push_back (leftDocNum);
      if (haveAccum)
        result.ranks.push_back (leftResult.ranks[leftI] +
                                rightResult.ranks[rightI]);
      leftI++;
      rightI++;
    }
  }

  // combine term frequency information
  if (queryInfo.needTermFreqs) {
    result.termFreqs.insert (result.termFreqs.end(),
                             leftResult.termFreqs.begin(),
                             leftResult.termFreqs.end());
    result.termFreqs.insert (result.termFreqs.end(),
                             rightResult.termFreqs.begin(),
                             rightResult.termFreqs.end());
  }
}
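
// Example trace of the Or merge: given left = [2, 5] and right = [5, 12],
// the output is [2, 5, 12], with document 5 receiving the sum of its two
// ranks. An exhausted list reads as ULONG_MAX, so every remaining document
// in the other list compares smaller and is copied through; this is why
// the loop condition uses || rather than &&.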

void OrQueryNode::Free () {
  if (leftNode != NULL) {
    delete leftNode;
    leftNode = NULL;
  }
  if (rightNode != NULL) {
    delete rightNode;
    rightNode = NULL;
  }
}

void OrQueryNode::Print (ostream &s, int indent) const {
  PrintIndentText (s, "leftNode:\n", indent);
  PrintNode (s, leftNode, indent+2);
  PrintIndentText (s, "OR\n", indent);
  PrintIndentText (s, "rightNode:\n", indent);
  PrintNode (s, rightNode, indent+2);
}



NotQueryNode::NotQueryNode () {
  queryNode = NULL;
  notNode = NULL;
}

NotQueryNode::~NotQueryNode () {
  Free ();
}

void NotQueryNode::Calculate (IndexData &indexData, const QueryInfo &queryInfo,
                              QueryResult &result) const {
  result.Clear();

  // check for nothing
  if (queryNode == NULL) return;
  if (notNode == NULL) {
    queryNode->Calculate (indexData, queryInfo, result);
    return;
  }

  // calculate the result from the query tree and the result
  // from the not tree
  QueryResult notResult;
  queryNode->Calculate (indexData, queryInfo, result);
  notNode->Calculate (indexData, queryInfo, notResult);

  // merge the results; this can be done in place as the results
  // will always shrink with a Not

  bool haveAccum = (queryInfo.sortByRank || queryInfo.needRankInfo);
  if (haveAccum && (result.ranks.size() != result.docs.size() ||
                    notResult.ranks.size() != notResult.docs.size())) {
    // shouldn't ever get here
    haveAccum = false;
    assert (0);
  }

  // combine document numbers and corresponding ranks
  unsigned long queryI = 0;
  unsigned long notI = 0;
  unsigned long outI = 0;
  while (queryI < result.docs.size() &&
         notI < notResult.docs.size()) {
    if (result.docs[queryI] < notResult.docs[notI]) {
      // found a document not in the notResults
      result.docs[outI] = result.docs[queryI];
      if (haveAccum)
        result.ranks[outI] = result.ranks[queryI];
      queryI++;
      outI++;
    } else if (result.docs[queryI] > notResult.docs[notI]) {
      notI++;
    } else {
      // the documents are equal, ignore both
      queryI++;
      notI++;
    }
  }

  // copy over any documents remaining after the not list is exhausted;
  // they match the query and appear in none of the not results
  while (queryI < result.docs.size()) {
    result.docs[outI] = result.docs[queryI];
    if (haveAccum)
      result.ranks[outI] = result.ranks[queryI];
    queryI++;
    outI++;
  }

  // erase unused document numbers and ranks
  result.docs.erase(result.docs.begin()+outI, result.docs.end());
  if (haveAccum)
    result.ranks.erase(result.ranks.begin()+outI, result.ranks.end());

  // combine term frequency information
  if (queryInfo.needTermFreqs)
    result.termFreqs.insert (result.termFreqs.end(),
                             notResult.termFreqs.begin(),
                             notResult.termFreqs.end());
}
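
// Example trace of the And-Not merge: given query docs [2, 5, 9] and
// not docs [5, 7], document 2 is kept (absent from the not list), 5 is
// dropped (present in both), 7 merely advances the not cursor, and 9 is
// kept by the final copy loop, leaving [2, 9]. Note that the not branch's
// term frequencies are still appended when needTermFreqs is set, even
// though its documents are excluded from the result.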

void NotQueryNode::Free () {
  if (queryNode != NULL) {
    delete queryNode;
    queryNode = NULL;
  }
  if (notNode != NULL) {
    delete notNode;
    notNode = NULL;
  }
}

void NotQueryNode::Print (ostream &s, int indent) const {
  PrintIndentText (s, "queryNode:\n", indent);
  PrintNode (s, queryNode, indent+2);
  PrintIndentText (s, "NOT\n", indent);
  PrintIndentText (s, "notNode:\n", indent);
  PrintNode (s, notNode, indent+2);
}


void TagNode::Clear () {
  UCArrayClear (tagName);
}

void TagNode::Calculate (IndexData &indexData,
                         FragRangeArray &fragRange) const {
  fragRange.erase (fragRange.begin(), fragRange.end());
  if (tagName.empty()) return;

  // get information about this tag
  block_dict_el tagEl;
  unsigned long tagElNum;
  if (!SearchBlockDictEl (indexData.dictFile, indexData.biTags,
                          indexData.bdh.entries_per_tblk,
                          indexData.bdh.tag_dict_size,
                          tagName, tagEl, tagElNum))
    return;

  // seek to the appropriate place in the inverted file
  fseek (indexData.invfFile, tagEl.invf_ptr, SEEK_SET);

  stdio_bitio_buffer buffer(indexData.invfFile);

  unsigned long pTag = tagEl.frag_occur*2;
  unsigned long B = BIO_Bblock_Init (indexData.bdh.num_frags+pTag, pTag);
  unsigned long fragNum = 0;
  unsigned long i;
  FragRange thisFrag;
  for (i=0; i<tagEl.frag_occur; i++) {
    // get start
    unsigned long delta = buffer.bblock_decode (B, NULL)-1;
    fragNum += delta;

    thisFrag.rangeStart = fragNum;

    // get end
    delta = buffer.bblock_decode (B, NULL)-1;
    fragNum += delta;

    thisFrag.rangeEnd = fragNum;
    fragRange.push_back (thisFrag);
  }

  buffer.done();
}
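
// The inverted-file entry decoded above stores each tag occurrence as two
// Bblock-coded gaps, one to the start and one to the end of the fragment
// range, each relative to the previous value accumulated in fragNum. For
// example, decoded gaps of 3, 2, 4, 1 accumulate to the fragment ranges
// [3, 5] and [9, 10].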

void TagNode::Free () {
  Clear ();
}

void TagNode::Print (ostream &s, int indent) const {
  PrintIndent (s, indent);
  s << "TAG: \"" << tagName << "\"\n";
}


void TermNode::Clear () {
  UCArrayClear (term);
  termWeight = 1;
  stemMethod = 0;
  startRange = NO_TERM_RANGE_START;
  endRange = NO_TERM_RANGE_END;
}

TermNode::TermNode () {
  Clear ();
}

void TermNode::Calculate (IndexData &indexData,
                          bool needFragFreqs,
                          FragRangeArray *fragLimits,
                          FragData &fragData) const {
  fragData.Clear ();

  // get a list of term numbers
  vector<unsigned long> equivWords;
  FindWordNumbers (indexData, term, stemMethod, equivWords);

  // get the information for each word and merge it with
  // previous results
  FragData tempFragData1;
  FragData tempFragData2;
  vector<unsigned long>::iterator here = equivWords.begin();
  vector<unsigned long>::iterator end = equivWords.end();
  while (here != end) {
    // get the information for this word
    ReadTermFragData (indexData, needFragFreqs, *here,
                      tempFragData1, fragLimits);

    // combine with last results
    tempFragData2 = fragData;
    CombineFragData (needFragFreqs, tempFragData1, tempFragData2, fragData);

    here++;
  }
}
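
// With a non-zero stemMethod a single query term can expand to several
// word numbers (for example, a stem matching both "computer" and
// "computing"), so the postings for each equivalent word are fetched and
// merged into fragData one word at a time via the tempFragData buffers.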

void TermNode::Free () {
  Clear ();
}

void TermNode::Print (ostream &s, int indent) const {
  PrintIndent (s, indent);
  s << "TERM: \"" << term << "\"\n";
  PrintIndent (s, indent+2);
  s << "termWeight: " << termWeight << "\n";
  PrintIndent (s, indent+2);
  s << "stemMethod: " << stemMethod << "\n";
  PrintIndent (s, indent+2);
  s << "startRange: " << startRange << "\n";
  PrintIndent (s, indent+2);
  s << "endRange: " << endRange << "\n";
}


ProxMatchQueryNode::ProxMatchQueryNode () {
  tagNodePtr = NULL;
}

ProxMatchQueryNode::~ProxMatchQueryNode () {
  Free ();
}

void ProxMatchQueryNode::Calculate (IndexData &indexData,
                                    const QueryInfo &queryInfo,
                                    QueryResult &result) const {
  result.Clear ();

  bool needFragFreqs = (queryInfo.sortByRank || queryInfo.needRankInfo);

  // read in the tag if needed
  FragRangeArray fragLimits;
  FragRangeArray *fragLimitsPtr = NULL;
  if (tagNodePtr != NULL) {
    (*tagNodePtr).Calculate (indexData, fragLimits);
    fragLimitsPtr = &fragLimits;
  }

  UCArray tagOrLevel = indexData.curLevel;
  if (tagNodePtr != NULL) tagOrLevel = (*tagNodePtr).tagName;

  // read in the first term
  FragData termData;
  TermNodeArray::const_iterator termHere = terms.begin(), termEnd = terms.end();
  if (termHere != termEnd) {
    (*termHere).Calculate (indexData, needFragFreqs, fragLimitsPtr, termData);

    // convert initial fragment information
    FragsToQueryResult (indexData,
                        queryInfo,
                        termData,
                        tagOrLevel,
                        (*termHere).term,
                        (*termHere).stemMethod,
                        (*termHere).termWeight,
                        result);

    termHere++;

    if (termHere == termEnd) return; // nothing more to do
  }

  // read and combine the rest of the terms
  FragData comTermData;
  while (termHere != termEnd) {
    (*termHere).Calculate (indexData, needFragFreqs,
                           fragLimitsPtr, comTermData);

    AndFragsToQueryResult (indexData,
                           queryInfo,
                           comTermData,
                           tagOrLevel,
                           (*termHere).term,
                           (*termHere).stemMethod,
                           (*termHere).termWeight,
                           result);

    AndCombineFragData (needFragFreqs, termData, comTermData,
                        (*termHere).startRange,
                        (*termHere).endRange,
                        fragLimitsPtr);
    termHere++;
  }

  // remove unwanted document numbers
  RemoveUnwantedResults (indexData,
                         queryInfo,
                         termData,
                         result);
}
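
// The first term seeds both the candidate fragments (termData) and the
// document-level result; each subsequent term is intersected in through
// AndFragsToQueryResult and AndCombineFragData, with startRange/endRange
// limiting how far apart the terms may occur. A phrase query therefore
// only keeps fragments where all its terms fall within the allowed
// word-position window.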

void ProxMatchQueryNode::Free () {
  if (tagNodePtr != NULL) {
    delete tagNodePtr;
    tagNodePtr = NULL;
  }
  terms.erase (terms.begin(), terms.end());
}

void ProxMatchQueryNode::Print (ostream &s, int indent) const {
  PrintIndentText (s, "PROXMATCH\n", indent);
  if (tagNodePtr != NULL) tagNodePtr->Print (s, indent+2);

  TermNodeArray::const_iterator here = terms.begin();
  TermNodeArray::const_iterator end = terms.end();
  while (here != end) {
    (*here).Print (s, indent+2);
    here++;
  }
}


void MGQuery (IndexData &indexData,
              const QueryInfo &queryInfo,
              const QueryNode *queryTree,
              QueryResult &result) {
  result.Clear ();

  // make sure level is current
  indexData.LoadLevel (queryInfo.docLevel);

  // do query
  if (queryTree != NULL)
    (*queryTree).Calculate (indexData, queryInfo, result);

  // make weights into ranks if needed
  unsigned long i;
  if (queryInfo.sortByRank || queryInfo.needRankInfo) {
    for (i=0; i<result.ranks.size(); i++) {
      result.ranks[i] /=
        indexData.weightData.GetLowerApproxDocWeight (result.docs[i]);
    }
  }

  unsigned long resultsSize = queryInfo.maxDocs;
  if (resultsSize == 0 || resultsSize > result.docs.size())
    resultsSize = result.docs.size();

  // sort results by rank if needed
  GTRank gtRank;
  if (queryInfo.sortByRank) {
    // need in ascending order for SelectAddHeap
    MakeHeap (result.docs.begin(), result.ranks.begin(),
              resultsSize, gtRank);
    SelectAddHeap (result.docs.begin(), result.ranks.begin(), resultsSize,
                   result.docs.begin()+resultsSize,
                   result.ranks.begin()+resultsSize,
                   result.ranks.size()-resultsSize, gtRank);

    // sort into descending order
    SortHeap (result.docs.begin(), result.ranks.begin(), resultsSize, gtRank);
  }
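
  // Rather than sorting every candidate, the three calls above perform a
  // top-k selection for k = resultsSize: MakeHeap builds a heap over the
  // first k entries, SelectAddHeap pushes in any later candidate that
  // outranks the current heap root, and SortHeap then orders the surviving
  // k entries by descending rank, costing roughly O(n log k) overall.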

  // get exact weights if needed
  if (queryInfo.exactWeights &&
      (queryInfo.sortByRank || queryInfo.needRankInfo)) {
    unsigned long exactDiskPtr =
      indexData.levels.levelInfo[indexData.curLevel].exactWeightsDiskPtr;

    for (i=0; i<resultsSize; i++) {
      result.ranks[i] = result.ranks[i] *
        indexData.weightData.GetLowerApproxDocWeight (result.docs[i]) /
        GetExactDocWeight (indexData.exactWeightsFile, exactDiskPtr,
                           result.docs[i]);
    }

    // re-sort top candidates based on exact weights
    if (queryInfo.sortByRank) {
      MakeHeap (result.docs.begin(), result.ranks.begin(),
                resultsSize, gtRank);
      SortHeap (result.docs.begin(), result.ranks.begin(),
                resultsSize, gtRank);
    }
  }

  // get rid of unwanted ranking results
  if (result.ranks.empty()) {
    // do nothing
  } else if (!queryInfo.needRankInfo) {
    result.ranks.erase(result.ranks.begin(), result.ranks.end());
  } else {
    result.ranks.erase(result.ranks.begin()+resultsSize, result.ranks.end());
  }
  result.docs.erase(result.docs.begin()+resultsSize, result.docs.end());
}
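
// Sketch of a typical call sequence (illustrative only; the QueryInfo
// fields are those used above, but opening indexData on an mgpp index is
// elided and assumed to happen elsewhere, and docLevelName is a
// hypothetical placeholder):
//
//   QueryInfo queryInfo;
//   queryInfo.docLevel = docLevelName;
//   queryInfo.maxDocs = 50;            // 0 would return all matches
//   queryInfo.sortByRank = true;
//
//   ProxMatchQueryNode *root = new ProxMatchQueryNode;
//   // ... fill root->terms with the query's TermNodes ...
//
//   QueryResult result;
//   MGQuery (indexData, queryInfo, root, result);
//   delete root; // the destructor Free()s the node's contents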