root/main/trunk/greenstone2/common-src/indexers/mgpp/text/Terms.cpp @ 25147

Revision 25147, 21.7 KB (checked in by kjdon, 8 years ago)

merged 64_bit_Greenstone branch into trunk, rev 25139

  • Property svn:keywords set to Author Date Id Revision
/**************************************************************************
 *
 * Terms.cpp -- Query related functions
 * Copyright (C) 1999  Rodger McNab
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 **************************************************************************/

#include "Terms.h"
#include "words.h"
#include "stemmer.h"
#include "bitio_gen.h"
#include "bitio_m_stdio.h"
void QueryInfo::Clear () {
  UCArrayClear (docLevel);
  maxDocs = 0;
  sortByRank = true;
  exactWeights = false;
  needRankInfo = false;
  needTermFreqs = false;
}


void TermFreqData::Clear () {
  UCArrayClear (tag);
  UCArrayClear (term);
  equivTerms.erase(equivTerms.begin(), equivTerms.end());
  stemMethod = 0;
  matchDocs = 0;
  termFreq = 0;
}
ostream &operator<< (ostream &s, const TermFreqData &t) {
  s << "<" << t.tag << ">\"" << t.term << "\"stem("
    << t.stemMethod << ")equiv terms(";

  mg_u_long i;
  for (i=0; i<t.equivTerms.size(); ++i) {
    s << t.equivTerms[i] << ", ";
  }
  s << ")docs(" << t.matchDocs << ")"
    << "count(" << t.termFreq << ")";
  return s;
}
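
// For illustration (not in the original source): a TermFreqData with the
// hypothetical values tag = "TX", term = "comput", stemMethod = 3,
// equivTerms = {"computer", "computing"}, matchDocs = 12 and termFreq = 45
// would stream via the operator above as:
//   <TX>"comput"stem(3)equiv terms(computer, computing, )docs(12)count(45)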

bool operator== (const TermFreqData &t1, const TermFreqData &t2) {
  return ((t1.tag == t2.tag) &&
          (t1.term == t2.term) &&
          (t1.stemMethod == t2.stemMethod) &&
          (t1.equivTerms == t2.equivTerms) &&
          (t1.matchDocs == t2.matchDocs) &&
          (t1.termFreq == t2.termFreq));
}


void QueryResult::Clear () {
  docs.erase (docs.begin(), docs.end());
  ranks.erase (ranks.begin(), ranks.end());
  termFreqs.erase (termFreqs.begin(), termFreqs.end());
  actualNumDocs = 0;
}

QueryResult::QueryResult () {
  Clear ();
}

void QueryResult::printShort(ostream &s) {
  s << "termFreqs: ";
  for (mg_u_long i=0; i<termFreqs.size(); ++i)
    s << termFreqs[i] << ", ";

  s << "\nactual number of docs found: " << actualNumDocs;
  s << "\n\n";
}


ostream &operator<< (ostream &s, const QueryResult &r) {
  s << "docs: ";
  mg_u_long i;
  for (i=0; i<r.docs.size(); ++i)
    s << r.docs[i] << ", ";

  s << "\nranks: ";
  for (i=0; i<r.ranks.size(); ++i)
    s << r.ranks[i] << ", ";

  s << "\ntermFreqs: ";
  for (i=0; i<r.termFreqs.size(); ++i)
    s << r.termFreqs[i] << ", ";

  s << "\nactual number of docs found: " << r.actualNumDocs;
  s << "\n\n";

  return s;
}


bool operator== (const QueryResult &r1, const QueryResult &r2) {
  return ((r1.docs == r2.docs) &&
          (r1.ranks == r2.ranks) &&
          (r1.termFreqs == r2.termFreqs) &&
          (r1.actualNumDocs == r2.actualNumDocs));
}

//---------------------------------------------------
// new ExtQueryResult stuff
void ExtQueryResult::Clear () {
  docs.erase (docs.begin(), docs.end());
  levels.erase (levels.begin(), levels.end());
  ranks.erase (ranks.begin(), ranks.end());
  termFreqs.erase (termFreqs.begin(), termFreqs.end());
  actualNumDocs = 0;
}

ExtQueryResult::ExtQueryResult () {
  Clear ();
}

ostream &operator<< (ostream &s, const ExtQueryResult &r) {
  s << "docs: ";
  mg_u_long i;
  for (i=0; i<r.docs.size(); ++i)
    s << r.docs[i] << ", ";

  s << "\nlevels: ";
  for (i=0; i<r.levels.size(); ++i)
    s << r.levels[i] << ", ";

  s << "\nranks: ";
  for (i=0; i<r.ranks.size(); ++i)
    s << r.ranks[i] << ", ";

  s << "\ntermFreqs: ";
  for (i=0; i<r.termFreqs.size(); ++i)
    s << r.termFreqs[i] << ", ";

  s << "\nactual number of docs found: " << r.actualNumDocs;
  s << "\n\n";

  return s;
}


bool operator== (const ExtQueryResult &r1, const ExtQueryResult &r2) {
  return ((r1.docs == r2.docs) &&
          (r1.levels == r2.levels) &&
          (r1.ranks == r2.ranks) &&
          (r1.termFreqs == r2.termFreqs) &&
          (r1.actualNumDocs == r2.actualNumDocs));
}

//-------------------------------------------------------
// new BrowseQueryResult stuff
void BrowseQueryResult::Clear () {
  termFreqs.erase (termFreqs.begin(), termFreqs.end());
}

BrowseQueryResult::BrowseQueryResult () {
  Clear ();
}


ostream &operator<< (ostream &s, const BrowseQueryResult &r) {
  s << "terms: ";
  mg_u_long i;
  for (i=0; i<r.termFreqs.size(); ++i)
    s << r.termFreqs[i] << ", ";
  s << "\n\n";
  return s;
}


bool operator== (const BrowseQueryResult &r1, const BrowseQueryResult &r2) {
  return (r1.termFreqs == r2.termFreqs);
}


//--------------------------------------
void FragData::Clear () {
  matchDocs = 0;
  fragNums.erase (fragNums.begin(), fragNums.end());
  fragFreqs.erase (fragFreqs.begin(), fragFreqs.end());
}


void FindWordNumbers (IndexData &indexData,
                      const UCArray &term,
                      mg_u_long stemMethod,
                      vector<mg_u_long> &equivWords) {
  equivWords.erase (equivWords.begin(), equivWords.end());

  // If the stem method specified is not a valid one (i.e. the appropriate
  // stem index was never built), fall back to no stemming. This does not
  // apply to partial matching, which does not use the stem indexes at all.
  // (The stemMethod != 0 test also guards the stemFile[stemMethod-1]
  // lookup against underflow when no stemming was requested.)
  if (stemMethod != 0 && !(stemMethod & STEM_PARTIAL_MATCH) &&
      indexData.stemFile[stemMethod-1] == NULL) {
    cerr << "Stem index for method " << stemMethod
         << " was not built, so not doing stemming\n";
    stemMethod = 0;
  }

  /* [JFG - Mar 06: Accent folding patch] */
  /* use flag PARTIAL_MATCH */
  if (stemMethod == 0 || (stemMethod & STEM_PARTIAL_MATCH)) {
    // don't need to stem the word,
    // find the word number(s) for this term
    mg_u_long wordElNum = 0;
    mg_u_long numLevels = indexData.bdh.num_levels;
    word_block_dict_el wordDictEl;
    wordDictEl.SetNumLevels (numLevels);
    if (stemMethod == 0) {
      if (SearchWordBlockDictEl (indexData.dictFile, indexData.biWords,
                                 indexData.bdh.entries_per_wblk,
                                 indexData.bdh.word_dict_size,
                                 numLevels, term, wordDictEl, wordElNum))
        equivWords.push_back (wordElNum);

      return;
    } else {
      // partial matching
      PartialMatchSearchWordBlockDictEl (indexData.dictFile, indexData.biWords,
                                         indexData.bdh.entries_per_wblk,
                                         indexData.bdh.word_dict_size,
                                         numLevels, term, wordDictEl, equivWords,
                                         (stemMethod & STEM_CaseFolding) ? true : false);
      // TODO: Accent Folding is not handled here!!
      return;
    }
  }

  // need to stem this word and find it in the blocked stem index
  unsigned char mgWord[MAXSTEMLEN + 1];
  UCArray stemTerm;
  mg_u_long stemmerNum = 0;

  /* [JFG - Mar 06: Accent folding patch] */
  if (stemMethod > STEM_MAX) {
    // TODO: should throw an error here
    return;
  }

  stemmerNum = indexData.sih[stemMethod-1].stemmer_num;

  // convert the word to an "mg word": a length byte followed by the characters
  mgWord[0] = term.size();
  memcpy ((char *)&mgWord[1], &(term[0]), term.size());

  // stem the word
  mgpp_stemmer (stemMethod, stemmerNum, mgWord);
  // convert the result back to a UCArray
  stemTerm.insert (stemTerm.end(), &mgWord[1], &mgWord[1] + mgWord[0]);

  // look up this term in the appropriate stem dictionary
  stem_block_dict_el stemDictEl;
  mg_u_long stemElNum;
  bool result = false;

  /* [JFG - Mar 06: Accent folding patch] */
  result = SearchStemBlockDictEl (indexData.stemFile[stemMethod-1],
                                  indexData.sii[stemMethod-1],
                                  indexData.sih[stemMethod-1].entries_per_block,
                                  indexData.sih[stemMethod-1].dict_size,
                                  stemTerm,
                                  stemDictEl,
                                  stemElNum);

  if (result) {
    equivWords = stemDictEl.equivWords;
  }
}
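
// --------------------------------------------------------------------
// Illustrative sketch (not part of the original file): how a caller
// might resolve a query term to its equivalent word numbers before
// fetching postings. It assumes `indexData` is an IndexData already
// opened on a collection's index, and uses the UCArray helper SetCStr
// (assumed available from UCArray.h).
//
//   UCArray term;
//   SetCStr (term, "computer");
//   vector<mg_u_long> equivWords;
//   FindWordNumbers (indexData, term, STEM_CaseFolding, equivWords);
//   // equivWords now holds every word number whose case-folded form
//   // matches "computer"; an empty vector means no match was found.
// --------------------------------------------------------------------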


void ReadTermFragData (IndexData &indexData,
                       bool needFragFreqs,
                       mg_u_long termNum,
                       FragData &fragData,
                       FragRangeArray *fragLimits,
                       UCArray &termWord) {
  fragData.Clear();

  // look up the word in the dictionary
  mg_u_long numLevels = indexData.bdh.num_levels;
  word_block_dict_el wordDictEl;
  wordDictEl.SetNumLevels (numLevels);
  if (!SearchWordBlockDictElNum (indexData.dictFile,
                                 indexData.biWords,
                                 indexData.bdh.entries_per_wblk,
                                 indexData.bdh.word_dict_size,
                                 numLevels,
                                 termNum, wordDictEl))
    return; // nothing more to do

  fragData.matchDocs = wordDictEl.levelFreqs[indexData.curLevelNum];
  termWord = wordDictEl.el;

  // seek to the appropriate place in the inverted file
  fseek (indexData.invfFile, wordDictEl.invf_ptr, SEEK_SET);
  stdio_bitio_buffer buffer (indexData.invfFile);

  mg_u_long B = BIO_Bblock_Init (indexData.bdh.num_frags,
                                 wordDictEl.frag_occur);
  mg_u_long fragNum = 0;
  mg_u_long termFreq = 0;

  mg_u_long fragLimitI = 0;
  mg_u_long i;
  for (i=0; i<wordDictEl.frag_occur; ++i) {
    // fragment numbers are stored as gaps from the previous fragment
    fragNum += buffer.bblock_decode (B, NULL);
    if (!indexData.ifh.word_level_index) termFreq = buffer.gamma_decode (NULL);
    else termFreq = 1;

    // get the right fragment range
    if (fragLimits != NULL) {
      while (fragLimitI+1 < (*fragLimits).size() &&
             fragNum > (*fragLimits)[fragLimitI+1].rangeStart) {
        ++fragLimitI;
      }
    }

    // add the entry if it is within the limits
    if ((fragLimits == NULL) ||
        (fragLimitI < (*fragLimits).size() &&
         fragNum > (*fragLimits)[fragLimitI].rangeStart &&
         fragNum <= (*fragLimits)[fragLimitI].rangeEnd)) {
      fragData.fragNums.push_back (fragNum);
      if (needFragFreqs)
        fragData.fragFreqs.push_back (termFreq);
    }
  }

  buffer.done();
}
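
// --------------------------------------------------------------------
// Worked example (illustrative, not from the original source): if a term
// occurs in fragments 4, 7 and 15, the inverted file stores the
// Bblock-coded gaps 4, 3 and 8, and the decode loop above recovers the
// absolute numbers by accumulation:
//
//   fragNum = 0
//   fragNum += 4   ->  4
//   fragNum += 3   ->  7
//   fragNum += 8   -> 15
//
// With a word-level index every posting counts one occurrence
// (termFreq = 1); otherwise a gamma-coded within-fragment frequency
// follows each gap.
// --------------------------------------------------------------------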


void CombineFragData (bool needFragFreqs,
                      const FragData &f1,
                      const FragData &f2,
                      FragData &outFragData) {
  outFragData.Clear();

  // the new number of matching documents is the maximum
  // of the two input matching numbers of documents -- it
  // is assumed that these are at the same document level
  outFragData.matchDocs = (f1.matchDocs > f2.matchDocs) ?
    f1.matchDocs : f2.matchDocs;

  // merge the two sorted fragment lists (boolean OR)
  mg_u_long f1I = 0, f1Size = f1.fragNums.size();
  mg_u_long f2I = 0, f2Size = f2.fragNums.size();
  while (f1I < f1Size || f2I < f2Size) {
    if (f2I < f2Size &&
        (f1I >= f1Size ||
         f1.fragNums[f1I] > f2.fragNums[f2I])) {
      // f2's fragment comes first: output it
      outFragData.fragNums.push_back (f2.fragNums[f2I]);
      if (needFragFreqs)
        outFragData.fragFreqs.push_back (f2.fragFreqs[f2I]);
      ++f2I;

    } else if (f1I < f1Size &&
               (f2I >= f2Size ||
                f1.fragNums[f1I] < f2.fragNums[f2I])) {
      // f1's fragment comes first: output it
      outFragData.fragNums.push_back (f1.fragNums[f1I]);
      if (needFragFreqs)
        outFragData.fragFreqs.push_back (f1.fragFreqs[f1I]);
      ++f1I;

    } else {
      // the fragment numbers are equal: combine the two entries
      outFragData.fragNums.push_back (f1.fragNums[f1I]);
      if (needFragFreqs)
        outFragData.fragFreqs.push_back (f1.fragFreqs[f1I] + f2.fragFreqs[f2I]);
      ++f1I;
      ++f2I;
    }
  }
}
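
// --------------------------------------------------------------------
// Worked example (illustrative, not from the original source): merging
//   f1.fragNums = {3, 8, 12}  with fragFreqs {1, 2, 1}
//   f2.fragNums = {5, 8}      with fragFreqs {4, 3}
// produces
//   outFragData.fragNums  = {3, 5, 8, 12}
//   outFragData.fragFreqs = {1, 4, 5, 1}    // 8 occurs in both: 2 + 3
// i.e. the union of the two postings lists, with frequencies summed
// where a fragment appears in both inputs.
// --------------------------------------------------------------------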


void AndCombineFragData (bool needFragFreqs,
                         FragData &fragData,
                         const FragData &comFragData,
                         mg_s_long startRange,
                         mg_s_long endRange,
                         const FragRangeArray *fragLimits) {
  // sanity check on range
  if (startRange > endRange) {
    mg_s_long temp = endRange;
    endRange = startRange;
    startRange = temp;
  }

  // get min matchdocs
  if (comFragData.matchDocs < fragData.matchDocs)
    fragData.matchDocs = comFragData.matchDocs;

  mg_u_long fragDataI = 0;
  mg_u_long fragDataSize = fragData.fragNums.size();
  mg_u_long comFragDataI = 0;
  mg_u_long comFragDataSize = comFragData.fragNums.size();
  mg_u_long fragLimitI = 0;
  mg_u_long fragLimitSize = (fragLimits==NULL) ? 0 : (*fragLimits).size();
  mg_u_long outI = 0;

  while (fragDataI < fragDataSize &&
         comFragDataI < comFragDataSize) {
    mg_s_long fragNum = (mg_s_long)fragData.fragNums[fragDataI];
    mg_s_long comFragNum = (mg_s_long)comFragData.fragNums[comFragDataI];

    // go to the right fragment limit (for the com frag)
    if (fragLimits != NULL) {
      while (fragLimitI+1 < fragLimitSize &&
             comFragNum > (mg_s_long)(*fragLimits)[fragLimitI+1].rangeStart) {
        ++fragLimitI;
      }
    }

    if (fragNum <= comFragNum+startRange ||
        (fragLimits!=NULL &&
         fragNum<=(mg_s_long)(*fragLimits)[fragLimitI].rangeStart)) {
      ++fragDataI;

    } else if (fragNum > comFragNum+endRange ||
               (fragLimits!=NULL &&
                fragNum>(mg_s_long)(*fragLimits)[fragLimitI].rangeEnd)) {
      ++comFragDataI;

    } else {
      // equal and within tag
      fragData.fragNums[outI] = comFragNum;
      if (needFragFreqs) {
        fragData.fragFreqs[outI] =
          (fragData.fragFreqs[fragDataI] < comFragData.fragFreqs[comFragDataI]) ?
          fragData.fragFreqs[fragDataI] : comFragData.fragFreqs[comFragDataI];
      }
      ++fragDataI;
      ++comFragDataI;
      ++outI;
    }
  }

  // erase unused part of fragData
  fragData.fragNums.erase (fragData.fragNums.begin()+outI,
                           fragData.fragNums.end());
  if (needFragFreqs)
    fragData.fragFreqs.erase (fragData.fragFreqs.begin()+outI,
                              fragData.fragFreqs.end());
  else
    fragData.fragFreqs.erase (fragData.fragFreqs.begin(),
                              fragData.fragFreqs.end());
}
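
// --------------------------------------------------------------------
// Illustrative note (not from the original source): a fragment number f
// in fragData survives only if comFragData holds a fragment c with
//   c + startRange < f <= c + endRange,
// which is how phrase/proximity constraints are expressed. For example,
// with the hypothetical values startRange = -1 and endRange = 0 (i.e. f
// must equal c), intersecting
//   fragData.fragNums    = {4, 9, 20}
//   comFragData.fragNums = {9, 10, 21}
// keeps just {9}; when needFragFreqs is set, the surviving frequency is
// the minimum of the two sides.
// --------------------------------------------------------------------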


void FragsToQueryResult (IndexData &indexData,
                         const QueryInfo &queryInfo,
                         const FragData &termData,
                         const UCArray &tag,
                         const UCArray &term,
                         mg_u_long stemMethod,
                         mg_u_long termWeight,
                         UCArrayVector &equivTerms,
                         QueryResult &result) {
  bool needRanks = (queryInfo.sortByRank || queryInfo.needRankInfo);

  result.Clear();

  // log (N / ft)
  mg_u_long N = indexData.levels.levelInfo[indexData.curLevel].numEntries;
  float wordLog = log((double)N / (double)termData.matchDocs);

  // Wqt = fqt * log (N / ft)
  // note: terms are allowed to have a weight of zero so
  // they can be excluded from the ranking
  float Wqt = termWeight * wordLog;

  // Wdt = fdt * log (N / ft)
  float Wdt;
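  // ------------------------------------------------------------------
  // Worked example (illustrative, not from the original source): with
  // hypothetical values N = 1000 documents and ft = termData.matchDocs
  // = 100, wordLog = log(1000/100) ~= 2.303. A query weight fqt = 1
  // gives Wqt ~= 2.303, and a document where the term occurs fdt = 4
  // times gets Wdt ~= 9.210, contributing Wqt * Wdt ~= 21.21 to that
  // document's rank in the loop below.
  // ------------------------------------------------------------------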

  mg_u_long termDataI = 0;
  mg_u_long termDataSize = termData.fragNums.size();
  mg_u_long levelDocNum = 0;

  mg_u_long termDocFreq = 0;
  mg_u_long lastLevelDocNum = 0;
  mg_u_long overallwordfreq = 0;

  while (termDataI < termDataSize) {
    if (indexData.levelConverter.FragToLevel (termData.fragNums[termDataI],
                                              levelDocNum)) {
      if (levelDocNum != lastLevelDocNum) {
        if (lastLevelDocNum > 0) {
          // add this doc information
          if (needRanks) {
            Wdt = termDocFreq * wordLog;
            result.ranks.push_back (Wqt * Wdt);
          }
          result.docs.push_back (lastLevelDocNum);
        }

        lastLevelDocNum = levelDocNum;
        termDocFreq = 0;
      }

      if (needRanks) {
        termDocFreq += termData.fragFreqs[termDataI];
        overallwordfreq += termData.fragFreqs[termDataI];
      }
    }
    ++termDataI;
  }

  if (lastLevelDocNum > 0) {
    // add the last document information
    if (needRanks) {
      Wdt = termDocFreq * wordLog;
      result.ranks.push_back (Wqt * Wdt);
    }
    result.docs.push_back (lastLevelDocNum);
  }

  // add the term frequency information
  if (queryInfo.needTermFreqs) {
    TermFreqData termFreqData;
    termFreqData.tag = tag;
    termFreqData.term = term;
    termFreqData.stemMethod = stemMethod;
    termFreqData.equivTerms = equivTerms;
    termFreqData.matchDocs = termData.matchDocs;
    termFreqData.termFreq = overallwordfreq; // stays zero unless ranks were needed
    result.termFreqs.push_back (termFreqData);
  }
}

void AndFragsToQueryResult (IndexData &indexData,
                            const QueryInfo &queryInfo,
                            const FragData &termData,
                            const UCArray &tag,
                            const UCArray &term,
                            mg_u_long stemMethod,
                            mg_u_long termWeight,
                            UCArrayVector &equivTerms,
                            QueryResult &result) {
  bool needRanks = (queryInfo.sortByRank || queryInfo.needRankInfo);

  // log (N / ft)
  float wordLog =
    log((double)indexData.levels.levelInfo[indexData.curLevel].numEntries/
        (double)termData.matchDocs);

  // Wqt = fqt * log (N / ft)
  // note: terms are allowed to have a weight of zero so
  // they can be excluded from the ranking
  float Wqt = termWeight * wordLog;

  // Wdt = fdt * log (N / ft)
  float Wdt;

  mg_u_long termDataI = 0;
  mg_u_long termDataSize = termData.fragNums.size();
  mg_u_long levelDocNum = 0;

  mg_u_long termDocFreq = 0;
  mg_u_long lastLevelDocNum = 0;
  mg_u_long overallwordfreq = 0;
  mg_u_long resultI = 0;
  mg_u_long resultSize = result.docs.size();
  mg_u_long resultOutI = 0;


  while (termDataI < termDataSize) {
    if (indexData.levelConverter.FragToLevel (termData.fragNums[termDataI],
                                              levelDocNum)) {
      if (levelDocNum != lastLevelDocNum) {
        if (lastLevelDocNum > 0) {
          // add this doc information
          Wdt = termDocFreq * wordLog;

          // find this document number
          while (resultI < resultSize &&
                 result.docs[resultI] < lastLevelDocNum)
            ++resultI;

          // store the result
          if (resultI < resultSize && result.docs[resultI] == lastLevelDocNum) {
            result.docs[resultOutI] = lastLevelDocNum;
            if (needRanks)
              result.ranks[resultOutI] = result.ranks[resultI] + Wqt * Wdt;
            ++resultI;
            ++resultOutI;
          }
        }

        lastLevelDocNum = levelDocNum;
        termDocFreq = 0;
      }

      if (needRanks) {
        // braces added so that both counters are guarded by needRanks,
        // matching FragsToQueryResult above
        termDocFreq += termData.fragFreqs[termDataI];
        overallwordfreq += termData.fragFreqs[termDataI];
      }
    }

    ++termDataI;
  } // while

  if (lastLevelDocNum > 0) {
    // add the last document information
    Wdt = termDocFreq * wordLog;

    // find this document number
    while (resultI < resultSize &&
           result.docs[resultI] < lastLevelDocNum)
      ++resultI;

    // store the result
    if (resultI < resultSize && result.docs[resultI] == lastLevelDocNum) {
      result.docs[resultOutI] = lastLevelDocNum;
      if (needRanks)
        result.ranks[resultOutI] = result.ranks[resultI] + Wqt * Wdt;
      ++resultI;
      ++resultOutI;
    }
  }

  // remove unneeded entries
  result.docs.erase (result.docs.begin()+resultOutI, result.docs.end());
  if (needRanks)
    result.ranks.erase (result.ranks.begin()+resultOutI, result.ranks.end());
  else
    result.ranks.erase (result.ranks.begin(), result.ranks.end());

  // add the term frequency information
  if (queryInfo.needTermFreqs) {
    TermFreqData termFreqData;
    termFreqData.tag = tag;
    termFreqData.term = term;
    termFreqData.stemMethod = stemMethod;
    termFreqData.equivTerms = equivTerms;
    termFreqData.matchDocs = termData.matchDocs;
    termFreqData.termFreq = overallwordfreq;
    result.termFreqs.push_back (termFreqData);
  }
}


void RemoveUnwantedResults (IndexData &indexData,
                            const QueryInfo &queryInfo,
                            const FragData &termData,
                            QueryResult &result) {
  bool needRanks = (queryInfo.sortByRank || queryInfo.needRankInfo);

  mg_u_long termDataI = 0;
  mg_u_long termDataSize = termData.fragNums.size();
  mg_u_long levelDocNum = 0;

  mg_u_long lastLevelDocNum = 0;

  mg_u_long resultI = 0;
  mg_u_long resultSize = result.docs.size();
  mg_u_long resultOutI = 0;

  while (termDataI < termDataSize) {
    if (indexData.levelConverter.FragToLevel (termData.fragNums[termDataI],
                                              levelDocNum)) {
      if (levelDocNum != lastLevelDocNum) {
        if (lastLevelDocNum > 0) {
          // find this document number
          while (resultI < resultSize &&
                 result.docs[resultI] < lastLevelDocNum)
            ++resultI;

          // store the result
          if (resultI < resultSize && result.docs[resultI] == lastLevelDocNum) {
            result.docs[resultOutI] = lastLevelDocNum;
            if (needRanks)
              result.ranks[resultOutI] = result.ranks[resultI];
            ++resultI;
            ++resultOutI;
          }
        }

        lastLevelDocNum = levelDocNum;
      }
    }

    ++termDataI;
  }

  if (lastLevelDocNum > 0) {
    // find this document number
    while (resultI < resultSize &&
           result.docs[resultI] < lastLevelDocNum)
      ++resultI;

    // store the result
    if (resultI < resultSize && result.docs[resultI] == lastLevelDocNum) {
      result.docs[resultOutI] = lastLevelDocNum;
      if (needRanks)
        result.ranks[resultOutI] = result.ranks[resultI];
      ++resultI;
      ++resultOutI;
    }
  }

  // remove unneeded entries
  result.docs.erase (result.docs.begin()+resultOutI, result.docs.end());
  if (needRanks)
    result.ranks.erase (result.ranks.begin()+resultOutI, result.ranks.end());
  else
    result.ranks.erase (result.ranks.begin(), result.ranks.end());
}
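
// --------------------------------------------------------------------
// Illustrative note (not from the original source): this is a set
// intersection over sorted document lists. If result.docs = {2, 5, 9}
// and termData's fragments map to documents {5, 9, 14}, the surviving
// result is {5, 9}, with each kept document retaining its existing rank.
// --------------------------------------------------------------------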


//--------------------------------------------------------------
// functions to support full text browse

void FindNearestWordNumber (IndexData &indexData,
                            const UCArray &term,
                            mg_u_long &number) {
  // find the word number for this term
  mg_u_long wordElNum = 0;
  mg_u_long numLevels = indexData.bdh.num_levels;
  word_block_dict_el wordDictEl;
  wordDictEl.SetNumLevels (numLevels);
  if (NearestSearchWordBlockDictEl (indexData.dictFile, indexData.biWords,
                                    indexData.bdh.entries_per_wblk,
                                    indexData.bdh.word_dict_size,
                                    numLevels, term, wordDictEl, wordElNum))
    number = wordElNum;
}

void GetTermList(IndexData &indexData,
                 mg_u_long startTerm,
                 mg_u_long numTerms,
                 TermFreqArray &terms) {
  word_block_dict_el_array wordBlocks;
  TermFreqData termdata;

  terms.erase(terms.begin(), terms.end());

  SearchWordBlockDictElNumRange (indexData.dictFile, indexData.biWords,
                                 indexData.bdh.entries_per_wblk,
                                 indexData.bdh.word_dict_size,
                                 indexData.bdh.num_levels, startTerm,
                                 numTerms, wordBlocks);

  // copy the term and its frequency out of each dictionary element
  word_block_dict_el_array::iterator here = wordBlocks.begin();
  word_block_dict_el_array::iterator end = wordBlocks.end();

  while (here != end) {
    termdata.Clear();
    termdata.term = (*here).el;
    termdata.termFreq = (*here).freq;
    terms.push_back(termdata);
    ++here;
  }
}

void GetTermList(IndexData &indexData,
                 mg_u_long startTerm,
                 mg_u_long numTerms,
                 UCArrayVector &terms) {
  SearchWordBlockDictElNumRange (indexData.dictFile, indexData.biWords,
                                 indexData.bdh.entries_per_wblk,
                                 indexData.bdh.word_dict_size,
                                 indexData.bdh.num_levels, startTerm,
                                 numTerms, terms);
}
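
// --------------------------------------------------------------------
// Illustrative sketch (not part of the original file): paging through
// the dictionary for an end-user browse list. `indexData` is assumed to
// be an open IndexData; `prefix` is a hypothetical user-supplied UCArray.
//
//   mg_u_long startTerm = 0;
//   FindNearestWordNumber (indexData, prefix, startTerm);
//   TermFreqArray page;
//   GetTermList (indexData, startTerm, 20, page);  // next 20 terms
//   for (mg_u_long i = 0; i < page.size(); ++i)
//     cout << page[i] << endl;  // uses TermFreqData's operator<<
// --------------------------------------------------------------------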