root/main/trunk/greenstone2/common-src/indexers/mgpp/text/Terms.cpp @ 26137

Revision 26137, 21.8 KB (checked in by kjdon, 8 years ago)

FindWordNumbers was testing whether the appropriate stem index had been built: e.g. if you ask for a casefolded search but no casefolded index was built, the search cannot be done, so stemMethod is set to 0, which means no stemming/casefolding/accentfolding. Unfortunately, when stemMethod was already 0, the test indexed the stem file array at 0-1, which is a huge number (stemMethod is an mg_u_long). For some reason this did not cause a seg fault on 32-bit machines, but it did on 64-bit machines, which is where the bug showed up. So we must test for stemMethod being 0 before looking into the stem file array.

  • Property svn:keywords set to Author Date Id Revision
Line 
1/**************************************************************************
2 *
3 * Terms.cpp -- Query related functions
4 * Copyright (C) 1999  Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22#include "Terms.h"
23#include "words.h"
24#include "stemmer.h"
25#include "bitio_gen.h"
26#include "bitio_m_stdio.h"
27
28void QueryInfo::Clear () {
29  UCArrayClear (docLevel);
30  maxDocs = 0;
31  sortByRank = true;
32  exactWeights = false;
33  needRankInfo = false;
34  needTermFreqs = false;
35}
36
37
38
39void TermFreqData::Clear () {
40  UCArrayClear (tag);
41  UCArrayClear (term);
42  equivTerms.erase(equivTerms.begin(), equivTerms.end());
43  stemMethod = 0;
44  matchDocs = 0;
45  termFreq = 0;
46}
47
48ostream &operator<< (ostream &s, const TermFreqData &t) {
49  s << "<" << t.tag << ">\"" << t.term << "\"stem("
50    << t.stemMethod << ")equiv terms(";
51 
52  mg_u_long i;
53  for (i=0; i<t.equivTerms.size(); ++i) {
54    s << t.equivTerms[i] << ", ";
55  }
56  s <<")docs(" << t.matchDocs << ")"
57    << "count("<<t.termFreq<<")";
58  return s;
59}
60
61bool operator== (const TermFreqData &t1, const TermFreqData &t2) {
62  return ((t1.tag == t2.tag) &&
63      (t1.term == t2.term) &&
64      (t1.stemMethod == t2.stemMethod) &&
65      (t1.equivTerms == t2.equivTerms) &&
66      (t1.matchDocs == t2.matchDocs) &&
67      (t1.termFreq == t2.termFreq));
68}
69
70
71void QueryResult::Clear () {
72  docs.erase (docs.begin(), docs.end());
73  ranks.erase (ranks.begin(), ranks.end());
74  termFreqs.erase (termFreqs.begin(), termFreqs.end());
75  actualNumDocs = 0;
76}
77
78QueryResult::QueryResult () {
79  Clear ();
80}
81
82void QueryResult::printShort(ostream &s) {
83
84  s << "termFreqs: ";
85  for (mg_u_long i=0; i<termFreqs.size(); ++i)
86    s << termFreqs[i] << ", ";
87 
88  s << "\nactual number of docs found: " << actualNumDocs;
89  s << "\n\n";
90
91}
92
93
94ostream &operator<< (ostream &s, const QueryResult &r) {
95  s << "docs: ";
96  mg_u_long i;
97  for (i=0; i<r.docs.size(); ++i)
98    s << r.docs[i] << ", ";
99 
100  s << "\nranks: ";
101  for (i=0; i<r.ranks.size(); ++i)
102    s << r.ranks[i] << ", ";
103
104  s << "\ntermFreqs: ";
105  for (i=0; i<r.termFreqs.size(); ++i)
106    s << r.termFreqs[i] << ", ";
107
108  s << "\nactual number of docs found: " << r.actualNumDocs;
109  s << "\n\n";
110
111  return s;
112}
113
114
115bool operator== (const QueryResult &r1, const QueryResult &r2) {
116  return ((r1.docs == r2.docs) &&
117      (r1.ranks == r2.ranks) &&
118      (r1.termFreqs == r2.termFreqs) &&
119      (r1.actualNumDocs == r2.actualNumDocs));
120}
121
122//---------------------------------------------------
123// new ExtQueryResult stuff
124void ExtQueryResult::Clear () {
125  docs.erase (docs.begin(), docs.end());
126  levels.erase (levels.begin(), levels.end());
127  ranks.erase (ranks.begin(), ranks.end());
128  termFreqs.erase (termFreqs.begin(), termFreqs.end());
129  actualNumDocs = 0;
130}
131
132ExtQueryResult::ExtQueryResult () {
133  Clear ();
134}
135
136ostream &operator<< (ostream &s, const ExtQueryResult &r) {
137  s << "docs: ";
138  mg_u_long i;
139  for (i=0; i<r.docs.size(); ++i)
140    s << r.docs[i] << ", ";
141
142  s << "\nlevels: ";
143  for (i=0; i<r.levels.size(); ++i)
144    s << r.levels[i] << ", ";
145
146 
147  s << "\nranks: ";
148  for (i=0; i<r.ranks.size(); ++i)
149    s << r.ranks[i] << ", ";
150
151  s << "\ntermFreqs: ";
152  for (i=0; i<r.termFreqs.size(); ++i)
153    s << r.termFreqs[i] << ", ";
154  s << "\nactual number of docs found: " << r.actualNumDocs;
155  s << "\n\n";
156
157  return s;
158}
159
160
161bool operator== (const ExtQueryResult &r1, const ExtQueryResult &r2) {
162  return ((r1.docs == r2.docs) &&
163      (r1.levels == r2.levels) &&
164      (r1.ranks == r2.ranks) &&
165      (r1.termFreqs == r2.termFreqs) &&
166      (r1.actualNumDocs == r2.actualNumDocs));
167}
168
169//-------------------------------------------------------
170// new BrowseQueryResult stuff
171void BrowseQueryResult::Clear () {
172  termFreqs.erase (termFreqs.begin(), termFreqs.end());
173}
174
175BrowseQueryResult::BrowseQueryResult () {
176  Clear ();
177}
178
179
180
181ostream &operator<< (ostream &s, const BrowseQueryResult &r) {
182  s << "terms: ";
183  mg_u_long i;
184  for (i=0; i<r.termFreqs.size(); ++i)
185    s << r.termFreqs[i] << ", ";
186    s << "\n\n";
187  return s;
188}
189
190
191bool operator== (const BrowseQueryResult &r1, const BrowseQueryResult &r2) {
192  return ((r1.termFreqs == r2.termFreqs));
193     
194}
195
196
197
198
199//--------------------------------------
200void FragData::Clear () {
201  matchDocs = 0;
202  fragNums.erase (fragNums.begin(), fragNums.end());
203  fragFreqs.erase (fragFreqs.begin(), fragFreqs.end());
204}
205
206
207void FindWordNumbers (IndexData &indexData,
208              const UCArray &term,
209              mg_u_long stemMethod,
210              vector<mg_u_long> &equivWords) {
211  equivWords.erase (equivWords.begin(), equivWords.end());
212
213  // if the stem method specified is not a valid one (i.e. there was no appropriate stem index, then we set it to 0)
214  // unless we have partial matching, in which case we are not doing stem indexes anyway.
215
216  if (!(stemMethod & STEM_PARTIAL_MATCH)) {
217
218    if(stemMethod > STEM_MAX) {
219      cerr << "Stem method "<<stemMethod<< " is greater than maximum allowed ("<<STEM_MAX<<"). Not doing stemming\n";
220      stemMethod=0;
221    }
222    else if (stemMethod > 0 && indexData.stemFile[stemMethod-1] == NULL) {
223      cerr << "Stem index for method "<<stemMethod<< " was not built, so not doing stemming\n";
224      stemMethod = 0;
225    }
226  }
227  /* [JFG - Mar 06: Accent folding patch] */
228  /* use flag PARTIAL_MATCH */ 
229  if (stemMethod == 0 || (stemMethod & STEM_PARTIAL_MATCH)) {
230    // don't need to stem the word,
231    // find the word number(s) for this term
232    mg_u_long wordElNum = 0;
233    mg_u_long numLevels = indexData.bdh.num_levels;
234    word_block_dict_el wordDictEl;
235    wordDictEl.SetNumLevels (numLevels);
236    if (stemMethod == 0) {
237      if (SearchWordBlockDictEl (indexData.dictFile, indexData.biWords,
238                 indexData.bdh.entries_per_wblk,
239                 indexData.bdh.word_dict_size,
240                 numLevels, term, wordDictEl, wordElNum))
241    equivWords.push_back (wordElNum);
242     
243      return;
244    } else {
245      // partial matching,
246      PartialMatchSearchWordBlockDictEl (indexData.dictFile, indexData.biWords, indexData.bdh.entries_per_wblk, indexData.bdh.word_dict_size, numLevels, term, wordDictEl, equivWords, (stemMethod & STEM_CaseFolding)? true : false);
247      // TODO: Accent Folding is not handled here!!
248      return;
249    }
250  }
251             
252  // need to stem this word and find it in the blocked stem index
253  unsigned char mgWord[MAXSTEMLEN + 1];
254  UCArray stemTerm;
255  mg_u_long stemmerNum = 0;
256
257
258  stemmerNum = indexData.sih[stemMethod-1].stemmer_num;
259 
260  // convert the word to an "mg word"
261  mgWord[0] = term.size();
262  memcpy ((char *)&mgWord[1], &(term[0]), term.size());
263 
264  // stem the word
265  mgpp_stemmer (stemMethod, stemmerNum, mgWord);
266  // convert the result back to a UCArray
267  stemTerm.insert (stemTerm.end(), &mgWord[1], &mgWord[1] + mgWord[0]);
268
269  // need to look up this term in the appropriate dictionary
270  stem_block_dict_el stemDictEl;
271  mg_u_long stemElNum;
272  bool result = false;
273 
274  /* [JFG - Mar 06: Accent folding patch] */
275  result = SearchStemBlockDictEl (indexData.stemFile[stemMethod-1],
276               indexData.sii[stemMethod-1],
277               indexData.sih[stemMethod-1].entries_per_block,
278               indexData.sih[stemMethod-1].dict_size,
279               stemTerm,
280               stemDictEl,
281               stemElNum);
282 
283  if (result) {
284    equivWords = stemDictEl.equivWords; 
285  }
286}
287
288
289
290void ReadTermFragData (IndexData &indexData,
291               bool needFragFreqs,
292               mg_u_long termNum,
293               FragData &fragData,
294               FragRangeArray *fragLimits,
295               UCArray & termWord) {
296  fragData.Clear();
297
298  // look up the word in the dictionary
299  mg_u_long numLevels = indexData.bdh.num_levels;
300  word_block_dict_el wordDictEl;
301  wordDictEl.SetNumLevels (numLevels);
302  if (!SearchWordBlockDictElNum (indexData.dictFile,
303                 indexData.biWords,
304                 indexData.bdh.entries_per_wblk,
305                 indexData.bdh.word_dict_size,
306                 numLevels,
307                 termNum, wordDictEl))
308    return; // nothing more to do
309
310  fragData.matchDocs = wordDictEl.levelFreqs[indexData.curLevelNum];
311  termWord = wordDictEl.el;
312  // seek to the appropriate place in the inverted file
313  fseek (indexData.invfFile, wordDictEl.invf_ptr, SEEK_SET);
314  stdio_bitio_buffer buffer (indexData.invfFile);
315   
316  mg_u_long B = BIO_Bblock_Init (indexData.bdh.num_frags,
317                     wordDictEl.frag_occur);
318  mg_u_long fragNum = 0;
319  mg_u_long termFreq = 0;
320
321  mg_u_long fragLimitI = 0;
322  mg_u_long i;
323  for (i=0; i<wordDictEl.frag_occur; ++i) {
324    fragNum += buffer.bblock_decode (B, NULL);
325    if (!indexData.ifh.word_level_index) termFreq = buffer.gamma_decode (NULL);
326    else termFreq = 1;
327
328    // get the right fragment range
329    if (fragLimits != NULL) {
330      while (fragLimitI+1 < (*fragLimits).size() &&
331         fragNum > (*fragLimits)[fragLimitI+1].rangeStart) {
332    ++fragLimitI;
333      }
334    }
335
336    // add the entry if it is within the limits
337    if ((fragLimits == NULL) ||
338    (fragLimitI < (*fragLimits).size() &&
339     fragNum > (*fragLimits)[fragLimitI].rangeStart &&
340     fragNum <= (*fragLimits)[fragLimitI].rangeEnd)) {
341      fragData.fragNums.push_back (fragNum);
342      if (needFragFreqs)
343    fragData.fragFreqs.push_back (termFreq);
344    }
345  }
346
347  buffer.done();
348}
349
350
351void CombineFragData (bool needFragFreqs,
352              const FragData &f1,
353              const FragData &f2,
354              FragData &outFragData) {
355  outFragData.Clear();
356
357  // the new number of matching documents is the maximum
358  // of the two input matching number of documents -- it
359  // is assumed that these are at the same document level
360  outFragData.matchDocs = (f1.matchDocs > f2.matchDocs) ?
361    f1.matchDocs : f2.matchDocs;
362
363  // do or
364  mg_u_long f1I = 0, f1Size = f1.fragNums.size();
365  mg_u_long f2I = 0, f2Size = f2.fragNums.size();
366  while (f1I < f1Size || f2I < f2Size) {
367    if (f2I < f2Size &&
368    (f1I >= f1Size ||
369     f1.fragNums[f1I] > f2.fragNums[f2I])) {
370      // output f2I
371      outFragData.fragNums.push_back (f2.fragNums[f2I]);
372      if (needFragFreqs)
373    outFragData.fragFreqs.push_back (f2.fragFreqs[f2I]);
374      ++f2I;
375     
376    } else if (f1I < f1Size &&
377           (f2I >= f2Size ||
378        f1.fragNums[f1I] < f2.fragNums[f2I])) {
379      // output f1I
380      outFragData.fragNums.push_back (f1.fragNums[f1I]);
381      if (needFragFreqs)
382    outFragData.fragFreqs.push_back (f1.fragFreqs[f1I]);
383      ++f1I;
384     
385    } else {
386      // must be equal combine f1I and f2I
387      outFragData.fragNums.push_back (f1.fragNums[f1I]);
388      if (needFragFreqs)
389    outFragData.fragFreqs.push_back (f1.fragFreqs[f1I]+f2.fragFreqs[f2I]);
390      ++f1I;
391      ++f2I;
392    }
393  }
394}
395
396
397void AndCombineFragData (bool needFragFreqs,
398             FragData &fragData,
399             const FragData &comFragData,
400             mg_s_long startRange,
401             mg_s_long endRange,
402             const FragRangeArray *fragLimits) {
403  // sanity check on range
404  if (startRange > endRange) {
405    mg_s_long temp = endRange;
406    endRange = startRange;
407    startRange = temp;
408  }
409
410  // get min matchdocs
411  if (comFragData.matchDocs < fragData.matchDocs)
412    fragData.matchDocs = comFragData.matchDocs;
413 
414  mg_u_long fragDataI = 0;
415  mg_u_long fragDataSize = fragData.fragNums.size();
416  mg_u_long comFragDataI = 0;
417  mg_u_long comFragDataSize = comFragData.fragNums.size();
418  mg_u_long fragLimitI = 0;
419  mg_u_long fragLimitSize = (fragLimits==NULL) ? 0 : (*fragLimits).size();
420  mg_u_long outI = 0;
421
422  while (fragDataI < fragDataSize &&
423     comFragDataI < comFragDataSize) {
424    mg_s_long fragNum = (mg_s_long)fragData.fragNums[fragDataI];
425    mg_s_long comFragNum = (mg_s_long)comFragData.fragNums[comFragDataI];
426   
427    // go to the right fragment limit (for the com frag)
428    if (fragLimits != NULL) {
429      while (fragLimitI+1 < fragLimitSize &&
430         comFragNum > (mg_s_long)(*fragLimits)[fragLimitI+1].rangeStart) {
431    ++fragLimitI;
432      }
433    }
434
435    if (fragNum <= comFragNum+startRange ||
436    (fragLimits!=NULL &&
437     fragNum<=(mg_s_long)(*fragLimits)[fragLimitI].rangeStart)) {
438      ++fragDataI;
439     
440    } else if (fragNum > comFragNum+endRange ||
441           (fragLimits!=NULL &&
442        fragNum>(mg_s_long)(*fragLimits)[fragLimitI].rangeEnd)) {
443      ++comFragDataI;
444     
445    } else {
446      // equal and within tag
447      fragData.fragNums[outI] = comFragNum;
448      if (needFragFreqs) {
449    fragData.fragFreqs[outI] =
450      (fragData.fragFreqs[fragDataI] < comFragData.fragFreqs[comFragDataI]) ?
451      fragData.fragFreqs[fragDataI] : comFragData.fragFreqs[comFragDataI];
452      }
453      ++fragDataI;
454      ++comFragDataI;
455      ++outI;
456    }
457  }
458
459  // erase unused part of fragData
460  fragData.fragNums.erase (fragData.fragNums.begin()+outI,
461               fragData.fragNums.end());
462  if (needFragFreqs)
463    fragData.fragFreqs.erase (fragData.fragFreqs.begin()+outI,
464                  fragData.fragFreqs.end());
465  else
466    fragData.fragFreqs.erase (fragData.fragFreqs.begin(),
467                  fragData.fragFreqs.end());
468}
469
470
471void FragsToQueryResult (IndexData &indexData,
472             const QueryInfo &queryInfo,
473             const FragData &termData,
474             const UCArray &tag,
475             const UCArray &term,
476             mg_u_long stemMethod,
477             mg_u_long termWeight,
478             UCArrayVector &equivTerms,
479             QueryResult &result) {
480  bool needRanks = (queryInfo.sortByRank || queryInfo.needRankInfo);
481 
482  result.Clear();
483
484  // log (N / ft)
485  mg_u_long N = indexData.levels.levelInfo[indexData.curLevel].numEntries;
486  float wordLog = log((double)N / (double)termData.matchDocs);
487
488  // Wqt = fqt * log (N / ft)
489  // note: terms are allowed to have a weight of zero so
490  // they can be excluded from the ranking
491  float Wqt = termWeight * wordLog;
492
493  // Wdt = fdt * log (N / ft)
494  float Wdt;
495 
496  mg_u_long termDataI = 0;
497  mg_u_long termDataSize = termData.fragNums.size();
498  mg_u_long levelDocNum = 0;
499 
500  mg_u_long termDocFreq = 0;
501  mg_u_long lastLevelDocNum = 0;
502  mg_u_long overallwordfreq = 0;
503 
504  while (termDataI < termDataSize) {
505    if (indexData.levelConverter.FragToLevel (termData.fragNums[termDataI],
506                          levelDocNum)) {
507      if (levelDocNum != lastLevelDocNum) {
508    if (lastLevelDocNum > 0) {
509      // add this doc information
510      if (needRanks) {
511        Wdt = termDocFreq * wordLog;
512        result.ranks.push_back (Wqt * Wdt);
513      }
514      result.docs.push_back (lastLevelDocNum);
515    }
516   
517    lastLevelDocNum = levelDocNum;
518    termDocFreq = 0;
519      }
520
521      if (needRanks){
522    termDocFreq += termData.fragFreqs[termDataI];
523    overallwordfreq += termData.fragFreqs[termDataI];
524      }
525    }
526    ++termDataI;
527  }
528
529  if (lastLevelDocNum > 0) {
530    // add the last document information
531    if (needRanks) {
532      Wdt = termDocFreq * wordLog;
533      result.ranks.push_back (Wqt * Wdt);
534    }
535    result.docs.push_back (lastLevelDocNum);
536  }
537
538  // add the term frequency information
539  if (queryInfo.needTermFreqs) {
540    TermFreqData termFreqData;
541    termFreqData.tag = tag;
542    termFreqData.term = term;
543    termFreqData.stemMethod = stemMethod;
544    termFreqData.equivTerms = equivTerms;
545    termFreqData.matchDocs = termData.matchDocs;
546    termFreqData.termFreq = overallwordfreq; // will be zero if needRankInfo
547                                              //not true
548    result.termFreqs.push_back (termFreqData);
549  }
550}
551
552void AndFragsToQueryResult (IndexData &indexData,
553                const QueryInfo &queryInfo,
554                const FragData &termData,
555                const UCArray &tag,
556                const UCArray &term,
557                mg_u_long stemMethod,
558                mg_u_long termWeight,
559                UCArrayVector &equivTerms,
560                QueryResult &result) {
561  bool needRanks = (queryInfo.sortByRank || queryInfo.needRankInfo);
562 
563  // log (N / ft)
564  float wordLog =
565    log((double)indexData.levels.levelInfo[indexData.curLevel].numEntries/
566    (double)termData.matchDocs);
567
568  // Wqt = fqt * log (N / ft)
569  // note: terms are allowed to have a weight of zero so
570  // they can be excluded from the ranking
571  float Wqt = termWeight * wordLog;
572
573  // Wdt = fdt * log (N / ft)
574  float Wdt;
575 
576  mg_u_long termDataI = 0;
577  mg_u_long termDataSize = termData.fragNums.size();
578  mg_u_long levelDocNum = 0;
579 
580  mg_u_long termDocFreq = 0;
581  mg_u_long lastLevelDocNum = 0;
582  mg_u_long overallwordfreq = 0;
583  mg_u_long resultI = 0;
584  mg_u_long resultSize = result.docs.size();
585  mg_u_long resultOutI = 0;
586 
587 
588  while (termDataI < termDataSize) {
589    if (indexData.levelConverter.FragToLevel (termData.fragNums[termDataI],
590                          levelDocNum)) {
591      if (levelDocNum != lastLevelDocNum) {
592    if (lastLevelDocNum > 0) {
593      // add this doc information
594      Wdt = termDocFreq * wordLog;
595     
596      // find this document number
597      while (resultI < resultSize &&
598         result.docs[resultI] < lastLevelDocNum)
599        ++resultI;
600     
601      // store the result
602      if (resultI < resultSize && result.docs[resultI] == lastLevelDocNum) {
603        result.docs[resultOutI] = lastLevelDocNum;
604        if (needRanks)
605          result.ranks[resultOutI] = result.ranks[resultI] + Wqt * Wdt;
606        ++resultI;
607        ++resultOutI;
608      }
609    }
610   
611    lastLevelDocNum = levelDocNum;
612    termDocFreq = 0;
613      }
614
615      if (needRanks)
616    termDocFreq += termData.fragFreqs[termDataI];
617     overallwordfreq += termData.fragFreqs[termDataI];
618    }
619   
620    ++termDataI;
621  } // while
622
623  if (lastLevelDocNum > 0) {
624    // add the last document information
625    Wdt = termDocFreq * wordLog;
626
627    // find this document number
628    while (resultI < resultSize &&
629       result.docs[resultI] < lastLevelDocNum)
630      ++resultI;
631   
632    // store the result
633    if (resultI < resultSize && result.docs[resultI] == lastLevelDocNum) {
634      result.docs[resultOutI] = lastLevelDocNum;
635      if (needRanks)
636    result.ranks[resultOutI] = result.ranks[resultI] + Wqt * Wdt;
637      ++resultI;
638      ++resultOutI;
639    }
640  }
641
642  // remove unneeded entries
643  result.docs.erase (result.docs.begin()+resultOutI, result.docs.end());
644  if (needRanks)
645    result.ranks.erase (result.ranks.begin()+resultOutI, result.ranks.end());
646  else
647    result.ranks.erase (result.ranks.begin(), result.ranks.end());
648 
649  // add the term frequency information
650  if (queryInfo.needTermFreqs) {
651    TermFreqData termFreqData;
652    termFreqData.tag = tag;
653    termFreqData.term = term;
654    termFreqData.stemMethod = stemMethod;
655    termFreqData.equivTerms = equivTerms;
656    termFreqData.matchDocs = termData.matchDocs;
657    termFreqData.termFreq = overallwordfreq;
658    result.termFreqs.push_back (termFreqData);
659  }
660}
661
662
663void RemoveUnwantedResults (IndexData &indexData,
664                const QueryInfo &queryInfo,
665                const FragData &termData,
666                QueryResult &result) {
667  bool needRanks = (queryInfo.sortByRank || queryInfo.needRankInfo);
668
669  mg_u_long termDataI = 0;
670  mg_u_long termDataSize = termData.fragNums.size();
671  mg_u_long levelDocNum = 0;
672 
673  mg_u_long lastLevelDocNum = 0;
674
675  mg_u_long resultI = 0;
676  mg_u_long resultSize = result.docs.size();
677  mg_u_long resultOutI = 0;
678 
679  while (termDataI < termDataSize) {
680    if (indexData.levelConverter.FragToLevel (termData.fragNums[termDataI],
681                          levelDocNum)) {
682      if (levelDocNum != lastLevelDocNum) {
683    if (lastLevelDocNum > 0) {
684      // find this document number
685      while (resultI < resultSize &&
686         result.docs[resultI] < lastLevelDocNum)
687        ++resultI;
688     
689      // store the result
690      if (resultI < resultSize && result.docs[resultI] == lastLevelDocNum) {
691        result.docs[resultOutI] = lastLevelDocNum;
692        if (needRanks)
693          result.ranks[resultOutI] = result.ranks[resultI];
694        ++resultI;
695        ++resultOutI;
696      }
697    }
698   
699    lastLevelDocNum = levelDocNum;
700      }
701    }
702   
703    ++termDataI;
704  }
705
706  if (lastLevelDocNum > 0) {
707    // find this document number
708    while (resultI < resultSize &&
709       result.docs[resultI] < lastLevelDocNum)
710      ++resultI;
711   
712    // store the result
713    if (resultI < resultSize && result.docs[resultI] == lastLevelDocNum) {
714      result.docs[resultOutI] = lastLevelDocNum;
715      if (needRanks)
716    result.ranks[resultOutI] = result.ranks[resultI];
717      ++resultI;
718      ++resultOutI;
719    }
720  }
721
722  // remove unneeded entries
723  result.docs.erase (result.docs.begin()+resultOutI, result.docs.end());
724  if (needRanks)
725    result.ranks.erase (result.ranks.begin()+resultOutI, result.ranks.end());
726  else
727    result.ranks.erase (result.ranks.begin(), result.ranks.end());
728}
729
730
731
732//--------------------------------------------------------------
733// functions to support full text browse
734
735void FindNearestWordNumber (IndexData &indexData,
736                const UCArray &term,
737                mg_u_long &number) {
738
739    // find the word number for this term
740    mg_u_long wordElNum = 0;
741    mg_u_long numLevels = indexData.bdh.num_levels;
742    word_block_dict_el wordDictEl;
743    wordDictEl.SetNumLevels (numLevels);
744    if (NearestSearchWordBlockDictEl (indexData.dictFile, indexData.biWords,
745                      indexData.bdh.entries_per_wblk,
746                      indexData.bdh.word_dict_size,
747                      numLevels, term, wordDictEl, wordElNum))
748      number = wordElNum;
749
750}
751
752void GetTermList(IndexData &indexData,
753         mg_u_long startTerm,
754         mg_u_long numTerms,
755         TermFreqArray &terms) {
756
757  word_block_dict_el_array wordBlocks; // = new word_block_dict_el_array();
758  TermFreqData termdata;
759
760  terms.erase(terms.begin(), terms.end());
761
762  SearchWordBlockDictElNumRange (indexData.dictFile, indexData.biWords,
763                 indexData.bdh.entries_per_wblk,
764                 indexData.bdh.word_dict_size,
765                 indexData.bdh.num_levels, startTerm,
766                 numTerms, wordBlocks);
767
768  word_block_dict_el_array::iterator here = wordBlocks.begin();
769  word_block_dict_el_array::iterator end = wordBlocks.end();
770
771  while (here != end) {
772    termdata.Clear();
773    termdata.term = (*here).el;
774    termdata.termFreq = (*here).freq;
775    terms.push_back(termdata);
776    ++here;
777  }
778
779}
780
781void GetTermList(IndexData &indexData,
782         mg_u_long startTerm,
783         mg_u_long numTerms,
784         UCArrayVector &terms) {
785
786 
787 
788  SearchWordBlockDictElNumRange (indexData.dictFile, indexData.biWords,
789                 indexData.bdh.entries_per_wblk,
790                 indexData.bdh.word_dict_size,
791                 indexData.bdh.num_levels, startTerm,
792                 numTerms, terms);
793
794}
Note: See TracBrowser for help on using the browser.