1  /**************************************************************************


2  *


3  * Terms.cpp -- Query related functions


4  * Copyright (C) 1999 Rodger McNab


5  *


6  * This program is free software; you can redistribute it and/or modify


7  * it under the terms of the GNU General Public License as published by


8  * the Free Software Foundation; either version 2 of the License, or


9  * (at your option) any later version.


10  *


11  * This program is distributed in the hope that it will be useful,


12  * but WITHOUT ANY WARRANTY; without even the implied warranty of


13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the


14  * GNU General Public License for more details.


15  *


16  * You should have received a copy of the GNU General Public License


17  * along with this program; if not, write to the Free Software


18  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.


19  *


20  * $Id: Terms.cpp 1847 20010122 01:47:56Z kjm18 $


21  *


22  **************************************************************************/


23 


#include "Terms.h"
#include "words.h"
#include "stemmer.h"
#include "bitio_gen.h"
#include "bitio_m_stdio.h"

#include <string.h>


29 


30  void QueryInfo::Clear () {


31  UCArrayClear (docLevel);


32  maxDocs = 0;


33  sortByRank = true;


34  exactWeights = false;


35  needRankInfo = false;


36  needTermFreqs = false;


37  }


38 


39 


40 


41  void TermFreqData::Clear () {


42  UCArrayClear (tag);


43  UCArrayClear (term);


44  equivTerms.erase(equivTerms.begin(), equivTerms.end());


45  stemMethod = 0;


46  matchDocs = 0;


47  termFreq = 0;


48  }


49 


50  ostream &operator<< (ostream &s, const TermFreqData &t) {


51  s << "<" << t.tag << ">\"" << t.term << "\"stem("


52  << t.stemMethod << ")equiv terms(";


53 


54  unsigned long i;


55  for (i=0; i<t.equivTerms.size(); i++) {


56  s << t.equivTerms[i] << ", ";


57  }


58  s <<")docs(" << t.matchDocs << ")"


59  << "count("<<t.termFreq<<")";


60  return s;


61  }


62 


63  bool operator== (const TermFreqData &t1, const TermFreqData &t2) {


64  return ((t1.tag == t2.tag) &&


65  (t1.term == t2.term) &&


66  (t1.stemMethod == t2.stemMethod) &&


67  (t1.equivTerms == t2.equivTerms) &&


68  (t1.matchDocs == t2.matchDocs) &&


69  (t1.termFreq == t2.termFreq));


70  }


71 


72 


73  void QueryResult::Clear () {


74  docs.erase (docs.begin(), docs.end());


75  ranks.erase (ranks.begin(), ranks.end());


76  termFreqs.erase (termFreqs.begin(), termFreqs.end());


77  actualNumDocs = 0;


78  }


79 


// Construct an empty result set (delegates to Clear so the empty state
// is defined in exactly one place).
QueryResult::QueryResult () {
  Clear ();
}


83 


84  void QueryResult::printShort(ostream &s) {


85 


86  s << "termFreqs: ";


87  for (unsigned long i=0; i<termFreqs.size(); i++)


88  s << termFreqs[i] << ", ";


89 


90  s << "\nactual number of docs found: " << actualNumDocs;


91  s << "\n\n";


92 


93  }


94 


95 


96  ostream &operator<< (ostream &s, const QueryResult &r) {


97  s << "docs: ";


98  unsigned long i;


99  for (i=0; i<r.docs.size(); i++)


100  s << r.docs[i] << ", ";


101 


102  s << "\nranks: ";


103  for (i=0; i<r.ranks.size(); i++)


104  s << r.ranks[i] << ", ";


105 


106  s << "\ntermFreqs: ";


107  for (i=0; i<r.termFreqs.size(); i++)


108  s << r.termFreqs[i] << ", ";


109 


110  s << "\nactual number of docs found: " << r.actualNumDocs;


111  s << "\n\n";


112 


113  return s;


114  }


115 


116 


117  bool operator== (const QueryResult &r1, const QueryResult &r2) {


118  return ((r1.docs == r2.docs) &&


119  (r1.ranks == r2.ranks) &&


120  (r1.termFreqs == r2.termFreqs) &&


121  (r1.actualNumDocs == r2.actualNumDocs));


122  }


123 


124  //


125  // new ExtQueryResult stuff


126  void ExtQueryResult::Clear () {


127  docs.erase (docs.begin(), docs.end());


128  levels.erase (levels.begin(), levels.end());


129  ranks.erase (ranks.begin(), ranks.end());


130  termFreqs.erase (termFreqs.begin(), termFreqs.end());


131  actualNumDocs = 0;


132  }


133 


// Construct an empty extended result set (delegates to Clear).
ExtQueryResult::ExtQueryResult () {
  Clear ();
}


137 


138  ostream &operator<< (ostream &s, const ExtQueryResult &r) {


139  s << "docs: ";


140  unsigned long i;


141  for (i=0; i<r.docs.size(); i++)


142  s << r.docs[i] << ", ";


143 


144  s << "\nlevels: ";


145  for (i=0; i<r.levels.size(); i++)


146  s << r.levels[i] << ", ";


147 


148 


149  s << "\nranks: ";


150  for (i=0; i<r.ranks.size(); i++)


151  s << r.ranks[i] << ", ";


152 


153  s << "\ntermFreqs: ";


154  for (i=0; i<r.termFreqs.size(); i++)


155  s << r.termFreqs[i] << ", ";


156  s << "\nactual number of docs found: " << r.actualNumDocs;


157  s << "\n\n";


158 


159  return s;


160  }


161 


162 


163  bool operator== (const ExtQueryResult &r1, const ExtQueryResult &r2) {


164  return ((r1.docs == r2.docs) &&


165  (r1.levels == r2.levels) &&


166  (r1.ranks == r2.ranks) &&


167  (r1.termFreqs == r2.termFreqs) &&


168  (r1.actualNumDocs == r2.actualNumDocs));


169  }


170 


171  //


172  // new BrowseQueryResult stuff


173  void BrowseQueryResult::Clear () {


174  termFreqs.erase (termFreqs.begin(), termFreqs.end());


175  }


176 


// Construct an empty browse result (delegates to Clear).
BrowseQueryResult::BrowseQueryResult () {
  Clear ();
}


180 


181 


182 


183  ostream &operator<< (ostream &s, const BrowseQueryResult &r) {


184  s << "terms: ";


185  unsigned long i;


186  for (i=0; i<r.termFreqs.size(); i++)


187  s << r.termFreqs[i] << ", ";


188  s << "\n\n";


189  return s;


190  }


191 


192 


193  bool operator== (const BrowseQueryResult &r1, const BrowseQueryResult &r2) {


194  return ((r1.termFreqs == r2.termFreqs));


195 


196  }


197 


198 


199 


200 


201  //


202  void FragData::Clear () {


203  matchDocs = 0;


204  fragNums.erase (fragNums.begin(), fragNums.end());


205  fragFreqs.erase (fragFreqs.begin(), fragFreqs.end());


206  }


207 


208 


209 


210 


211  void FindWordNumbers (IndexData &indexData,


212  const UCArray &term,


213  unsigned long stemMethod,


214  vector<unsigned long> &equivWords) {


215  equivWords.erase (equivWords.begin(), equivWords.end());


216 


217  if (stemMethod == 0) {


218  // don't need to stem the word,


219  // find the word number for this term


220  unsigned long wordElNum = 0;


221  unsigned long numLevels = indexData.bdh.num_levels;


222  word_block_dict_el wordDictEl;


223  wordDictEl.SetNumLevels (numLevels);


224  if (SearchWordBlockDictEl (indexData.dictFile, indexData.biWords,


225  indexData.bdh.entries_per_wblk,


226  indexData.bdh.word_dict_size,


227  numLevels, term, wordDictEl, wordElNum))


228  equivWords.push_back (wordElNum);


229 


230  return;


231 


232  }


233 


234 


235  // need to stem this word and find it in the blocked stem index


236 


237  unsigned char mgWord[MAXSTEMLEN + 1];


238  UCArray stemTerm;


239  unsigned long stemmerNum = 0;


240 


241  if (stemMethod == 1) stemmerNum = indexData.sih1.stemmer_num;


242  else if (stemMethod == 2) stemmerNum = indexData.sih2.stemmer_num;


243  else if (stemMethod == 3) stemmerNum = indexData.sih3.stemmer_num;


244 


245 


246  // convert the word to an "mg word"


247  mgWord[0] = term.size();


248  bcopy ((char *)term.begin(), (char *)&mgWord[1], term.size());


249 


250  // stem the word


251  stemmer (stemMethod, stemmerNum, mgWord);


252 


253  // convert the result back to a UCArray


254  stemTerm.insert (stemTerm.end(), &mgWord[1], &mgWord[1] + mgWord[0]);


255 


256  // need to look up this term in the appropriate dictionary


257  stem_block_dict_el stemDictEl;


258  unsigned long stemElNum;


259  bool result = false;


260  if (stemMethod == 1) {


261  result = SearchStemBlockDictEl (indexData.stem1File,


262  indexData.sii1,


263  indexData.sih1.entries_per_block,


264  indexData.sih1.dict_size,


265  stemTerm,


266  stemDictEl,


267  stemElNum);


268 


269  } else if (stemMethod == 2) {


270  result = SearchStemBlockDictEl (indexData.stem2File,


271  indexData.sii2,


272  indexData.sih2.entries_per_block,


273  indexData.sih2.dict_size,


274  stemTerm,


275  stemDictEl,


276  stemElNum);


277 


278  } else if (stemMethod == 3) {


279  result = SearchStemBlockDictEl (indexData.stem3File,


280  indexData.sii3,


281  indexData.sih3.entries_per_block,


282  indexData.sih3.dict_size,


283  stemTerm,


284  stemDictEl,


285  stemElNum);


286  }


287 


288  if (result) {


289  equivWords = stemDictEl.equivWords;


290  }


291  }


292 


293 


294 


295  void ReadTermFragData (IndexData &indexData,


296  bool needFragFreqs,


297  unsigned long termNum,


298  FragData &fragData,


299  FragRangeArray *fragLimits,


300  UCArray & termWord) {


301  fragData.Clear();


302 


303  // look up the word in the dictionary


304  unsigned long numLevels = indexData.bdh.num_levels;


305  word_block_dict_el wordDictEl;


306  wordDictEl.SetNumLevels (numLevels);


307  if (!SearchWordBlockDictElNum (indexData.dictFile,


308  indexData.biWords,


309  indexData.bdh.entries_per_wblk,


310  indexData.bdh.word_dict_size,


311  numLevels,


312  termNum, wordDictEl))


313  return; // nothing more to do


314 


315  fragData.matchDocs = wordDictEl.levelFreqs[indexData.curLevelNum];


316  termWord = wordDictEl.el;


317  // seek to the appropriate place in the inverted file


318  fseek (indexData.invfFile, wordDictEl.invf_ptr, SEEK_SET);


319  stdio_bitio_buffer buffer (indexData.invfFile);


320 


321  unsigned long B = BIO_Bblock_Init (indexData.bdh.num_frags,


322  wordDictEl.frag_occur);


323  unsigned long fragNum = 0;


324  unsigned long termFreq = 0;


325 


326  unsigned long fragLimitI = 0;


327  unsigned long i;


328  for (i=0; i<wordDictEl.frag_occur; i++) {


329  fragNum += buffer.bblock_decode (B, NULL);


330  if (!indexData.ifh.word_level_index) termFreq = buffer.gamma_decode (NULL);


331  else termFreq = 1;


332 


333  // get the right fragment range


334  if (fragLimits != NULL) {


335  while (fragLimitI+1 < (*fragLimits).size() &&


336  fragNum > (*fragLimits)[fragLimitI+1].rangeStart) {


337  fragLimitI++;


338  }


339  }


340 


341  // add the entry if it is within the limits


342  if ((fragLimits == NULL) 


343  (fragLimitI < (*fragLimits).size() &&


344  fragNum > (*fragLimits)[fragLimitI].rangeStart &&


345  fragNum <= (*fragLimits)[fragLimitI].rangeEnd)) {


346  fragData.fragNums.push_back (fragNum);


347  if (needFragFreqs)


348  fragData.fragFreqs.push_back (termFreq);


349  }


350  }


351 


352  buffer.done();


353  }


354 


355 


356  void CombineFragData (bool needFragFreqs,


357  const FragData &f1,


358  const FragData &f2,


359  FragData &outFragData) {


360  outFragData.Clear();


361 


362  // the new number of matching documents is the maximum


363  // of the two input matching number of documents  it


364  // is assumed that these are at the same document level


365  outFragData.matchDocs = (f1.matchDocs > f2.matchDocs) ?


366  f1.matchDocs : f2.matchDocs;


367 


368  // do or


369  unsigned long f1I = 0, f1Size = f1.fragNums.size();


370  unsigned long f2I = 0, f2Size = f2.fragNums.size();


371  while (f1I < f1Size  f2I < f2Size) {


372  if (f2I < f2Size &&


373  (f1I >= f1Size 


374  f1.fragNums[f1I] > f2.fragNums[f2I])) {


375  // output f2I


376  outFragData.fragNums.push_back (f2.fragNums[f2I]);


377  if (needFragFreqs)


378  outFragData.fragFreqs.push_back (f2.fragFreqs[f2I]);


379  f2I++;


380 


381  } else if (f1I < f1Size &&


382  (f2I >= f2Size 


383  f1.fragNums[f1I] < f2.fragNums[f2I])) {


384  // output f1I


385  outFragData.fragNums.push_back (f1.fragNums[f1I]);


386  if (needFragFreqs)


387  outFragData.fragFreqs.push_back (f1.fragFreqs[f1I]);


388  f1I++;


389 


390  } else {


391  // must be equal combine f1I and f2I


392  outFragData.fragNums.push_back (f1.fragNums[f1I]);


393  if (needFragFreqs)


394  outFragData.fragFreqs.push_back (f1.fragFreqs[f1I]+f2.fragFreqs[f2I]);


395  f1I++;


396  f2I++;


397  }


398  }


399  }


400 


401 


402  void AndCombineFragData (bool needFragFreqs,


403  FragData &fragData,


404  const FragData &comFragData,


405  signed long startRange,


406  signed long endRange,


407  const FragRangeArray *fragLimits) {


408  // sanity check on range


409  if (startRange > endRange) {


410  signed long temp = endRange;


411  endRange = startRange;


412  startRange = temp;


413  }


414 


415  // get min matchdocs


416  if (comFragData.matchDocs < fragData.matchDocs)


417  fragData.matchDocs = comFragData.matchDocs;


418 


419  unsigned long fragDataI = 0;


420  unsigned long fragDataSize = fragData.fragNums.size();


421  unsigned long comFragDataI = 0;


422  unsigned long comFragDataSize = comFragData.fragNums.size();


423  unsigned long fragLimitI = 0;


424  unsigned long fragLimitSize = (fragLimits==NULL) ? 0 : (*fragLimits).size();


425  unsigned long outI = 0;


426 


427  while (fragDataI < fragDataSize &&


428  comFragDataI < comFragDataSize) {


429  signed long fragNum = (signed long)fragData.fragNums[fragDataI];


430  signed long comFragNum = (signed long)comFragData.fragNums[comFragDataI];


431 


432  // go to the right fragment limit (for the com frag)


433  if (fragLimits != NULL) {


434  while (fragLimitI+1 < fragLimitSize &&


435  comFragNum > (signed long)(*fragLimits)[fragLimitI+1].rangeStart) {


436  fragLimitI++;


437  }


438  }


439 


440  if (fragNum <= comFragNum+startRange 


441  (fragLimits!=NULL &&


442  fragNum<=(signed long)(*fragLimits)[fragLimitI].rangeStart)) {


443  fragDataI++;


444 


445  } else if (fragNum > comFragNum+endRange 


446  (fragLimits!=NULL &&


447  fragNum>(signed long)(*fragLimits)[fragLimitI].rangeEnd)) {


448  comFragDataI++;


449 


450  } else {


451  // equal and within tag


452  fragData.fragNums[outI] = comFragNum;


453  if (needFragFreqs) {


454  fragData.fragFreqs[outI] =


455  (fragData.fragFreqs[fragDataI] < comFragData.fragFreqs[comFragDataI]) ?


456  fragData.fragFreqs[fragDataI] : comFragData.fragFreqs[comFragDataI];


457  }


458  fragDataI++;


459  comFragDataI++;


460  outI++;


461  }


462  }


463 


464  // erase unused part of fragData


465  fragData.fragNums.erase (fragData.fragNums.begin()+outI,


466  fragData.fragNums.end());


467  if (needFragFreqs)


468  fragData.fragFreqs.erase (fragData.fragFreqs.begin()+outI,


469  fragData.fragFreqs.end());


470  else


471  fragData.fragFreqs.erase (fragData.fragFreqs.begin(),


472  fragData.fragFreqs.end());


473  }


474 


475 


476  void FragsToQueryResult (IndexData &indexData,


477  const QueryInfo &queryInfo,


478  const FragData &termData,


479  const UCArray &tag,


480  const UCArray &term,


481  unsigned long stemMethod,


482  unsigned long termWeight,


483  UCArrayVector &equivTerms,


484  QueryResult &result) {


485  bool needRanks = (queryInfo.sortByRank  queryInfo.needRankInfo);


486 


487  result.Clear();


488 


489  // log (N / ft)


490  unsigned long N = indexData.levels.levelInfo[indexData.curLevel].numEntries;


491  float wordLog = log((double)N / (double)termData.matchDocs);


492 


493  // Wqt = fqt * log (N / ft)


494  // note: terms are allowed to have a weight of zero so


495  // they can be excluded from the ranking


496  float Wqt = termWeight * wordLog;


497 


498  // Wdt = fdt * log (N / ft)


499  float Wdt;


500 


501  unsigned long termDataI = 0;


502  unsigned long termDataSize = termData.fragNums.size();


503  unsigned long levelDocNum = 0;


504 


505  unsigned long termDocFreq = 0;


506  unsigned long lastLevelDocNum = 0;


507  unsigned long overallwordfreq = 0;


508 


509  while (termDataI < termDataSize) {


510  if (indexData.levelConverter.FragToLevel (termData.fragNums[termDataI],


511  levelDocNum)) {


512  if (levelDocNum != lastLevelDocNum) {


513  if (lastLevelDocNum > 0) {


514  // add this doc information


515  if (needRanks) {


516  Wdt = termDocFreq * wordLog;


517  result.ranks.push_back (Wqt * Wdt);


518  }


519  result.docs.push_back (lastLevelDocNum);


520  }


521 


522  lastLevelDocNum = levelDocNum;


523  termDocFreq = 0;


524  }


525 


526  if (needRanks){


527  termDocFreq += termData.fragFreqs[termDataI];


528  overallwordfreq += termData.fragFreqs[termDataI];


529  }


530  }


531  termDataI++;


532  }


533 


534  if (lastLevelDocNum > 0) {


535  // add the last document information


536  if (needRanks) {


537  Wdt = termDocFreq * wordLog;


538  result.ranks.push_back (Wqt * Wdt);


539  }


540  result.docs.push_back (lastLevelDocNum);


541  }


542 


543  // add the term frequency information


544  if (queryInfo.needTermFreqs) {


545  TermFreqData termFreqData;


546  termFreqData.tag = tag;


547  termFreqData.term = term;


548  termFreqData.stemMethod = stemMethod;


549  termFreqData.equivTerms = equivTerms;


550  termFreqData.matchDocs = termData.matchDocs;


551  termFreqData.termFreq = overallwordfreq; // will be zero if needRankInfo


552  //not true


553  result.termFreqs.push_back (termFreqData);


554  }


555  }


556 


557  void AndFragsToQueryResult (IndexData &indexData,


558  const QueryInfo &queryInfo,


559  const FragData &termData,


560  const UCArray &tag,


561  const UCArray &term,


562  unsigned long stemMethod,


563  unsigned long termWeight,


564  UCArrayVector &equivTerms,


565  QueryResult &result) {


566  bool needRanks = (queryInfo.sortByRank  queryInfo.needRankInfo);


567 


568  // log (N / ft)


569  float wordLog =


570  log((double)indexData.levels.levelInfo[indexData.curLevel].numEntries/


571  (double)termData.matchDocs);


572 


573  // Wqt = fqt * log (N / ft)


574  // note: terms are allowed to have a weight of zero so


575  // they can be excluded from the ranking


576  float Wqt = termWeight * wordLog;


577 


578  // Wdt = fdt * log (N / ft)


579  float Wdt;


580 


581  unsigned long termDataI = 0;


582  unsigned long termDataSize = termData.fragNums.size();


583  unsigned long levelDocNum = 0;


584 


585  unsigned long termDocFreq = 0;


586  unsigned long lastLevelDocNum = 0;


587  unsigned long overallwordfreq = 0;


588  unsigned long resultI = 0;


589  unsigned long resultSize = result.docs.size();


590  unsigned long resultOutI = 0;


591 


592 


593  while (termDataI < termDataSize) {


594  if (indexData.levelConverter.FragToLevel (termData.fragNums[termDataI],


595  levelDocNum)) {


596  if (levelDocNum != lastLevelDocNum) {


597  if (lastLevelDocNum > 0) {


598  // add this doc information


599  Wdt = termDocFreq * wordLog;


600 


601  // find this document number


602  while (resultI < resultSize &&


603  result.docs[resultI] < lastLevelDocNum)


604  resultI++;


605 


606  // store the result


607  if (resultI < resultSize && result.docs[resultI] == lastLevelDocNum) {


608  result.docs[resultOutI] = lastLevelDocNum;


609  if (needRanks)


610  result.ranks[resultOutI] = result.ranks[resultI] + Wqt * Wdt;


611  resultI++;


612  resultOutI++;


613  }


614  }


615 


616  lastLevelDocNum = levelDocNum;


617  termDocFreq = 0;


618  }


619 


620  if (needRanks)


621  termDocFreq += termData.fragFreqs[termDataI];


622  overallwordfreq += termData.fragFreqs[termDataI];


623  }


624 


625  termDataI++;


626  } // while


627 


628  if (lastLevelDocNum > 0) {


629  // add the last document information


630  Wdt = termDocFreq * wordLog;


631 


632  // find this document number


633  while (resultI < resultSize &&


634  result.docs[resultI] < lastLevelDocNum)


635  resultI++;


636 


637  // store the result


638  if (resultI < resultSize && result.docs[resultI] == lastLevelDocNum) {


639  result.docs[resultOutI] = lastLevelDocNum;


640  if (needRanks)


641  result.ranks[resultOutI] = result.ranks[resultI] + Wqt * Wdt;


642  resultI++;


643  resultOutI++;


644  }


645  }


646 


647  // remove unneeded entries


648  result.docs.erase (result.docs.begin()+resultOutI, result.docs.end());


649  if (needRanks)


650  result.ranks.erase (result.ranks.begin()+resultOutI, result.ranks.end());


651  else


652  result.ranks.erase (result.ranks.begin(), result.ranks.end());


653 


654  // add the term frequency information


655  if (queryInfo.needTermFreqs) {


656  TermFreqData termFreqData;


657  termFreqData.tag = tag;


658  termFreqData.term = term;


659  termFreqData.stemMethod = stemMethod;


660  termFreqData.equivTerms = equivTerms;


661  termFreqData.matchDocs = termData.matchDocs;


662  termFreqData.termFreq = overallwordfreq;


663  result.termFreqs.push_back (termFreqData);


664  }


665  }


666 


667 


668  void RemoveUnwantedResults (IndexData &indexData,


669  const QueryInfo &queryInfo,


670  const FragData &termData,


671  QueryResult &result) {


672  bool needRanks = (queryInfo.sortByRank  queryInfo.needRankInfo);


673 


674  unsigned long termDataI = 0;


675  unsigned long termDataSize = termData.fragNums.size();


676  unsigned long levelDocNum = 0;


677 


678  unsigned long lastLevelDocNum = 0;


679 


680  unsigned long resultI = 0;


681  unsigned long resultSize = result.docs.size();


682  unsigned long resultOutI = 0;


683 


684  while (termDataI < termDataSize) {


685  if (indexData.levelConverter.FragToLevel (termData.fragNums[termDataI],


686  levelDocNum)) {


687  if (levelDocNum != lastLevelDocNum) {


688  if (lastLevelDocNum > 0) {


689  // find this document number


690  while (resultI < resultSize &&


691  result.docs[resultI] < lastLevelDocNum)


692  resultI++;


693 


694  // store the result


695  if (resultI < resultSize && result.docs[resultI] == lastLevelDocNum) {


696  result.docs[resultOutI] = lastLevelDocNum;


697  if (needRanks)


698  result.ranks[resultOutI] = result.ranks[resultI];


699  resultI++;


700  resultOutI++;


701  }


702  }


703 


704  lastLevelDocNum = levelDocNum;


705  }


706  }


707 


708  termDataI++;


709  }


710 


711  if (lastLevelDocNum > 0) {


712  // find this document number


713  while (resultI < resultSize &&


714  result.docs[resultI] < lastLevelDocNum)


715  resultI++;


716 


717  // store the result


718  if (resultI < resultSize && result.docs[resultI] == lastLevelDocNum) {


719  result.docs[resultOutI] = lastLevelDocNum;


720  if (needRanks)


721  result.ranks[resultOutI] = result.ranks[resultI];


722  resultI++;


723  resultOutI++;


724  }


725  }


726 


727  // remove unneeded entries


728  result.docs.erase (result.docs.begin()+resultOutI, result.docs.end());


729  if (needRanks)


730  result.ranks.erase (result.ranks.begin()+resultOutI, result.ranks.end());


731  else


732  result.ranks.erase (result.ranks.begin(), result.ranks.end());


733  }


734 


735 


736 


737  //


738  // functions to support full text browse


739 


740  void FindNearestWordNumber (IndexData &indexData,


741  const UCArray &term,


742  unsigned long &number) {


743 


744  // find the word number for this term


745  unsigned long wordElNum = 0;


746  unsigned long numLevels = indexData.bdh.num_levels;


747  word_block_dict_el wordDictEl;


748  wordDictEl.SetNumLevels (numLevels);


749  if (NearestSearchWordBlockDictEl (indexData.dictFile, indexData.biWords,


750  indexData.bdh.entries_per_wblk,


751  indexData.bdh.word_dict_size,


752  numLevels, term, wordDictEl, wordElNum))


753  number = wordElNum;


754 


755  }


756 


757  void GetTermList(IndexData &indexData,


758  unsigned long startTerm,


759  unsigned long numTerms,


760  TermFreqArray &terms) {


761 


762  word_block_dict_el_array wordBlocks; // = new word_block_dict_el_array();


763  TermFreqData termdata;


764 


765  terms.erase(terms.begin(), terms.end());


766 


767  SearchWordBlockDictElNumRange (indexData.dictFile, indexData.biWords,


768  indexData.bdh.entries_per_wblk,


769  indexData.bdh.word_dict_size,


770  indexData.bdh.num_levels, startTerm,


771  numTerms, wordBlocks);


772 


773  word_block_dict_el_array::iterator here = wordBlocks.begin();


774  word_block_dict_el_array::iterator end = wordBlocks.end();


775 


776  while (here != end) {


777  termdata.Clear();


778  termdata.term = (*here).el;


779  termdata.termFreq = (*here).freq;


780  terms.push_back(termdata);


781  here++;


782  }


783 


784  }


785 


// Full-text browse support: fetch the raw term strings for "numTerms"
// consecutive word numbers starting at "startTerm".
// NOTE(review): unlike the TermFreqArray overload above, this variant
// does not clear "terms" first -- presumably the dictionary search
// appends/overwrites; confirm against SearchWordBlockDictElNumRange.
void GetTermList(IndexData &indexData,
		 unsigned long startTerm,
		 unsigned long numTerms,
		 UCArrayVector &terms) {

  // delegate directly to the blocked-dictionary range search
  SearchWordBlockDictElNumRange (indexData.dictFile, indexData.biWords,
				 indexData.bdh.entries_per_wblk,
				 indexData.bdh.word_dict_size,
				 indexData.bdh.num_levels, startTerm,
				 numTerms, terms);
}


800 


801 


802 

