Changeset 8242


Ignore:
Timestamp:
2004-10-08T13:03:36+13:00 (20 years ago)
Author:
kjdon
Message:

added in Partial matching for query terms. Cna type comp* as a query, and it will find all words that begin with comp. This search can be case sensitive or insensitive. Changes made to invf.h/cpp, UCArray.h/cpp, Terms.cpp, GSDLQueryLex.h/cpp, GSDLQueryParser.cpp

Location:
trunk
Files:
16 edited

Legend:

Unmodified
Added
Removed
  • trunk/indexers/mgpp/text/GSDLQueryLex.cpp

    r6119 r8242  
    185185    return true;
    186186   
     187  } else if (c == '*') {
     188    el.lexType = StarE;
     189    AddNChar (here, el.text, charLen);
     190    return true;
     191   
    187192  } else if (c == '^') {
    188193    el.lexType = RangeE;
  • trunk/indexers/mgpp/text/GSDLQueryLex.h

    r6117 r8242  
    3030          NotOpE, NearOpE, WithinOpE, QuoteE, IntegerE, TermWeightE,
    3131          StemMethodE, RangeE, AtE, TagE, OpenSquareBracketE,
    32           CloseSquareBracketE, UnknownE};
     32          CloseSquareBracketE, StarE, UnknownE};
    3333
    3434struct LexEl {
  • trunk/indexers/mgpp/text/GSDLQueryParser.cpp

    r6129 r8242  
    178178 
    179179  termNode.stemMethod = defaultStemMethod;
    180 
     180  bool partial_match = false;
    181181  LexEl el;
    182182  UCArray::const_iterator oldHere = here;
     
    202202    } else if (el.lexType == AtE) {
    203203      termNode.startRange = termNode.endRange = ParseInt (here, end);
    204      
     204    } else if (el.lexType == StarE) {
     205      partial_match = true;
    205206    } else {
    206207      // no term modifiers
     
    209210    }
    210211   
     212    if (partial_match) {
     213      termNode.stemMethod |= 4; // set bit 2 to 1
     214      termNode.stemMethod &=0xd; // set bit 1 to 0 // we dont have stemming on if doing partial matching.
     215    }
    211216    oldHere = here;
    212217  } 
  • trunk/indexers/mgpp/text/Terms.cpp

    r3365 r8242  
    205205
    206206
    207 
    208 
    209207void FindWordNumbers (IndexData &indexData,
    210208              const UCArray &term,
     
    213211  equivWords.erase (equivWords.begin(), equivWords.end());
    214212 
    215   if (stemMethod == 0) {
     213  if (stemMethod == 0 || stemMethod==4 || stemMethod==5) {
    216214    // don't need to stem the word,
    217     // find the word number for this term
     215    // find the word number(s) for this term
    218216    unsigned long wordElNum = 0;
    219217    unsigned long numLevels = indexData.bdh.num_levels;
    220218    word_block_dict_el wordDictEl;
    221219    wordDictEl.SetNumLevels (numLevels);
    222     if (SearchWordBlockDictEl (indexData.dictFile, indexData.biWords,
    223                    indexData.bdh.entries_per_wblk,
    224                    indexData.bdh.word_dict_size,
    225                    numLevels, term, wordDictEl, wordElNum))
    226       equivWords.push_back (wordElNum);
    227    
    228     return;
    229    
    230   }
    231 
    232 
     220    if (stemMethod ==0) {
     221      if (SearchWordBlockDictEl (indexData.dictFile, indexData.biWords,
     222                 indexData.bdh.entries_per_wblk,
     223                 indexData.bdh.word_dict_size,
     224                 numLevels, term, wordDictEl, wordElNum))
     225    equivWords.push_back (wordElNum);
     226     
     227      return;
     228    } else {
     229      // partial matching,
     230      PartialMatchSearchWordBlockDictEl (indexData.dictFile, indexData.biWords, indexData.bdh.entries_per_wblk, indexData.bdh.word_dict_size, numLevels, term, wordDictEl, equivWords, (stemMethod==5?true:false) );
     231      return;
     232    }
     233  }
     234             
    233235  // need to stem this word and find it in the blocked stem index
    234236 
     
    236238  UCArray stemTerm;
    237239  unsigned long stemmerNum = 0;
    238  
    239240  if (stemMethod == 1) stemmerNum = indexData.sih1.stemmer_num;
    240241  else if (stemMethod == 2) stemmerNum = indexData.sih2.stemmer_num;
    241242  else if (stemMethod == 3) stemmerNum = indexData.sih3.stemmer_num;
    242  
    243  
     243   
    244244  // convert the word to an "mg word"
    245245  mgWord[0] = term.size();
  • trunk/indexers/mgpp/text/UCArray.cpp

    r7944 r8242  
    244244}
    245245
     246// does the first string start with the second?
     247bool StartsWith (const UCArray &a1, const UCArray &a2) {
     248  unsigned int l1 = a1.size();
     249  unsigned int l2 = a2.size();
     250  if (l2 > l1) {
     251    // if the prefix is longer than the string, it can't start with it
     252    return false;
     253  }
     254  unsigned int l =l2;
     255  UCArray::const_iterator a1Here = a1.begin();
     256  UCArray::const_iterator a2Here = a2.begin();
     257 
     258  while (l--) {
     259    if ((*a1Here != *a2Here))
     260      return false;
     261    ++a1Here;
     262    ++a2Here;
     263  }
     264  return true; // we have successfully matched the whole way
     265   
     266}
     267
     268// does the first string start with the second, ignoring case?
     269bool StartsWithCasefold(const UCArray &a1, const UCArray &a2) {
     270  unsigned int l1 = a1.size();
     271  unsigned int l2 = a2.size();
     272  if (l2 > l1) {
     273    // if the prefix is longer than the string, it can't start with it
     274    return false;
     275  }
     276  unsigned int l =l2;
     277  UCArray::const_iterator a1Here = a1.begin();
     278  UCArray::const_iterator a2Here = a2.begin();
     279 
     280  while (l--) {
     281    if (casecharmap[*a1Here] != casecharmap[*a2Here])
     282      return false;
     283    ++a1Here;
     284    ++a2Here;
     285  }
     286  return true; // we have successfully matched the whole way
     287   
     288}
     289
     290
    246291unsigned long PrefixLen (const UCArray &a1, const UCArray &a2) {
    247292  unsigned long l = (a1.size() < a2.size()) ? a1.size() : a2.size();
  • trunk/indexers/mgpp/text/UCArray.h

    r7944 r8242  
    9797int DictCompare (const UCArray &a1, const UCArray &a2);
    9898
     99// does the first string start with the second?
     100bool StartsWith(const UCArray &a1, const UCArray &a2);
     101// does the first string start with the second, ignoring case?
     102bool StartsWithCasefold(const UCArray &a1, const UCArray &a2);
     103
    99104struct LTUCArray {
    100105  bool operator()(const UCArray &a1, const UCArray &a2) const
  • trunk/indexers/mgpp/text/invf.cpp

    r3365 r8242  
    691691}
    692692
     693// ------------------------------------------
     694// functions for partial term matching
     695// ie find all words that start with xxx
     696bool PartialMatchSearchWordBlockDictEl (FILE *dictFile,
     697                 const block_idx &bIdx,
     698                 unsigned long entriesPerBlock,
     699                 unsigned long dictSize,
     700                 unsigned long numLevels,
     701                 const UCArray &el,
     702                 word_block_dict_el &dictEl,
     703                 vector<unsigned long> &elNumList,
     704                 bool casefold) {
     705
     706  UCArrayClear (dictEl.el);
     707  elNumList.erase (elNumList.begin(), elNumList.end());
     708  unsigned long elNum;
     709  // find the block that contains the element
     710  unsigned long blockIdxNum;
     711  // will this work??
     712  if (!SearchEl (bIdx, entriesPerBlock, el,
     713         blockIdxNum, elNum)) {
     714    return false;
     715  }
     716  unsigned long blockEndElNum = elNum + entriesPerBlock;
     717  if (blockEndElNum > dictSize) blockEndElNum = dictSize;
     718 
     719  bool still_looking = true;
     720  // look for the block
     721  fseek (dictFile, bIdx[blockIdxNum].block_ptr, SEEK_SET);
     722  // test each element
     723  while (elNum < blockEndElNum) {
     724    dictEl.Read (dictFile, numLevels);
     725    if (StartsWithCasefold(dictEl.el, el)) {
     726      if (casefold || StartsWith(dictEl.el, el)) {
     727    elNumList.push_back(elNum);
     728      }
     729      still_looking=false; // we have found one now, so stop the next time we don't have a match
     730    } else if (!still_looking) {
     731      // we have found a match previously, and now this doesn't match
     732      return true;
     733    } // else keep looking
     734   
     735    elNum++;
     736  }
     737  // if we get here, we are either still searching for the first
     738  // case, or we are collecting terms.
     739  if (still_looking) {
     740    //we haven't found a match yet, just check the next element,
     741    dictEl.Read (dictFile, numLevels);
     742    if (!StartsWithCasefold(dictEl.el, el)) {
     743      // the first element of the next block is not a match, so there are no matches
     744      return false;
     745    } else {
     746      if (casefold || StartsWith(dictEl.el, el)) {
     747    elNumList.push_back(elNum);
     748    elNum++;
     749      }
     750
     751    }
     752  }
     753  // just keep accumulating matches until there are no more
     754  dictEl.Read (dictFile, numLevels);
     755  while (StartsWithCasefold(dictEl.el, el)) {
     756    if (casefold || StartsWith(dictEl.el, el)) {
     757      elNumList.push_back(elNum);
     758    }
     759    elNum++;
     760    dictEl.Read (dictFile, numLevels);
     761  }
     762  return true;
     763 
     764}
     765
     766
    693767//----------------------------------------------------------------
    694768// functions for full text browse
  • trunk/indexers/mgpp/text/invf.h

    r3365 r8242  
    302302                unsigned long &elNum);
    303303
     304//-----------------------------------------------------
     305// New functions for partial matching
     306// returns a list of word numbers from the main block index whose
     307// words start with el.
     308bool PartialMatchSearchWordBlockDictEl (FILE *dictFile,
     309                 const block_idx &bIdx,
     310                 unsigned long entriesPerBlock,
     311                 unsigned long dictSize,
     312                 unsigned long numLevels,
     313                 const UCArray &el,
     314                 word_block_dict_el &dictEl,
     315                 vector<unsigned long> &elNumList,
     316                 bool casefold);
    304317//----------------------------------------------------------
    305318
  • trunk/mgpp/text/GSDLQueryLex.cpp

    r6119 r8242  
    185185    return true;
    186186   
     187  } else if (c == '*') {
     188    el.lexType = StarE;
     189    AddNChar (here, el.text, charLen);
     190    return true;
     191   
    187192  } else if (c == '^') {
    188193    el.lexType = RangeE;
  • trunk/mgpp/text/GSDLQueryLex.h

    r6117 r8242  
    3030          NotOpE, NearOpE, WithinOpE, QuoteE, IntegerE, TermWeightE,
    3131          StemMethodE, RangeE, AtE, TagE, OpenSquareBracketE,
    32           CloseSquareBracketE, UnknownE};
     32          CloseSquareBracketE, StarE, UnknownE};
    3333
    3434struct LexEl {
  • trunk/mgpp/text/GSDLQueryParser.cpp

    r6129 r8242  
    178178 
    179179  termNode.stemMethod = defaultStemMethod;
    180 
     180  bool partial_match = false;
    181181  LexEl el;
    182182  UCArray::const_iterator oldHere = here;
     
    202202    } else if (el.lexType == AtE) {
    203203      termNode.startRange = termNode.endRange = ParseInt (here, end);
    204      
     204    } else if (el.lexType == StarE) {
     205      partial_match = true;
    205206    } else {
    206207      // no term modifiers
     
    209210    }
    210211   
     212    if (partial_match) {
     213      termNode.stemMethod |= 4; // set bit 2 to 1
     214      termNode.stemMethod &=0xd; // set bit 1 to 0 // we dont have stemming on if doing partial matching.
     215    }
    211216    oldHere = here;
    212217  } 
  • trunk/mgpp/text/Terms.cpp

    r3365 r8242  
    205205
    206206
    207 
    208 
    209207void FindWordNumbers (IndexData &indexData,
    210208              const UCArray &term,
     
    213211  equivWords.erase (equivWords.begin(), equivWords.end());
    214212 
    215   if (stemMethod == 0) {
     213  if (stemMethod == 0 || stemMethod==4 || stemMethod==5) {
    216214    // don't need to stem the word,
    217     // find the word number for this term
     215    // find the word number(s) for this term
    218216    unsigned long wordElNum = 0;
    219217    unsigned long numLevels = indexData.bdh.num_levels;
    220218    word_block_dict_el wordDictEl;
    221219    wordDictEl.SetNumLevels (numLevels);
    222     if (SearchWordBlockDictEl (indexData.dictFile, indexData.biWords,
    223                    indexData.bdh.entries_per_wblk,
    224                    indexData.bdh.word_dict_size,
    225                    numLevels, term, wordDictEl, wordElNum))
    226       equivWords.push_back (wordElNum);
    227    
    228     return;
    229    
    230   }
    231 
    232 
     220    if (stemMethod ==0) {
     221      if (SearchWordBlockDictEl (indexData.dictFile, indexData.biWords,
     222                 indexData.bdh.entries_per_wblk,
     223                 indexData.bdh.word_dict_size,
     224                 numLevels, term, wordDictEl, wordElNum))
     225    equivWords.push_back (wordElNum);
     226     
     227      return;
     228    } else {
     229      // partial matching,
     230      PartialMatchSearchWordBlockDictEl (indexData.dictFile, indexData.biWords, indexData.bdh.entries_per_wblk, indexData.bdh.word_dict_size, numLevels, term, wordDictEl, equivWords, (stemMethod==5?true:false) );
     231      return;
     232    }
     233  }
     234             
    233235  // need to stem this word and find it in the blocked stem index
    234236 
     
    236238  UCArray stemTerm;
    237239  unsigned long stemmerNum = 0;
    238  
    239240  if (stemMethod == 1) stemmerNum = indexData.sih1.stemmer_num;
    240241  else if (stemMethod == 2) stemmerNum = indexData.sih2.stemmer_num;
    241242  else if (stemMethod == 3) stemmerNum = indexData.sih3.stemmer_num;
    242  
    243  
     243   
    244244  // convert the word to an "mg word"
    245245  mgWord[0] = term.size();
  • trunk/mgpp/text/UCArray.cpp

    r7944 r8242  
    244244}
    245245
     246// does the first string start with the second?
     247bool StartsWith (const UCArray &a1, const UCArray &a2) {
     248  unsigned int l1 = a1.size();
     249  unsigned int l2 = a2.size();
     250  if (l2 > l1) {
     251    // if the prefix is longer than the string, it can't start with it
     252    return false;
     253  }
     254  unsigned int l =l2;
     255  UCArray::const_iterator a1Here = a1.begin();
     256  UCArray::const_iterator a2Here = a2.begin();
     257 
     258  while (l--) {
     259    if ((*a1Here != *a2Here))
     260      return false;
     261    ++a1Here;
     262    ++a2Here;
     263  }
     264  return true; // we have successfully matched the whole way
     265   
     266}
     267
     268// does the first string start with the second, ignoring case?
     269bool StartsWithCasefold(const UCArray &a1, const UCArray &a2) {
     270  unsigned int l1 = a1.size();
     271  unsigned int l2 = a2.size();
     272  if (l2 > l1) {
     273    // if the prefix is longer than the string, it can't start with it
     274    return false;
     275  }
     276  unsigned int l =l2;
     277  UCArray::const_iterator a1Here = a1.begin();
     278  UCArray::const_iterator a2Here = a2.begin();
     279 
     280  while (l--) {
     281    if (casecharmap[*a1Here] != casecharmap[*a2Here])
     282      return false;
     283    ++a1Here;
     284    ++a2Here;
     285  }
     286  return true; // we have successfully matched the whole way
     287   
     288}
     289
     290
    246291unsigned long PrefixLen (const UCArray &a1, const UCArray &a2) {
    247292  unsigned long l = (a1.size() < a2.size()) ? a1.size() : a2.size();
  • trunk/mgpp/text/UCArray.h

    r7944 r8242  
    9797int DictCompare (const UCArray &a1, const UCArray &a2);
    9898
     99// does the first string start with the second?
     100bool StartsWith(const UCArray &a1, const UCArray &a2);
     101// does the first string start with the second, ignoring case?
     102bool StartsWithCasefold(const UCArray &a1, const UCArray &a2);
     103
    99104struct LTUCArray {
    100105  bool operator()(const UCArray &a1, const UCArray &a2) const
  • trunk/mgpp/text/invf.cpp

    r3365 r8242  
    691691}
    692692
     693// ------------------------------------------
     694// functions for partial term matching
     695// ie find all words that start with xxx
     696bool PartialMatchSearchWordBlockDictEl (FILE *dictFile,
     697                 const block_idx &bIdx,
     698                 unsigned long entriesPerBlock,
     699                 unsigned long dictSize,
     700                 unsigned long numLevels,
     701                 const UCArray &el,
     702                 word_block_dict_el &dictEl,
     703                 vector<unsigned long> &elNumList,
     704                 bool casefold) {
     705
     706  UCArrayClear (dictEl.el);
     707  elNumList.erase (elNumList.begin(), elNumList.end());
     708  unsigned long elNum;
     709  // find the block that contains the element
     710  unsigned long blockIdxNum;
     711  // will this work??
     712  if (!SearchEl (bIdx, entriesPerBlock, el,
     713         blockIdxNum, elNum)) {
     714    return false;
     715  }
     716  unsigned long blockEndElNum = elNum + entriesPerBlock;
     717  if (blockEndElNum > dictSize) blockEndElNum = dictSize;
     718 
     719  bool still_looking = true;
     720  // look for the block
     721  fseek (dictFile, bIdx[blockIdxNum].block_ptr, SEEK_SET);
     722  // test each element
     723  while (elNum < blockEndElNum) {
     724    dictEl.Read (dictFile, numLevels);
     725    if (StartsWithCasefold(dictEl.el, el)) {
     726      if (casefold || StartsWith(dictEl.el, el)) {
     727    elNumList.push_back(elNum);
     728      }
     729      still_looking=false; // we have found one now, so stop the next time we don't have a match
     730    } else if (!still_looking) {
     731      // we have found a match previously, and now this doesn't match
     732      return true;
     733    } // else keep looking
     734   
     735    elNum++;
     736  }
     737  // if we get here, we are either still searching for the first
     738  // case, or we are collecting terms.
     739  if (still_looking) {
     740    //we haven't found a match yet, just check the next element,
     741    dictEl.Read (dictFile, numLevels);
     742    if (!StartsWithCasefold(dictEl.el, el)) {
     743      // the first element of the next block is not a match, so there are no matches
     744      return false;
     745    } else {
     746      if (casefold || StartsWith(dictEl.el, el)) {
     747    elNumList.push_back(elNum);
     748    elNum++;
     749      }
     750
     751    }
     752  }
     753  // just keep accumulating matches until there are no more
     754  dictEl.Read (dictFile, numLevels);
     755  while (StartsWithCasefold(dictEl.el, el)) {
     756    if (casefold || StartsWith(dictEl.el, el)) {
     757      elNumList.push_back(elNum);
     758    }
     759    elNum++;
     760    dictEl.Read (dictFile, numLevels);
     761  }
     762  return true;
     763 
     764}
     765
     766
    693767//----------------------------------------------------------------
    694768// functions for full text browse
  • trunk/mgpp/text/invf.h

    r3365 r8242  
    302302                unsigned long &elNum);
    303303
     304//-----------------------------------------------------
     305// New functions for partial matching
     306// returns a list of word numbers from the main block index whose
     307// words start with el.
     308bool PartialMatchSearchWordBlockDictEl (FILE *dictFile,
     309                 const block_idx &bIdx,
     310                 unsigned long entriesPerBlock,
     311                 unsigned long dictSize,
     312                 unsigned long numLevels,
     313                 const UCArray &el,
     314                 word_block_dict_el &dictEl,
     315                 vector<unsigned long> &elNumList,
     316                 bool casefold);
    304317//----------------------------------------------------------
    305318
Note: See TracChangeset for help on using the changeset viewer.