Changeset 1300


Ignore:
Timestamp:
2000-07-24T14:46:11+12:00 (24 years ago)
Author:
kjm18
Message:

added full text browsing functionality

Location:
trunk/gsdl/src/mgpp/text
Files:
6 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/src/mgpp/text/MGQuery.cpp

    r927 r1300  
    572572
    573573
     574void BrowseQueryNode::Clear () {
     575  UCArrayClear(term);
     576}
     577
     578void BrowseQueryNode::Calculate (IndexData &indexData, BrowseQueryResult &result) const {
     579
     580  unsigned long number=0;
     581  FindNearestWordNumber(indexData, term, number);
     582  if (number + startPosition > 0 ) {
     583    number = number+startPosition;
     584  }
     585  else {
     586    number = 1;
     587  }
     588
     589  GetTermList (indexData, number, numTerms, result.termFreqs);
     590
     591}
     592
     593
     594
     595void BrowseQueryNode::Free () {
     596  Clear();
     597}
     598
     599
     600void BrowseQueryNode::Print (ostream &s, int indent) const {
     601  PrintIndentText(s, "BROWSEQUERYNODE\n", indent);
     602  PrintIndent (s, indent+2);
     603  s << "TERM:"<<term<<"\n";
     604  PrintIndent (s, indent+2);
     605  s << "Start position: "<< startPosition<<", Num terms: "<< numTerms<<"\n";
     606
     607
     608}
     609
     610
     611
    574612void MGQuery (IndexData &indexData,
    575613          const QueryInfo &queryInfo,
     
    693731
    694732
    695 
     733// new function for full text browsing,
     734void MGBrowseQuery (IndexData &indexData, UCArray &level,
     735          const BrowseQueryNode &node,
     736          BrowseQueryResult &result) {
     737
     738  indexData.LoadLevel(level);
     739  node.Calculate(indexData, result);
     740
     741}
     742
     743
     744
     745
     746
     747
     748
  • trunk/gsdl/src/mgpp/text/MGQuery.h

    r927 r1300  
    119119then 'the' has to be at position between -2 and -1 relative to 'cat'.
    120120"the cat" could also be searched for by 'cat' with no range limits, then 'the' with range 0 to 1.
     121range values are relative to the gaps between words:
     122   x   y   z   X   a   b   c
     123    -3   -2  -1  0   1   2   3
     124
    121125 */
    122126class TermNode {
     
    157161};
    158162
     163class BrowseQueryNode :public QueryNode {
     164 public:
     165  UCArray term;
     166  signed long startPosition;
     167  unsigned long numTerms;
     168
     169  void Clear();
     170  BrowseQueryNode () { Clear(); }
     171  //  ~BrowseQueryNode ();
     172
     173  void Calculate (IndexData &indexData, BrowseQueryResult &result) const;
     174  void Free ();
     175  void Print (ostream &s, int indent=0) const;
     176
     177
     178
     179
     180};
    159181
    160182void MGQuery (IndexData &indexData,
     
    163185          QueryResult &result);
    164186
    165 // this function for retriving results with both section doc nums
     187// this function for retrieving results with both section doc nums
    166188// and Document docnums
    167189void MGQuery (IndexData &indexData,
     
    171193
    172194
     195// new function for full text browsing,
     196void MGBrowseQuery (IndexData &indexData, UCArray &level,
     197          const BrowseQueryNode &node,
     198          BrowseQueryResult &result);
     199
    173200#endif
     201
     202
     203
     204
     205
     206
     207
     208
     209
     210
  • trunk/gsdl/src/mgpp/text/Terms.cpp

    r1124 r1300  
    143143      (r1.termFreqs == r2.termFreqs));
    144144}
     145
     146//-------------------------------------------------------
     147// new BrowseQueryResult stuff
     148void BrowseQueryResult::Clear () {
     149  termFreqs.erase (termFreqs.begin(), termFreqs.end());
     150}
     151
     152BrowseQueryResult::BrowseQueryResult () {
     153  Clear ();
     154}
     155
     156
     157
     158ostream &operator<< (ostream &s, const BrowseQueryResult &r) {
     159  s << "terms: ";
     160  unsigned long i;
     161  for (i=0; i<r.termFreqs.size(); i++)
     162    s << r.termFreqs[i] << ", ";
     163    s << "\n\n";
     164  return s;
     165}
     166
     167
     168bool operator== (const BrowseQueryResult &r1, const BrowseQueryResult &r2) {
     169  return ((r1.termFreqs == r2.termFreqs));
     170     
     171}
     172
     173
     174
    145175
    146176//--------------------------------------
     
    668698    result.ranks.erase (result.ranks.begin(), result.ranks.end());
    669699}
     700
     701
     702
     703//--------------------------------------------------------------
     704// functions to support full text browse
     705
     706void FindNearestWordNumber (IndexData &indexData,
     707                const UCArray &term,
     708                unsigned long &number) {
     709
     710    // find the word number for this term
     711    unsigned long wordElNum = 0;
     712    unsigned long numLevels = indexData.bdh.num_levels;
     713    word_block_dict_el wordDictEl;
     714    wordDictEl.SetNumLevels (numLevels);
     715    if (NearestSearchWordBlockDictEl (indexData.dictFile, indexData.biWords,
     716                      indexData.bdh.entries_per_wblk,
     717                      indexData.bdh.word_dict_size,
     718                      numLevels, term, wordDictEl, wordElNum))
     719      number = wordElNum;
     720
     721}
     722
     723void GetTermList(IndexData &indexData,
     724         unsigned long startTerm,
     725         unsigned long numTerms,
     726         TermFreqArray &terms) {
     727
     728  word_block_dict_el_array wordBlocks; // = new word_block_dict_el_array();
     729  TermFreqData termdata;
     730
     731  terms.erase(terms.begin(), terms.end());
     732
     733  SearchWordBlockDictElNumRange (indexData.dictFile, indexData.biWords,
     734                 indexData.bdh.entries_per_wblk,
     735                 indexData.bdh.word_dict_size,
     736                 indexData.bdh.num_levels, startTerm,
     737                 numTerms, wordBlocks);
     738
     739  word_block_dict_el_array::iterator here = wordBlocks.begin();
     740  word_block_dict_el_array::iterator end = wordBlocks.end();
     741
     742  while (here != end) {
     743    termdata.Clear();
     744    termdata.term = (*here).el;
     745    termdata.termFreq = (*here).freq;
     746    terms.push_back(termdata);
     747    here++;
     748  }
     749
     750}
     751
     752void GetTermList(IndexData &indexData,
     753         unsigned long startTerm,
     754         unsigned long numTerms,
     755         UCArrayVector &terms) {
     756
     757 
     758 
     759  SearchWordBlockDictElNumRange (indexData.dictFile, indexData.biWords,
     760                 indexData.bdh.entries_per_wblk,
     761                 indexData.bdh.word_dict_size,
     762                 indexData.bdh.num_levels, startTerm,
     763                 numTerms, terms);
     764
     765}
     766
     767
     768
  • trunk/gsdl/src/mgpp/text/Terms.h

    r1124 r1300  
    163163                QueryResult &result);
    164164
    165 
     165//-----------------------------------------------------------------
    166166// new QueryResult class to handle retrieval of doc and level nums.
    167167// Use this class with extended version of MGQuery
     
    180180bool operator== (const ExtQueryResult &r1, const ExtQueryResult &r2);
    181181
     182//------------------------------------------------------------
     183// new functions to handle full text browse
     184
     185class BrowseQueryResult {
     186 public:
     187  TermFreqArray termFreqs;
     188  void Clear();
     189  BrowseQueryResult ();
     190
     191};
     192
     193
     194ostream &operator<< (ostream &s, const BrowseQueryResult &r);
     195bool operator== (const BrowseQueryResult &r1, const BrowseQueryResult &r2);
     196
     197void FindNearestWordNumber (IndexData &indexData,
     198                const UCArray &term,
     199                unsigned long &number);
     200
     201void GetTermList(IndexData &indexData,
     202         unsigned long startTerm,
     203         unsigned long numTerms,
     204         TermFreqArray &terms);
     205
     206void GetTermList (IndexData &indexData,
     207         unsigned long startTerm,
     208         unsigned long numTerms,
     209         UCArrayVector &terms);
     210
    182211#endif
    183212
     213
     214
     215
     216
  • trunk/gsdl/src/mgpp/text/invf.cpp

    r856 r1300  
    513513
    514514
     515// use the block dictionary functions for tag entries, and word block dict
     516// functions for word entries.
     517
    515518
    516519bool SearchBlockDictElNum (FILE *dictFile,
     
    689692  return false;
    690693}
     694
     695//----------------------------------------------------------------
     696// functions for full text browse
     697
     698bool NearestSearchWordBlockDictEl (FILE *dictFile,
     699                   const block_idx &bIdx,
     700                   unsigned long entriesPerBlock,
     701                   unsigned long dictSize,
     702                   unsigned long numLevels,
     703                   const UCArray &el,
     704                   word_block_dict_el &dictEl,
     705                   unsigned long &elNum) {
     706
     707  UCArrayClear (dictEl.el);
     708
     709  // find the block that contains the element
     710  unsigned long blockIdxNum;
     711  if (!SearchEl (bIdx, entriesPerBlock, el,
     712         blockIdxNum, elNum))
     713    return false;
     714
     715  unsigned long blockEndElNum = elNum + entriesPerBlock;
     716  if (blockEndElNum > dictSize) blockEndElNum = dictSize;
     717
     718  // look for the block
     719  fseek (dictFile, bIdx[blockIdxNum].block_ptr, SEEK_SET);
     720  while (elNum < blockEndElNum) {
     721    dictEl.Read (dictFile, numLevels);
     722    int res = DictCompare (el, dictEl.el); // look for the first word that is
     723                                     // greater or equal to the el
     724    if (res <= 0) {
     725      return true; // found one
     726   }
     727   
     728    elNum++;
     729  }
     730  // it must be the last term
     731  return true;
     732
     733
     734}
     735
     736
     737bool SearchWordBlockDictElNumRange (FILE *dictFile,
     738                    const block_idx &bIdx,
     739                    unsigned long entriesPerBlock,
     740                    unsigned long dictSize,
     741                    unsigned long numLevels,
     742                    unsigned long elNum,
     743                    unsigned long numWords,
     744                    UCArrayVector &terms) {
     745
     746  word_block_dict_el dictEl; 
     747  dictEl.SetNumLevels (numLevels);
     748  UCArrayClear(dictEl.el);
     749
     750  terms.erase(terms.begin(), terms.end());
     751
     752  if (elNum >= dictSize) return false;
     753 
     754  // find the block that contains the element
     755  unsigned long blockIdxNum, curElNum;
     756  if (!SearchElNum (bIdx, entriesPerBlock, elNum,
     757            blockIdxNum, curElNum))
     758    return false;
     759
     760  unsigned long lastElNum = elNum + numWords - 1;
     761  if (lastElNum > dictSize) lastElNum = dictSize;
     762
     763  // look for the block
     764  fseek (dictFile, bIdx[blockIdxNum].block_ptr, SEEK_SET);
     765 
     766  // get the first term
     767  do {
     768    dictEl.Read (dictFile, numLevels);
     769  } while (curElNum++ < elNum);
     770
     771  terms.push_back(dictEl.el);
     772  while (curElNum <= lastElNum ) {
     773    dictEl.Read(dictFile, numLevels);
     774    terms.push_back(dictEl.el);
     775    curElNum++;
     776  }
     777 
     778
     779  return true;
     780}
     781
     782// NOte: before each addition of dictEl to the array, the level freqs array
     783// is deleted, as this was causing problems - generating a seg fault, I think if
     784// the vector had to be reallocated or something.
     785// setNumLevels has to be called each time before a read, now, to set up the level
     786//freqs array. this is necessary.
     787bool SearchWordBlockDictElNumRange (FILE *dictFile,
     788                    const block_idx &bIdx,
     789                    unsigned long entriesPerBlock,
     790                    unsigned long dictSize,
     791                    unsigned long numLevels,
     792                    unsigned long elNum,
     793                    unsigned long numWords,
     794                    word_block_dict_el_array &terms) {
     795
     796  word_block_dict_el dictEl;
     797  dictEl.SetNumLevels (numLevels);
     798  UCArrayClear(dictEl.el);
     799
     800  block_dict_el elem;
     801  terms.erase(terms.begin(), terms.end());
     802
     803  if (elNum >= dictSize) return false;
     804 
     805  // find the block that contains the element
     806  unsigned long blockIdxNum, curElNum;
     807  if (!SearchElNum (bIdx, entriesPerBlock, elNum,
     808            blockIdxNum, curElNum))
     809    return false;
     810
     811  unsigned long lastElNum = elNum + numWords - 1;
     812  if (lastElNum > dictSize) lastElNum = dictSize;
     813
     814  // look for the block
     815  fseek (dictFile, bIdx[blockIdxNum].block_ptr, SEEK_SET);
     816  // get the first term
     817  do {
     818    dictEl.Read (dictFile, numLevels);
     819  } while (curElNum++ < elNum);
     820
     821  dictEl.levelFreqs = NULL;
     822  terms.push_back(dictEl);
     823
     824  while (curElNum <= lastElNum ) {
     825    dictEl.SetNumLevels(numLevels);
     826    dictEl.Read(dictFile, numLevels);
     827    dictEl.levelFreqs = NULL;
     828    terms.push_back(dictEl);
     829    curElNum++;
     830  }
     831
     832  return true;
     833}
     834
     835
     836
     837
     838
     839
     840
     841
     842
  • trunk/gsdl/src/mgpp/text/invf.h

    r1122 r1300  
    158158};
    159159
    160 
     160typedef vector<word_block_dict_el> word_block_dict_el_array;
    161161
    162162struct block_idx_info {
     
    302302                unsigned long &elNum);
    303303
    304 
     304//----------------------------------------------------------
     305
     306// new functions for full text browse
     307
     308bool NearestSearchWordBlockDictEl (FILE *dictFile,
     309                const block_idx &bIdx,
     310                unsigned long entriesPerBlock,
     311                unsigned long dictSize,
     312                unsigned long numLevels,
     313                const UCArray &el,
     314                word_block_dict_el &dictEl,
     315                unsigned long &elNum);
     316
     317// returns a list of word_block_dict_el, with no levelfreqs
     318bool SearchWordBlockDictElNumRange (FILE *dictFile,
     319                    const block_idx &bIdx,
     320                    unsigned long entriesPerBlock,
     321                    unsigned long dictSize,
     322                    unsigned long numLevels,
     323                    unsigned long elNum,
     324                    unsigned long numWords,
     325                    word_block_dict_el_array &terms);
     326                   
     327// just returns a list of terms
     328bool SearchWordBlockDictElNumRange (FILE *dictFile,
     329                    const block_idx &bIdx,
     330                    unsigned long entriesPerBlock,
     331                    unsigned long dictSize,
     332                    unsigned long numLevels,
     333                    unsigned long elNum,
     334                    unsigned long numWords,
     335                    UCArrayVector &terms);
     336                   
    305337
    306338
Note: See TracChangeset for help on using the changeset viewer.