Changeset 1873


Ignore:
Timestamp:
2001-01-30T12:08:49+13:00 (23 years ago)
Author:
paynter
Message:

Fixed a bug iin the Phrase extraction alogrithm that had the
"candidates" in the GetMinimalExpansions function sorted backwards.

Location:
trunk/gsdl/src/phind/generate
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/src/phind/generate/phrase.cpp

    r1631 r1873  
    180180// Does the phrase have a unique suffix/prefix extension?
    181181//
    182 // If uniqueSuffixExtensions is 1, then we do.
    183 // If it is > 1 then we do not.
    184 // If it is -1 then we have to check in the suffix array.
     182// For suffix:
     183//   If uniqueSuffixExtensions is 1, then phrase has some unique expansion.
     184//   If it is 0 then it does not.  If it is -1 then we don't know, and have
     185//   to calculate it from the suffix array.  Same goes for prefixes.
    185186
    186187int Phrase::hasUniqueSuffixExtension() {
     
    189190  if (uniqueSuffixExtension == -1) {
    190191
    191     if (!suffixFound) {
    192       findFirstAndLastSuffix();
    193     }
    194 
    195     /*    if (firstSuffixIndex == 56957) {
    196       cout << "Testing suffix expansion: " << toString() << "\n";
    197       cout << "mode: " << phraseMode << endl;
    198     }
    199     */
     192    ensureSuffixFound();
    200193
    201194    // pointers to the phrase's first and last occurances in te suffixArray
     
    235228    ensurePrefixFound();
    236229
    237     // pointers to the phrase's first and last occurances in te prefixArray
     230    // pointers to the phrase's first and last occurances in the prefixArray
    238231    symbol *fst = firstPrefix - length;
    239232    symbol *lst = lastPrefix - length;
     
    249242    // in STOPWORDS mode, make sure there is a unique next content symbol
    250243    else {
    251       // if (firstSuffixIndex == 56962) cout << "Testing 1: s" << *fst << " = s" << *lst << endl;
    252244      uniquePrefixExtension = 0;
    253245      while ((*fst == *lst) && (*fst > LASTDELIMITER)) {
     
    266258
    267259
    268 // Expand a phrase with a unique suffix/prefix extension by 1 symbol
     260// Expand a phrase with a unique suffix/prefix by 1 symbol
    269261//
    270262// Note that in STOPWORDS mode a "unique extension" means a unique
     
    274266
    275267int Phrase::expandUniqueSuffixExtensionByOne() {
    276   assert(forward);
    277268  assert(suffixFound);
    278269  assert(uniqueSuffixExtension == 1);
     
    299290
    300291int Phrase::expandUniquePrefixExtensionByOne() {
    301   assert(back);
    302292  assert(prefixFound);
    303293  assert(uniquePrefixExtension == 1);
     
    797787// Compare the length of two phrases
    798788//
    799 // Given two phrases, return true if the first is shorter,
    800 // otherwise return false.  For use with the STL sort function.
     789// Given two phrases, return true if the first is shorter/longer,
     790// otherwise return false.  For use in various sort functions.
    801791
    802792bool isShorter(Phrase p1, Phrase p2) {
     
    807797}
    808798
    809 
     799bool isLonger(Phrase p1, Phrase p2) {
     800  if (p1.length > p2.length) {
     801    return true;
     802  }
     803  return false;
     804}
     805
     806
  • trunk/gsdl/src/phind/generate/phrase.h

    r1631 r1873  
    126126
    127127bool isShorter(Phrase p1, Phrase p2);
     128bool isLonger(Phrase p1, Phrase p2);
    128129
    129130
  • trunk/gsdl/src/phind/generate/suffix.cpp

    r1683 r1873  
    11/**********************************************************************
    22 *
    3  * suffix.h -- Extract the repeated phrases in the input using
    4  *             suffix and prefix arrays.
     3 * suffix.cpp -- Extract the repeated phrases in the input using
     4 *               suffix and prefix arrays.
    55 *
    6  * Copyright 2000 Gordon W. Paynter ([email protected])
     6 * Copyright 2000 Gordon W. Paynter
    77 * Copyright 2000 The New Zealand Digital Library Project
    88 *
     
    507507
    508508  // 2.2 Sort the candidates by phrase length
    509   // sort(candidates.begin(), candidates.end(), isShorter);
    510   make_heap(candidates.begin(), candidates.end(), isShorter);
    511 
     509  make_heap(candidates.begin(), candidates.end(), isLonger);
    512510
    513511  // 3. While candidates is non-empty, confirm the phrases it
     
    516514
    517515    // 3.1 Get next candidate
    518     //Phrase c = candidates.front();
    519     //candidates.erase(candidates.begin());
    520     pop_heap(candidates.begin(), candidates.end(), isShorter);
     516    pop_heap(candidates.begin(), candidates.end(), isLonger);
    521517    Phrase c = candidates.back();
    522518    candidates.pop_back();
     
    538534    // cout << "expanding prefix " << c.toString() << "=> ";
    539535    c.expandUniquePrefixExtensionByOne();
    540     //candidates.push_back(c);
    541     //sort(candidates.begin(), candidates.end(), isShorter);
    542536    candidates.push_back(c);
    543     push_heap(candidates.begin(), candidates.end(), isShorter);
     537    push_heap(candidates.begin(), candidates.end(), isLonger);
    544538     }
    545539   
     
    572566      else if (c.hasUniqueSuffixExtension()) {
    573567    c.expandUniqueSuffixExtensionByOne();
    574     //candidates.push_back(c);
    575     //sort(candidates.begin(), candidates.end(), isShorter);
    576568    candidates.push_back(c);
    577     push_heap(candidates.begin(), candidates.end(), isShorter);
     569    push_heap(candidates.begin(), candidates.end(), isLonger);
    578570      }
    579571   
Note: See TracChangeset for help on using the changeset viewer.