Changeset 1873

Show
Ignore:
Timestamp:
30.01.2001 12:08:49 (19 years ago)
Author:
paynter
Message:

Fixed a bug iin the Phrase extraction alogrithm that had the
"candidates" in the GetMinimalExpansions? function sorted backwards.

Location:
trunk/gsdl/src/phind/generate
Files:
3 modified

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/src/phind/generate/phrase.cpp

    r1631 r1873  
    180180// Does the phrase have a unique suffix/prefix extension? 
    181181// 
    182 // If uniqueSuffixExtensions is 1, then we do. 
    183 // If it is > 1 then we do not. 
    184 // If it is -1 then we have to check in the suffix array. 
     182// For suffix: 
     183//   If uniqueSuffixExtensions is 1, then phrase has some unique expansion.  
     184//   If it is 0 then it does not.  If it is -1 then we don't know, and have 
     185//   to calculate it from the suffix array.  Same goes for prefixes. 
    185186 
    186187int Phrase::hasUniqueSuffixExtension() { 
     
    189190  if (uniqueSuffixExtension == -1) { 
    190191 
    191     if (!suffixFound) { 
    192       findFirstAndLastSuffix(); 
    193     } 
    194  
    195     /*    if (firstSuffixIndex == 56957) { 
    196       cout << "Testing suffix expansion: " << toString() << "\n"; 
    197       cout << "mode: " << phraseMode << endl; 
    198     } 
    199     */ 
     192    ensureSuffixFound(); 
    200193 
    201194    // pointers to the phrase's first and last occurances in te suffixArray 
     
    235228    ensurePrefixFound(); 
    236229 
    237     // pointers to the phrase's first and last occurances in te prefixArray 
     230    // pointers to the phrase's first and last occurances in the prefixArray 
    238231    symbol *fst = firstPrefix - length; 
    239232    symbol *lst = lastPrefix - length; 
     
    249242    // in STOPWORDS mode, make sure there is a unique next content symbol 
    250243    else { 
    251       // if (firstSuffixIndex == 56962) cout << "Testing 1: s" << *fst << " = s" << *lst << endl; 
    252244      uniquePrefixExtension = 0; 
    253245      while ((*fst == *lst) && (*fst > LASTDELIMITER)) { 
     
    266258 
    267259 
    268 // Expand a phrase with a unique suffix/prefix extension by 1 symbol 
     260// Expand a phrase with a unique suffix/prefix by 1 symbol 
    269261// 
    270262// Note that in STOPWORDS mode a "unique extension" means a unique 
     
    274266 
    275267int Phrase::expandUniqueSuffixExtensionByOne() { 
    276   assert(forward); 
    277268  assert(suffixFound); 
    278269  assert(uniqueSuffixExtension == 1); 
     
    299290 
    300291int Phrase::expandUniquePrefixExtensionByOne() { 
    301   assert(back); 
    302292  assert(prefixFound); 
    303293  assert(uniquePrefixExtension == 1); 
     
    797787// Compare the length of two phrases 
    798788// 
    799 // Given two phrases, return true if the first is shorter, 
    800 // otherwise return false.  For use with the STL sort function. 
     789// Given two phrases, return true if the first is shorter/longer, 
     790// otherwise return false.  For use in various sort functions. 
    801791 
    802792bool isShorter(Phrase p1, Phrase p2) { 
     
    807797} 
    808798 
    809  
     799bool isLonger(Phrase p1, Phrase p2) { 
     800  if (p1.length > p2.length) { 
     801    return true; 
     802  } 
     803  return false; 
     804} 
     805 
     806 
  • trunk/gsdl/src/phind/generate/phrase.h

    r1631 r1873  
    126126 
    127127bool isShorter(Phrase p1, Phrase p2); 
     128bool isLonger(Phrase p1, Phrase p2); 
    128129 
    129130 
  • trunk/gsdl/src/phind/generate/suffix.cpp

    r1683 r1873  
    11/********************************************************************** 
    22 * 
    3  * suffix.h -- Extract the repeated phrases in the input using  
    4  *             suffix and prefix arrays. 
     3 * suffix.cpp -- Extract the repeated phrases in the input using  
     4 *               suffix and prefix arrays. 
    55 * 
    6  * Copyright 2000 Gordon W. Paynter (gwp@cs.waikato.ac.nz) 
     6 * Copyright 2000 Gordon W. Paynter 
    77 * Copyright 2000 The New Zealand Digital Library Project 
    88 * 
     
    507507 
    508508  // 2.2 Sort the candidates by phrase length 
    509   // sort(candidates.begin(), candidates.end(), isShorter); 
    510   make_heap(candidates.begin(), candidates.end(), isShorter); 
    511  
     509  make_heap(candidates.begin(), candidates.end(), isLonger); 
    512510 
    513511  // 3. While candidates is non-empty, confirm the phrases it  
     
    516514 
    517515    // 3.1 Get next candidate 
    518     //Phrase c = candidates.front(); 
    519     //candidates.erase(candidates.begin()); 
    520     pop_heap(candidates.begin(), candidates.end(), isShorter); 
     516    pop_heap(candidates.begin(), candidates.end(), isLonger); 
    521517    Phrase c = candidates.back(); 
    522518    candidates.pop_back(); 
     
    538534    // cout << "expanding prefix " << c.toString() << "=> "; 
    539535    c.expandUniquePrefixExtensionByOne(); 
    540     //candidates.push_back(c); 
    541     //sort(candidates.begin(), candidates.end(), isShorter); 
    542536    candidates.push_back(c); 
    543     push_heap(candidates.begin(), candidates.end(), isShorter); 
     537    push_heap(candidates.begin(), candidates.end(), isLonger); 
    544538     } 
    545539     
     
    572566      else if (c.hasUniqueSuffixExtension()) { 
    573567    c.expandUniqueSuffixExtensionByOne(); 
    574     //candidates.push_back(c); 
    575     //sort(candidates.begin(), candidates.end(), isShorter); 
    576568    candidates.push_back(c); 
    577     push_heap(candidates.begin(), candidates.end(), isShorter); 
     569    push_heap(candidates.begin(), candidates.end(), isLonger); 
    578570      } 
    579571