Changeset 2801


Ignore:
Timestamp:
2001-10-15T15:02:54+13:00 (23 years ago)
Author:
kjm18
Message:

New version of suffix, based on suffix2 (Gordon and Craig's simpler version),
with Kath's improvements.

Location:
trunk/gsdl/src/phind/generate
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/src/phind/generate/phrase.cpp

    r2704 r2801  
    153153  --length;
    154154  --back;
     155  if (phraseMode==STOPWORDS) {
     156    while (*back >=firstStopSymbol && *back <= lastStopSymbol) {
     157      --length;
     158      --back;
     159    }
     160  } 
    155161  clearSuffix();
    156162  clearPrefix();
     
    161167  --length;
    162168  ++forward;
     169  if (phraseMode==STOPWORDS) {
     170    while (*forward >=firstStopSymbol && *forward <= lastStopSymbol) {
     171      --length;
     172      ++forward;
     173    }
     174  }
    163175  clearSuffix();
    164176  clearPrefix();
     
    391403
    392404int Phrase::findFirstAndLastSuffix() {
     405 
    393406  return findFirstAndLastSuffix(0, inputLength-1);
    394407}
     
    399412  assert(begin <= end);
    400413
    401   // cout << "findFirstAndLastSuffix (" << begin << " - " << end << ") : " << toString() << endl;
    402  
    403414  // if we're only searching one cell, it is very easy
    404415  if (begin == end) {
     
    415426
    416427  do {
    417     // cout << "Find anywhere in " << begin << " - " << end << endl;
    418428    c = (end + begin) / 2;
    419429    cmp = compareSuffix(suffixArray[c], length);
     
    436446
    437447  do {
    438     // cout << "first-searching with " << begin << " - " << end << endl;
    439448    if (begin == end) {
    440449      c = begin;
     
    451460    // to find the first occurance, suffixArray[c] must be the same as the
    452461    // phrase, but suffixArray[c-1] must be different.
    453     cmp = compareSuffix(suffixArray[c-1], length);
    454     if (cmp == 0) {
    455       end = c - 1;
    456       assert(end >= begin);
    457       cmp = 1;
    458     } else {
    459       cmp = 0;
     462    if (c>0) {
     463      cmp = compareSuffix(suffixArray[c-1], length);
     464      if (cmp == 0) {
     465        end = c - 1;
     466        assert(end >= begin);
     467        cmp = 1;
     468      } else {
     469        cmp = 0;
     470      }
    460471    }
    461472      }
     
    473484 
    474485  do {
    475     // cout << "last-searching with " << begin << " - " << end << endl;
    476486    if (begin == end) {
    477487      c = begin;
     
    525535  assert(begin <= end);
    526536
    527   // cout << "findFirstAndLastPrefix (" << begin << " - " << end << ") : " << toString() << endl;
    528  
    529537  // if we're only searching one cell, it is very easy
    530538  if (begin == end) {
     
    541549
    542550  do {
    543     // cout << "Find anywhere in prefixarray " << begin << " - " << end << endl;
    544551    c = (end + begin) / 2;
    545552    cmp = comparePrefix(prefixArray[c], length);
     
    560567
    561568  do {
    562     // cout << "first-prefix-searching with " << begin << " - " << end << endl;
    563569    if (begin == end) {
    564570      c = begin;
     
    567573      c = (begin + end) / 2;
    568574      cmp = comparePrefix(prefixArray[c], length);
    569       // cout << "cmp: " << cmp << ", c: " << c << endl;
    570575      if (cmp == 1) {
    571576    // target phrase is lower than phrase at prefixArray[c]
     
    601606 
    602607  do {
    603     // cout << "last-prefix-searching with " << begin << " - " << end << endl;
    604608    if (begin == end) {
    605609      c = begin;
     
    829833}
    830834
    831 
    832 
    833 
    834 
    835 
    836 
    837 
    838 
    839 
    840 
    841 
    842835// Compare the length of two phrases
    843836//
  • trunk/gsdl/src/phind/generate/phrase.h

    r2694 r2801  
    126126  friend std::ostream &operator<<(std::ostream &stream, const Phrase &phrase);
    127127
     128  int uniqueSuffixExtension;
     129  int uniquePrefixExtension;
     130
    128131private:
    129132
    130133  // Does the phrase have a unique suffix/prefix extension?
    131134  // if yes, then 1; if no then 0; if unknown then -1;
    132   int uniqueSuffixExtension;
    133   int uniquePrefixExtension;
    134135
    135136  // reset a phrase
  • trunk/gsdl/src/phind/generate/suffix.cpp

    r2498 r2801  
    11/**********************************************************************
    22 *
    3  * suffix.cpp -- Extract the repeated phrases in the input using
    4  *               suffix and prefix arrays.
     3 * Suffix.cpp -- Extract the repeated phrases in the input with suffix
     4 *                and prefix arrays (cgn & gwp's simpler algorithm,
     5 *                 and kjm's improvements).
    56 *
    67 * Copyright 2000 Gordon W. Paynter
     
    6162symbol   *symbols;
    6263symbol  **suffixArray;
     64symbol  **prefixArray;
    6365check    *suffixCheck;
    64 symbol  **prefixArray;
    65 check    *prefixCheck;
    6666
    6767
     
    7070symbol  **documentArray;
    7171
     72
    7273// Do we accept any phrase, or do we eliminate those ending with stopwords ?
    7374int phraseMode = ANYPHRASE; //STOPWORDS;
    7475
     76
    7577// The filestem of the collection's phindex directory
    7678char collection[FILENAME_MAX];
    7779
    78 int suffixCompare(const void *, const void *);
    79 int prefixCompare(const void *, const void *);
    80 int pointerCompare(const void *, const void *);
    81 
    82 int readNumbers();
    83 void readStatistics();
    84 
    85 void getMinimalExpansions(Phrase &p, vector<Phrase> &results);
    86 cellcount getDocumentOccurrances(Phrase &p, cellcount *frequency);
    8780
    8881// The ranges of the stopword and content-word symbols for the collection
     
    9386
    9487
    95 
    96 
    97 // Phrase memory
    98 // We have to "remember" each phrase that we've expanded
     88// Some useful comparison functions, defined below.
     89int suffixCompare(const void *, const void *);
     90int prefixCompare(const void *, const void *);
     91int pointerCompare(const void *, const void *);
     92
     93
     94// Functions for implementing "phrase memory".  These let us "remember"
     95// each phrase that we've expanded without using too much memory.
    9996void initialisePhraseMemory();
    10097void rememberThisPhrase(cellindex index, cellcount length);
     
    107104
    108105
    109 int main (int argc, char * argv[]) {
    110 
    111   // Command-line arguments
    112   // argv[1] is the phindex directory
    113   // argv[2] is the maximum array symbol length (optional)
    114   // argv[3] is the mode, where 1 is stopword mode (optional)
    115   if (argc < 2) {
    116     cerr << "Usage: " << argv[0] << " phind-directory mode [verbosity]" << endl;
    117     exit(1);
    118   }
    119 
    120   // collection directory
    121   strcpy(collection, argv[1]);
    122 
    123   // mode parameter
    124   phraseMode = atoi(argv[2]);
    125   assert((phraseMode == STOPWORDS) || (phraseMode == ANYPHRASE));
    126 
    127   // optional verbosity parameter
    128   if (argc == 4) {
    129     verbosity = atoi(argv[3]);
    130     assert (verbosity >= 0);
    131   }
    132 
    133   if (verbosity) {
    134     cout << "Suffix phrase extraction program" << endl;
    135   }
    136 
    137   if (verbosity > 1) {
    138     if (phraseMode == STOPWORDS) {
    139       cout << "Stopwords mode: no phrase may begin or end with a stopword" << endl;
    140     } else {
    141       cout << "AllPhrase mode: extract every phrase that occurs more than once" << endl;
    142     }
    143   }
    144 
    145   // Read the statistics file
    146   readStatistics();
    147 
    148   // Read the numbers file
    149   readNumbers();
    150 
    151   // Create the suffix & prefix arrays
    152   suffixArray = new symbol *[inputLength];
    153   prefixArray = new symbol *[inputLength];
    154   suffixCheck = new check[inputLength];
    155   prefixCheck = new check[inputLength];
    156   if (prefixCheck == NULL) {
    157     cerr << "Suffix error: not enough memory to hold " << inputLength
    158      << " symbols." << endl;
    159     exit(2);
    160   } 
    161 
    162   // Initialise prefix and suffix arrays
    163   for (cellcount j = 0; j < inputLength; j++) {
    164     suffixArray[j] = &symbols[j];
    165     prefixArray[j] = &symbols[j];
    166   }
    167   qsort(suffixArray, inputLength, sizeof(symbol *), suffixCompare);
    168   qsort(prefixArray, inputLength, sizeof(symbol *), prefixCompare);
    169 
    170 
    171   // Create the document arrays
    172   if (numberOfDocuments == 0) {
    173     cerr << "There are no documents in this collection!" << endl;
    174     exit(1);
    175   }
    176   if (verbosity > 1) {
    177     cout << "Allocating document arrays for " << numberOfDocuments << " documents" << endl;
    178   }
    179 
    180   // The document frequecy array is used to count the number of times
    181   // each phrase occurs in each document.  The number of documents in
    182   // which a phrase occurs is stored in df.
    183   frequency *documentFrequency = new frequency[numberOfDocuments];
    184   frequency df;
    185 
    186   // documentArray will be searched in order to discover which document
    187   // each phrase occurs in.
    188   documentArray = new symbol *[numberOfDocuments]; 
    189 
    190   // Discover all the DOCUMENTSTART symbols and store as a phrase
    191   cellindex d = 0;
    192   while (*suffixArray[d] != DOCUMENTSTART) {
    193     d++;
    194   }
    195   Phrase p(suffixArray[d], 1, SUFFIX);
    196   p.findFirstAndLastSuffix(d, inputLength-1);
     106// Get a phrase's expansions
     107//
     108// Get the set of "minimal", "maximal", non-unique expansions of a
     109// phrase p, using the simpler algorithm that Craig and Gordon came up
     110// with at Google.
     111//
     112// Returns a vector of Expansions.
     113
     114void getExpansions(Phrase &p, vector<Phrase> &results) {
     115
     116  // 1. Get the initial candidates
     117  vector<Phrase> candidates;
     118  p.initialSuffixCandidates(candidates);
     119  int suffcands = candidates.size();
     120  p.initialPrefixCandidates(candidates);
     121
     122  if (candidates.size() == 0)
     123    return;
     124
     125  vector<Phrase>::iterator i;
     126  for (i = candidates.begin(); i != candidates.end(), suffcands>0; ++i, --suffcands) {
     127    i->expandWhileUniquePrefixExtension();
     128    i->ensureSuffixFound();
     129  }
     130  for (i; i != candidates.end(); ++i) {
     131    i->expandWhileUniqueSuffixExtension();
     132  }
     133
     134  // 3. Ensure minimality: ignore phrases whose subphrases are also found
     135  results.clear();
     136
     137  // Initialise the candidates, check array, and various variables.
     138  sort(candidates.begin(), candidates.end(), isShorter);
     139  for (cellcount j = 0; j < inputLength; j++)
     140    suffixCheck[j] = 0;
     141  unsigned minimum_length = candidates.begin()->length;
    197142 
    198   // Insert the document locations (as pointers) into documentArray
    199   for (cellcount i = 0; i < p.suffixFrequency; i++) {
    200     documentArray[i] = suffixArray[i + p.firstSuffixIndex];
    201   }
    202  
    203   // Sort the document array into ascending order of raw pointer value
    204   qsort(documentArray, numberOfDocuments, sizeof(symbol *), pointerCompare);
    205 
    206 
    207   // Extract phrases
    208   //
    209   // We will make several passesover the data, in each case considering
    210   // a set of input phrases and generating a set of output phrases, which
    211   // we will expancd in later passes.
    212   //
    213   // The input phrases in the first pass will be the vocabulary.
    214   // In later passes, the input phrases will be the output phrases of the
    215   // previous pass.
    216   //
    217   // In each pass we will consider each input phrase in turn.  If we
    218   // have seen it before, we will ignore it.  Otherwise, we will expand
    219   // it and add its expansions to the set of output phrases.
    220 
    221   // Store the phrase data in the phrases file
    222   char phraseDataName[FILENAME_MAX];
    223   sprintf(phraseDataName, "%s/phrases", collection);
    224   ofstream phraseData(phraseDataName, ios::out);
    225   if (!phraseData) {
    226     cout << "File " << phraseDataName << " could not be opened\n";
    227     exit(1);
    228   }
    229 
    230   // Count the number of phrases output
    231   unsigned long int phraseCounter = 0;
    232 
    233   // Set up the phrase expansion memory.
    234   // We need this so that we don't expand a phrase more than once
    235   initialisePhraseMemory();
    236 
    237   // The current pass numebr
    238   int phrasePass = 1;
    239 
    240 
    241   // PASS NUMBER 1
    242   if (verbosity > 1) {
    243     cout << "Starting pass " << phrasePass << endl;
    244   }
    245 
    246   ofstream outPhrase;
    247   char     outPhraseName[FILENAME_MAX];
    248   unsigned long int outPhraseCounter = 0;
    249 
    250   // On the first pass, simply work through the vocabulary
    251   sprintf(outPhraseName, "%s/outPhrase.1", collection);
    252   outPhrase.open(outPhraseName, ios::out);
    253   if (!outPhrase) {
    254     cerr << "File " << outPhraseName << " could not be opened\n";
    255     exit(1);
    256   }
    257 
    258   // Iterate over the different symbols by working through the suffix array
    259   vector<Phrase> result;
    260   cellindex ij = 0;
    261   char *tmpString;
    262 
    263   while (ij < inputLength) {
    264 
    265     // make a new phrase of length 1
    266     p = Phrase(suffixArray[ij], 1, SUFFIX);
    267     p.findFirstAndLastSuffix(ij, inputLength-1);
    268 
    269     // cout << "cell " << ij << " - " << p.toString() << endl;
    270 
    271     // We ignore this symbol if it occurs only once, if it is a delimiter,
    272     // of if we are in stopwords mode and it is a stopword
    273     //
    274     // We could imagine a new mode/command-line option, which is like
    275     // STOPWORDS but without this restrictrion.  This would let you browse
    276     // from "the" to "the AGRIS" for example, but not from "AGRIS" to
    277     // "the AGRIS" (where the is a stopword and AGRIS a content word).
    278     // The system used to work like this; it is easy to implement, but
    279     // it explodes the size of the indexes.  So: would it be useful? 
    280     if (!((p.suffixFrequency <= 1) ||
    281       // (*suffixArray[ij] != 23054) ||
    282       (*suffixArray[ij] <= LASTDELIMITER) ||
    283       ((phraseMode == STOPWORDS) && (*suffixArray[ij] <= lastStopSymbol)))) {
    284 
    285       // Get minimal expansions of the phrase
    286       getMinimalExpansions(p, result);
     143  // Try to add each candidate to the results set, ignoring the non-minimal
     144  for (vector<Phrase>::iterator candidate = candidates.begin();
     145       candidate != candidates.end(); candidate++) {
     146
     147    // Make a copy of candidate to mutilate while performing sub-phrase checks
     148    Phrase temp_phrase(*candidate);
     149    bool shorter_found = false;
     150   
     151    // Check for shorter and shorter versions of the tenporary phrase
     152    while (temp_phrase.length >= minimum_length && !shorter_found) {
     153      temp_phrase.ensureSuffixFound();
     154      if (suffixCheck[temp_phrase.firstSuffixIndex] == 0)
     155    temp_phrase.shortenByOneAtPrefix();
     156      else
     157    shorter_found = true;
     158
     159      // Possible efficiency here: we can finish if the prefix of c
     160      // and temp_phrase are the same for candidate->length symbols.
     161    }
    287162     
    288       if (!result.empty()) {
    289    
    290     // Remember that we have expanded this phrase
    291     rememberThisPhrase(ij, 1);
    292 
    293     // write the phrase text
    294     tmpString = p.toString();
    295     phraseData << ij << "-1:" << tmpString << ":" << p.suffixFrequency << ":"
    296            << result.size() << ":";
    297     delete [] tmpString;
    298 
    299     // write the results
    300     for (cellcount k = 0; k < result.size(); k++) {
    301       if (k) {
    302         phraseData << ",";
    303       }
    304       phraseData << result[k].firstSuffixIndex << "-" << result[k].length;
    305       outPhrase << result[k].firstSuffixIndex << " " << result[k].length << endl;
    306       outPhraseCounter++;
    307     }
    308     result.clear();
    309    
    310     // Write the documents in which this phrase occurs
    311     df = getDocumentOccurrances(p, documentFrequency);
    312     phraseData << ":" << df << ":";
    313 
    314     // write the documents
    315     for (cellcount m = 0, first = 1; m < numberOfDocuments; m++) {
    316       if (documentFrequency[m]) {
    317         if (first) {
    318           first = 0;
    319         } else {
    320           phraseData << ";";
    321         }
    322         // Output the document number.  Note that here we've numbered the
    323         // N documents from 0 to N-1, but later they'll be 1-N.  Thus we
    324         // add 1 to the document id when we output it.
    325         phraseData << "d" << (m+1);
    326         // Next, output the frequency with which the document occurs, but
    327         // only if it is > 1.
    328         if (documentFrequency[m] > 1) {
    329           phraseData << "," << documentFrequency[m];
    330         }
    331       }
    332     }
    333 
    334     phraseData << endl;
    335     phraseCounter++;
    336 
    337     // feedback
    338     if (verbosity) {
    339       if (phraseCounter % 1000 == 0) {
    340         tmpString = p.toString();
    341         cout << "phrase " << phraseCounter << ": "
    342          << "cell " << p.firstSuffixIndex << " - " << tmpString << endl;
    343         delete [] tmpString;
    344       }
    345     }
    346       }
    347     }
    348    ij = p.lastSuffixIndex + 1;
    349   }
    350   outPhrase.close();
    351 
    352   // REMAINING PASSES
    353   // The previous outPhrase file forms the input to each new pass
    354   cellcount start, length;
    355   while (outPhraseCounter > 0) {
    356 
    357     // Start a new pass
    358     phrasePass++;
    359     if (verbosity) {
    360       cout << "Starting pass " << phrasePass << endl;
    361     }
    362 
    363     // Open the input file
    364     char inPhraseName[FILENAME_MAX];
    365     sprintf(inPhraseName, "%s/outPhrase.%d", collection, phrasePass - 1);
    366     ifstream inPhrase (inPhraseName, ios::in);
    367     if (!inPhrase) {
    368       cerr << "File " << inPhraseName << " could not be opened\n";
    369       exit(1);
    370     }
    371 
    372     // Open the output file
    373     sprintf(outPhraseName, "%s/outPhrase.%d", collection, phrasePass);
    374     outPhrase.open(outPhraseName, ios::out);
    375     if (!outPhrase) {
    376       cerr << "File " << outPhraseName << " could not be opened\n";
    377       exit(1);
    378     }
    379     outPhraseCounter = 0;
    380 
    381     // Process each phrase
    382     while(inPhrase >> start >> length) {
    383 
    384       // Ignore the phrase if we have expanded it before
    385       if (isPhraseStored(start, length)) {
    386     continue;
    387       }
    388 
    389       // Remember that we have examined this phrase
    390       rememberThisPhrase(start, length);
    391 
    392       // Find the phrase in the suffixarray
    393       p = Phrase(suffixArray[start], length, SUFFIX);
    394       p.findFirstAndLastSuffix(start, inputLength-1);
    395 
    396       // cout << "index " << start << ", length " << length << " - "  <<  p.toString() << endl;
    397      
    398 
    399       // Ignore the phrase if it only occurs once
    400       if (p.suffixFrequency < 2) {
    401     continue;
    402       }
    403 
    404 
    405       // Write the phrase text  tmpString = p.toString();
    406       tmpString = p.toString();
    407       phraseData << start << "-" << length << ":" << tmpString << ":"
    408          << p.suffixFrequency << ":";
    409       delete [] tmpString;
    410    
    411 
    412       // Expand the phrase, if it is fewer than 8 words long
    413       if (length <= 8) {
    414 
    415     // Get the minimal expansions for this phrase
    416     getMinimalExpansions(p, result);
    417      
    418     // write the results
    419     phraseData << result.size() << ":";
    420 
    421     for (cellcount i = 0; i < result.size(); i++) {
    422       if (i) {
    423         phraseData << ",";
    424       }
    425       phraseData << result[i].firstSuffixIndex << "-" << result[i].length;
    426       outPhrase << result[i].firstSuffixIndex << " " << result[i].length << endl;
    427       outPhraseCounter++;
    428     }
    429     result.clear();
    430    
    431       } else {
    432     // phrase is too long to expand further
    433     phraseData << "0:";
    434       }
    435 
    436    
    437       // Write the documents in which this phrase occurs
    438       df = getDocumentOccurrances(p, documentFrequency);
    439       phraseData << ":" << df << ":";
    440 
    441       // write the documents
    442       for (cellcount i = 0, first = 1; i < numberOfDocuments; i++) {
    443     if (documentFrequency[i]) {
    444       if (first) {
    445         first = 0;
    446       } else {
    447         phraseData << ";";
    448       }
    449       // Output the document number.  Note that here we've numbered the
    450       // N documents from 0 to N-1, but later they'll be 1-N.  Thus we
    451       // add 1 to the document id when we output it.
    452       phraseData << "d" << (i+1);
    453       // Next, output the frequency with which the document occurs, but
    454       // only if it is > 1.
    455       if (documentFrequency[i] > 1) {
    456         phraseData << "," << documentFrequency[i];
    457       }
    458     }
    459       }
    460      
    461       phraseData << endl;
    462       phraseCounter++;
    463 
    464       // feedback
    465       if (verbosity) {
    466     if (phraseCounter % 1000 == 0) {
    467       tmpString = p.toString();
    468       cout << "phrase " << phraseCounter << ": "<< "start " << start
    469            << ", length " << length << " - " << tmpString << endl;
    470       delete [] tmpString;
    471     }
    472       }
    473 
    474     }
    475 
    476     inPhrase.close();
    477     outPhrase.close();
    478   }
    479    
    480   phraseData.close();
    481   deletePhraseMemory();
    482 
    483   delete [] documentFrequency;
    484   delete [] symbols;
    485   delete [] suffixArray;
    486   delete [] prefixArray;
    487   delete [] suffixCheck;
    488   delete [] prefixCheck;
    489   delete [] documentArray;
    490 
    491 
    492  
    493   cout << endl << "Done: " << phraseCounter << " phrases in " << phraseDataName << endl;
    494   return 0;
    495 }
    496 
    497 
    498 // Get Minimal Expansions
    499 //
    500 // Get the set of "minimal" expansions of a phrase p, using the
    501 // algorithm described in the documentation.
    502 //
    503 // Returns a vector of Expansions.
    504 
    505 void getMinimalExpansions(Phrase &p, vector<Phrase> &results) {
    506 
    507   // 1. Initialise the result and candiate vectors
    508   vector<Phrase> candidates;
    509   for (cellcount j = 0; j < inputLength; j++) {
    510     suffixCheck[j] = 0;
    511     prefixCheck[j] = 0;
    512   }
    513 
    514   // 2. Expand the phrase p
    515 
    516   // 2.1 Create the candidate set
    517   p.initialSuffixCandidates(candidates);
    518   p.initialPrefixCandidates(candidates);
    519 
    520   // 2.2 Sort the candidates by phrase length
    521   make_heap(candidates.begin(), candidates.end(), isLonger);
    522 
    523   // 3. While candidates is non-empty, confirm the phrases it
    524   //    contains, expanding them as required
    525   while (!candidates.empty()) {
    526 
    527     // 3.1 Get next candidate
    528     pop_heap(candidates.begin(), candidates.end(), isLonger);
    529     Phrase c = candidates.back();
    530     candidates.pop_back();
    531 
    532     // 3.2 If we know there are no unique right extensions
    533     //     (i.e. this is a phrase drawn from the suffix array)
    534     if (!c.hasUniqueSuffixExtension()) {
    535      
    536       c.ensurePrefixFound();
    537 
    538       // 3.2.1 Ignore candidate if we have used a subphrase instead
    539       if (suffixCheck[c.firstSuffixIndex] || prefixCheck[c.firstPrefixIndex]) {
    540     // cout << "ignoring" << endl;
    541       }
    542 
    543       // 3.2.2 If candidate has a unique left (prefix) extension,
    544       //       Then extend it and add it back into Candidates.
    545       else if (c.hasUniquePrefixExtension()) {
    546     // cout << "expanding prefix " << c.toString() << "=> ";
    547     c.expandUniquePrefixExtensionByOne();
    548     candidates.push_back(c);
    549     push_heap(candidates.begin(), candidates.end(), isLonger);
    550      }
    551    
    552       // 3.2.3 If candidate has no unique left (prefix) extension,
    553       //       Then add it to the list of results.
    554       else {
    555     // cout << "no unique prefix, add to results" << endl;
    556     results.push_back(c);
    557     for (cellcount i = c.firstSuffixIndex; i <= c.lastSuffixIndex; i++) {
    558       suffixCheck[i] = c.length;
    559     }
    560     for (cellcount ik = c.firstPrefixIndex; ik <= c.lastPrefixIndex; ik++) {
    561       prefixCheck[ik] = c.length;
    562     }
    563       }
    564     }
    565 
    566     // 3.3 If we know there are no unique left extensions,
    567     //     Then fdo the same as for 3.2 but exchange suffix & prefix
    568     else if (!c.hasUniquePrefixExtension()) {
    569      
    570       c.ensureSuffixFound();
    571 
    572       // 3.3.1
    573       if (suffixCheck[c.firstSuffixIndex] || prefixCheck[c.firstPrefixIndex]) {
    574 
    575       }
    576 
    577       // 3.3.2
    578       else if (c.hasUniqueSuffixExtension()) {
    579     c.expandUniqueSuffixExtensionByOne();
    580     candidates.push_back(c);
    581     push_heap(candidates.begin(), candidates.end(), isLonger);
    582       }
    583    
    584       // 3.3.3
    585       else {
    586     results.push_back(c);
    587     for (cellcount i = c.firstSuffixIndex; i <= c.lastSuffixIndex; i++) {
    588       suffixCheck[i] = c.length;
    589     }
    590     for (cellcount ijk = c.firstPrefixIndex; ijk <= c.lastPrefixIndex; ijk++) {
    591       prefixCheck[ijk] = c.length;
    592     }
    593 
    594       }
     163    if (!shorter_found) {
     164      results.push_back(*candidate);
     165      candidate->ensureSuffixFound();
     166      for (cellcount k = candidate->firstSuffixIndex; k <= candidate->lastSuffixIndex; ++k)
     167    suffixCheck[k] = candidate->length;
    595168    }
    596169  }
     
    751324// Given a phrase, what documents does it occur in?
    752325
    753 cellcount getDocumentOccurrances(Phrase &p, cellcount *frequency) {
    754 
    755   // cout << "searching for \""<< p.toString() << "\" in documents "
     326cellcount getDocumentOccurrances(const Phrase &p, cellcount *frequency) {
     327
     328  // cout << "searching for \""<< p << "\" in documents "
    756329  //      << 0 << "-" << numberOfDocuments - 1 << endl;
    757330
     
    1038611}
    1039612
     613
    1040614bool isLongPhraseStored(cellindex index, cellcount length) {
    1041615
     
    1079653}
    1080654
     655
    1081656void deleteLongPhraseMemory() {
    1082657  // remove the hash & other files
     
    1090665
    1091666
    1092 
    1093 
    1094667// Read the collection statistics file
     668//
    1095669void readStatistics() {
    1096670
     
    1129703}
    1130704
    1131 
    1132 
    1133 
    1134 
     705cellcount getContentCount(symbol firstContent) {
     706
     707  cellcount content=0;
     708  for (cellcount i=0; i<inputLength; i++) {
     709    if (symbols[i]>=firstContent) content++;
     710  }
     711
     712  return content;
     713}
     714
     715int main (int argc, char * argv[]) {
     716
     717  // Command-line arguments
     718  // argv[1] is the phindex directory
     719  // argv[2] is the maximum array symbol length (optional)
     720  // argv[3] is the mode, where 1 is stopword mode (optional)
     721  if (argc < 2) {
     722    cerr << "Usage: " << argv[0] << " phind-directory mode [verbosity]" << endl;
     723    exit(1);
     724  }
     725
     726  // collection directory
     727  strcpy(collection, argv[1]);
     728
     729  // mode parameter
     730  phraseMode = atoi(argv[2]);
     731  assert((phraseMode == STOPWORDS) || (phraseMode == ANYPHRASE));
     732
     733  // optional verbosity parameter
     734  if (argc == 4) {
     735    verbosity = atoi(argv[3]);
     736    assert (verbosity >= 0);
     737  }
     738
     739  if (verbosity) {
     740    cout << "suffix: the phrase extraction program" << endl;
     741  }
     742
     743  if (verbosity > 1) {
     744    if (phraseMode == STOPWORDS) {
     745      cout << "Stopwords mode: no phrase may begin or end with a stopword" << endl;
     746    } else {
     747      cout << "AllPhrase mode: extract every phrase that occurs more than once" << endl;
     748    }
     749  }
     750
     751  // Read the statistics file
     752  readStatistics();
     753
     754  // Read the numbers file
     755  readNumbers();
     756
     757  if (numberOfDocuments == 0) {
     758    cerr << "There are no documents in this collection!" << endl;
     759    exit(1);
     760  }
     761
     762  symbol firstContent;
     763  if (phraseMode==STOPWORDS) firstContent=firstContentSymbol;
     764  else firstContent = firstStopSymbol;
     765
     766  cellcount contentLength = 0;
     767  contentLength = getContentCount(firstContent);
     768
     769  // Create the suffix & prefix arrays
     770  suffixArray = new symbol *[contentLength];
     771  prefixArray = new symbol *[contentLength];
     772
     773  cellcount here=0;
     774  // Initialise prefix and suffix arrays, only use the needed suffixes
     775  for (cellcount j = 0; j < inputLength; j++) {
     776    if (symbols[j]>=firstContent) {
     777      suffixArray[here] = &symbols[j];
     778      prefixArray[here] = &symbols[j];
     779      here++;
     780    }   
     781  }
     782  qsort(suffixArray, contentLength, sizeof(symbol *), suffixCompare);
     783  qsort(prefixArray, contentLength, sizeof(symbol *), prefixCompare);
     784
     785  suffixCheck = new check[contentLength];
     786  if (suffixCheck == NULL) {
     787    cerr << "Suffix error: not enough memory to hold " << inputLength << " symbols." << endl;
     788    exit(2);
     789  } 
     790  for (cellcount j = 0; j < contentLength; j++)
     791    suffixCheck[j] = 0;
     792
     793  cout <<"\ngenerating the phrase hierarchy\n\n";
     794 
     795  // Create the document arrays
     796  if (verbosity > 1) {
     797    cout << "Allocating document arrays for " << numberOfDocuments << " documents" << endl;
     798  }
     799
      800  // The document frequency array is used to count the number of times
     801  // each phrase occurs in each document.  The number of documents in
     802  // which a phrase occurs is stored in df.
     803  frequency *documentFrequency = new frequency[numberOfDocuments];
     804  frequency df;
     805
     806  // documentArray will be searched in order to discover which document
     807  // each phrase occurs in.
     808  documentArray = new symbol *[numberOfDocuments]; 
     809
     810  // just scan through the input text to find the doc starts
     811  cellindex d = 0;
     812  for (cellindex i=0; i<inputLength; i++) {
     813    if (symbols[i] == DOCUMENTSTART) {
     814      documentArray[d] = &symbols[i];
     815      d++;
     816    }
     817  }
     818
     819  // the phrases stuff is expecting inputLength to be the length of the
     820  // suffix array, so change it.
     821  inputLength = contentLength;
     822
     823  // Extract phrases
     824  //
      825  // We will make several passes over the data, in each case considering
     826  // a set of input phrases and generating a set of output phrases, which
      827  // we will expand in later passes.
     828  //
     829  // The input phrases in the first pass will be the vocabulary.
     830  // In later passes, the input phrases will be the output phrases of the
     831  // previous pass.
     832  //
     833  // In each pass we will consider each input phrase in turn.  If we
     834  // have seen it before, we will ignore it.  Otherwise, we will expand
     835  // it and add its expansions to the set of output phrases.
     836
     837  // Store the phrase data in the phrases file
     838  char phraseDataName[FILENAME_MAX];
     839  sprintf(phraseDataName, "%s/phrases", collection);
     840  ofstream phraseData(phraseDataName, ios::out);
     841  if (!phraseData) {
     842    cout << "File " << phraseDataName << " could not be opened\n";
     843    exit(1);
     844  }
     845
     846  // Count the number of phrases output
     847  unsigned long int phraseCounter = 0;
     848
     849  // Set up the phrase expansion memory.
     850  // We need this so that we don't expand a phrase more than once
     851  initialisePhraseMemory();
     852
      853  // The current pass number
     854  int phrasePass = 1;
     855
     856
     857  // PASS NUMBER 1
     858  if (verbosity > 1) {
     859    cout << "Starting pass " << phrasePass << endl;
     860  }
     861
     862  ofstream outPhrase;
     863  char     outPhraseName[FILENAME_MAX];
     864  unsigned long int outPhraseCounter = 0;
     865
     866  // On the first pass, simply work through the vocabulary
     867  sprintf(outPhraseName, "%s/outPhrase.1", collection);
     868  outPhrase.open(outPhraseName, ios::out);
     869  if (!outPhrase) {
     870    cerr << "File " << outPhraseName << " could not be opened\n";
     871    exit(1);
     872  }
     873
     874  // Iterate over the different symbols by working through the suffix array
     875  vector<Phrase> result;
     876  cellindex ij = 0;
     877  char *tmpString;
     878
     879  Phrase p;
     880  while (ij < inputLength) {
     881
     882    // make a new phrase of length 1
     883    p = Phrase(suffixArray[ij], 1, SUFFIX);
     884    p.findFirstAndLastSuffix(ij, inputLength-1);
     885
     886    // We ignore this symbol if it occurs only once, if it is a delimiter,
      887    // or if we are in stopwords mode and it is a stopword
     888    // - in this new version, only need to check freq
     889    // We could imagine a new mode/command-line option, which is like
      890    // STOPWORDS but without this restriction.  This would let you browse
     891    // from "the" to "the AGRIS" for example, but not from "AGRIS" to
     892    // "the AGRIS" (where the is a stopword and AGRIS a content word).
     893    // The system used to work like this; it is easy to implement, but
     894    // it explodes the size of the indexes.  So: would it be useful? 
     895    if (p.suffixFrequency > 1) {
     896      // Get minimal expansions of the phrase
     897      getExpansions(p, result);
     898     
     899      if (!result.empty()) {
     900   
     901    // Remember that we have expanded this phrase
     902    rememberThisPhrase(ij, 1);
     903
     904    // write the phrase text
     905    phraseData << ij << "-1:" << p << ":" << p.suffixFrequency << ":"
     906           << result.size() << ":";
     907
     908    // write the results
     909    for (cellcount k = 0; k < result.size(); k++) {
     910      if (k) {
     911        phraseData << ",";
     912      }
     913      phraseData << result[k].firstSuffixIndex << "-" << result[k].length;
     914      outPhrase << result[k].firstSuffixIndex << " " << result[k].length << endl;
     915      outPhraseCounter++;
     916    }
     917    result.clear();
     918   
     919    // Write the documents in which this phrase occurs
     920    df = getDocumentOccurrances(p, documentFrequency);
     921    phraseData << ":" << df << ":";
     922
     923    // write the documents
     924    for (cellcount m = 0, first = 1; m < numberOfDocuments; m++) {
     925      if (documentFrequency[m]) {
     926        if (first) {
     927          first = 0;
     928        } else {
     929          phraseData << ";";
     930        }
     931        // Output the document number.  Note that here we've numbered the
     932        // N documents from 0 to N-1, but later they'll be 1-N.  Thus we
     933        // add 1 to the document id when we output it.
     934        phraseData << "d" << (m+1);
     935        // Next, output the frequency with which the document occurs, but
     936        // only if it is > 1.
     937        if (documentFrequency[m] > 1) {
     938          phraseData << "," << documentFrequency[m];
     939        }
     940      }
     941    }
     942
     943    phraseData << endl;
     944    phraseCounter++;
     945
     946    // feedback
     947    if (verbosity) {
     948      if (phraseCounter % 1000 == 0) {
     949        cout << "phrase " << phraseCounter << ": "
     950         << "cell " << p.firstSuffixIndex << " - " << p << endl;
     951      }
     952    }
     953      }
     954    }
     955   ij = p.lastSuffixIndex + 1;
     956  }
     957  outPhrase.close();
     958
     959  // REMAINING PASSES
     960  // The previous outPhrase file forms the input to each new pass
     961  cellcount start, length;
     962  while (outPhraseCounter > 0) {
     963
     964    // Start a new pass
     965    phrasePass++;
     966    if (verbosity) {
     967      cout << "Starting pass " << phrasePass << endl;
     968    }
     969
     970    // Open the input file
     971    char inPhraseName[FILENAME_MAX];
     972    sprintf(inPhraseName, "%s/outPhrase.%d", collection, phrasePass - 1);
     973    ifstream inPhrase (inPhraseName, ios::in);
     974    if (!inPhrase) {
     975      cerr << "File " << inPhraseName << " could not be opened\n";
     976      exit(1);
     977    }
     978
     979    // Open the output file
     980    sprintf(outPhraseName, "%s/outPhrase.%d", collection, phrasePass);
     981    outPhrase.open(outPhraseName, ios::out);
     982    if (!outPhrase) {
     983      cerr << "File " << outPhraseName << " could not be opened\n";
     984      exit(1);
     985    }
     986    outPhraseCounter = 0;
     987
     988    // Process each phrase
     989    while(inPhrase >> start >> length) {
     990
     991      // Ignore the phrase if we have expanded it before
     992      if (isPhraseStored(start, length))
     993    continue;
     994
     995      // Remember that we have examined this phrase
     996      rememberThisPhrase(start, length);
     997
      998      // Find the phrase in the suffix array
     999      p = Phrase(suffixArray[start], length, SUFFIX);
     1000      p.findFirstAndLastSuffix(start, inputLength-1);
     1001
     1002      // Ignore the phrase if it only occurs once
     1003      if (p.suffixFrequency < 2)
     1004    continue;
     1005
     1006      // Write the phrase text;
     1007      phraseData << start << "-" << length << ":" << p << ":" << p.suffixFrequency << ":";
     1008   
     1009      // Expand the phrase, if it is fewer than 8 words long
     1010      if (length <= 8) {
     1011
     1012    // Get the minimal expansions for this phrase
     1013    getExpansions(p, result);
     1014     
     1015    // write the results
     1016    phraseData << result.size() << ":";
     1017
     1018    for (cellcount i = 0; i < result.size(); i++) {
     1019      if (i) {
     1020        phraseData << ",";
     1021      }
     1022      phraseData << result[i].firstSuffixIndex << "-" << result[i].length;
     1023      outPhrase << result[i].firstSuffixIndex << " " << result[i].length << endl;
     1024      outPhraseCounter++;
     1025    }
     1026    result.clear();
     1027   
     1028      } else {
     1029    // phrase is too long to expand further
     1030    phraseData << "0:";
     1031      }
     1032
     1033      // Write the documents in which this phrase occurs
     1034      df = getDocumentOccurrances(p, documentFrequency);
     1035      phraseData << ":" << df << ":";
     1036
     1037      // write the documents
     1038      for (cellcount i = 0, first = 1; i < numberOfDocuments; i++) {
     1039    if (documentFrequency[i]) {
     1040      if (first) {
     1041        first = 0;
     1042      } else {
     1043        phraseData << ";";
     1044      }
     1045      // Output the document number.  Note that here we've numbered the
     1046      // N documents from 0 to N-1, but later they'll be 1-N.  Thus we
     1047      // add 1 to the document id when we output it.
     1048      phraseData << "d" << (i+1);
     1049      // Next, output the frequency with which the document occurs, but
     1050      // only if it is > 1.
     1051      if (documentFrequency[i] > 1) {
     1052        phraseData << "," << documentFrequency[i];
     1053      }
     1054    }
     1055      }
     1056     
     1057      phraseData << endl;
     1058      phraseCounter++;
     1059
     1060      // feedback
     1061      if (verbosity) {
     1062    if (phraseCounter % 1000 == 0) {
     1063      cout << "phrase " << phraseCounter << ": "<< "start " << start
     1064           << ", length " << length << " - " << p << endl;
     1065    }
     1066      }
     1067
     1068    }
     1069
     1070    inPhrase.close();
     1071    outPhrase.close();
     1072  }
     1073   
     1074  phraseData.close();
     1075  deletePhraseMemory();
     1076
     1077  delete [] documentFrequency;
     1078  delete [] symbols;
     1079  delete [] suffixArray;
     1080  delete [] prefixArray;
     1081  delete [] suffixCheck;
     1082  delete [] documentArray;
     1083
     1084
     1085 
     1086  cout << endl << "Done: " << phraseCounter << " phrases in " << phraseDataName << endl;
     1087  return 0;
     1088}
     1089
     1090
     1091
     1092
     1093
Note: See TracChangeset for help on using the changeset viewer.