Changeset 2694 for trunk/gsdl/src


Timestamp: 2001-08-12T10:28:09+12:00
Author: paynter
Message:

Various improvements to the Phrase code, including new copy constructors,
inlining frequently used code, and an operator<< function.

Location: trunk/gsdl/src/phind/generate
Files: 3 edited

Legend:

In the diffs below, lines beginning with '+' were added in this changeset, lines beginning with '-' were removed, unmarked lines are unchanged context, and '...' marks skipped unchanged code.
  • trunk/gsdl/src/phind/generate/phrase.cpp (diff from r2674 to r2694)

 
 
+Phrase::Phrase(const Phrase &p) {
+  forward = p.forward;
+  back = p.back;
+  length = p.length;
+
+  suffixFound = p.suffixFound;
+  prefixFound = p.prefixFound;
+
+  firstSuffix = p.firstSuffix;
+  lastSuffix  = p.lastSuffix;
+  firstSuffixIndex = p.firstSuffixIndex;
+  lastSuffixIndex  = p.lastSuffixIndex;
+  suffixFrequency  = p.suffixFrequency;
+
+  firstPrefix = p.firstPrefix;
+  lastPrefix  = p.lastPrefix;
+  firstPrefixIndex = p.firstPrefixIndex;
+  lastPrefixIndex  = p.lastPrefixIndex;
+  prefixFrequency  = p.prefixFrequency;
+
+  uniqueSuffixExtension = p.uniqueSuffixExtension;
+  uniquePrefixExtension = p.uniquePrefixExtension;
+}
+
+
 // Empty the contents of a phrase
 
...
   firstSuffix = firstPrefix = NULL;
   lastSuffix = lastPrefix = NULL;
-  suffixFrequency = prefixFrequency = 0;
+
+  firstSuffixIndex = lastSuffixIndex = suffixFrequency = 0;
+  firstPrefixIndex = lastPrefixIndex = prefixFrequency = 0;
+
   uniqueSuffixExtension = uniquePrefixExtension = -1;
 
...
 int Phrase::clearSuffix() {
   suffixFound = 0;
-  firstSuffix = NULL;
-  lastSuffix = NULL;
-  suffixFrequency = 0;
+  firstSuffix = lastSuffix = NULL;
+  firstSuffixIndex = lastSuffixIndex = suffixFrequency = 0;
   uniqueSuffixExtension = -1;
   return 0;
...
 int Phrase::clearPrefix() {
   prefixFound = 0;
-  firstPrefix = NULL;
-  lastPrefix = NULL;
-  prefixFrequency = 0;
+  firstPrefix = lastPrefix = NULL;
+  firstPrefixIndex = lastPrefixIndex = prefixFrequency = 0;
   uniquePrefixExtension = -1;
   return 0;
...
 
 
+// Output a phrase to a stream
+std::ostream &operator<<(std::ostream &stream, const Phrase &phrase)
+{
+  assert(phrase.forward);
+  symbol *s = phrase.forward;
+
+  stream << "s" << *s++;
+  for (cellcount i = 1; i < phrase.length; i++)
+    stream << " s" << *s++;
+
+  return stream;
+}
+
 
 // Convert the phrase to a string
-char *Phrase::toString() {
-
+// Note that you have to delete the memory yourself.
+char *Phrase::toString()
+{
   assert(forward);
 
...
 
 
-// Ensure that the phrase has been found in the suffix & prefix arrays
-
-int Phrase::ensureSuffixFound() {
-  if (!suffixFound) {
-    findFirstAndLastSuffix();
-  }
-  return 0;
-}
-
-int Phrase::ensurePrefixFound() {
-  if (!prefixFound) {
-    findFirstAndLastPrefix();
-  }
-  return 0;
-}
-
-
 // Calculate a set of initial suffix/prefix candidates
 //
...
 // and add them to the end of the results vector
 
-int Phrase::initialSuffixCandidates(vector<Phrase> &results) {
+void Phrase::initialSuffixCandidates(vector<Phrase> &results) {
 
   ensureSuffixFound();
...
     // Move onto the next expansion
     i = next.lastSuffixIndex + 1;
-
-  }
-  return 0;
-}
-
-
-int Phrase::initialPrefixCandidates(vector<Phrase> &results) {
+  }
+}
+
+
+void Phrase::initialPrefixCandidates(vector<Phrase> &results) {
 
   ensurePrefixFound();
...
     // Move onto the next expansion
     i = next.lastPrefixIndex + 1;
-
-  }
-  return 0;
+  }
 }
 
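
The new copy constructor and operator<< added above change how calling code can handle Phrase objects; suffix2.cpp below switches its output from p.toString() to streaming p directly for exactly this reason. A minimal usage sketch, assuming the Phrase interface shown in the phrase.h diff that follows (the words buffer and its length are illustrative placeholders, not repository code):

  #include <iostream>
  #include "phrase.h"

  void printPhrase(symbol *words, cellcount n) {
    Phrase p(words, n, SUFFIX);     // "partial" constructor from an array of words
    Phrase q(p);                    // new copy constructor: copies the cached suffix/prefix state too

    std::cout << q << std::endl;    // new operator<<: writes "s<symbol> s<symbol> ..." to the stream

    char *text = q.toString();      // toString() still allocates; the caller owns the buffer...
    std::cout << text << std::endl;
    delete [] text;                 // ...and must delete [] it, as the new comment warns
  }

Streaming avoids the allocate/print/delete cycle that toString() requires, which is what lets suffix2.cpp drop most of its tmpString bookkeeping.
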
  • trunk/gsdl/src/phind/generate/phrase.h (diff from r2674 to r2694)

 public:
 
-  // The phrase itself is stored with two pointers: forward
-  // points to its first cell, back points to its last.
-  // The length is always stored in length.
-  // If one of these is set, all must be set, and it must
-  // be true that (forward + length - 1) = back
+  // The phrase itself is stored with two pointers: forward points to
+  // its first cell, back points to its last.  The length is always
+  // stored in length.  If one of these is set, all must be set, and
+  // it must be true that (forward + length - 1) = back.
   symbol *forward;
   symbol *back;
...
 
   // Constructor functions
-  // First argument is an array of words, second is the length of
-  // the phrase, third is the direction (SUFFIX or PREFIX) in
-  // which the words should be read (defaults to forwards).
+  Phrase();
+  Phrase(const Phrase &p);
+
+  // A "partial" constructor: the first argument is an array of words,
+  // second is its length, third is the direction (SUFFIX or PREFIX)
+  // in which the words should be read (defaults to SUFFIX).
   Phrase(symbol *words, cellcount size, int direction);
 
-  // An empty phrase can be created without arguments, but is
-  // good for nothing and may not be used with any public functions.
-  // We therefore only use it internally.
-  Phrase();
-
-  // Represent the phrase as a string
+  // Represent the phrase as an array of characters.
+  // You will have to call "delete []" on the array returned.
   char *toString();
 
   // Find an initial set of candidate phrases in the suffix/prefix array
-  int initialSuffixCandidates(vector<Phrase> &results);
-  int initialPrefixCandidates(vector<Phrase> &results);
+  void initialSuffixCandidates(vector<Phrase> &results);
+  void initialPrefixCandidates(vector<Phrase> &results);
 
   // Does the phrase have a unique extension?
...
 
   // Make sure the phrase location in the suffix/prefix array is known
-  int ensureSuffixFound();
-  int ensurePrefixFound();
+  inline void ensureSuffixFound() {
+    if (!suffixFound)
+      findFirstAndLastSuffix();
+  }
+  inline void ensurePrefixFound() {
+    if (!prefixFound)
+      findFirstAndLastPrefix();
+  }
+
+  // Output a phrase to a stream
+  friend std::ostream &operator<<(std::ostream &stream, const Phrase &phrase);
 
 private:
...
 
 #endif
+
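
phrase.h now declares initialSuffixCandidates() and initialPrefixCandidates() as void (matching the phrase.cpp change above) and moves ensureSuffixFound()/ensurePrefixFound() into the header as inline, lazy lookups. A sketch of the calling pattern these signatures imply, assuming phrase.h compiles on its own and that vector<Phrase> is whatever vector type the header itself uses:

  #include "phrase.h"

  void collectInitialCandidates(Phrase &p, vector<Phrase> &out) {
    p.ensureSuffixFound();            // inline: finds the suffix range only if not already known
                                      // (initialSuffixCandidates() calls this itself; shown here
                                      // only to illustrate the inline guard)
    p.initialSuffixCandidates(out);   // appends initial candidates from the suffix array
    p.initialPrefixCandidates(out);   // likewise for the prefix direction
  }

The old int versions of these four methods always returned 0, so the void signatures simply drop a value that no caller used.
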
  • trunk/gsdl/src/phind/generate/suffix2.cpp (diff from r2673 to r2694)

 symbol   *symbols;
 symbol  **suffixArray;
+symbol  **prefixArray;
 check    *suffixCheck;
-symbol  **prefixArray;
+
 
 // How many documents are in this collection?
...
 symbol  **documentArray;
 
+
 // Do we accept any phrase, or do we eliminate those ending with stopwords ?
 int phraseMode = ANYPHRASE; //STOPWORDS;
 
+
 // The filestem of the collection's phindex directory
 char collection[FILENAME_MAX];
 
-int suffixCompare(const void *, const void *);
-int prefixCompare(const void *, const void *);
-int pointerCompare(const void *, const void *);
-
-int readNumbers();
-void readStatistics();
-
-void getExpansions(Phrase &p, vector<Phrase> &results);
-cellcount getDocumentOccurrances(Phrase &p, cellcount *frequency);
 
 // The ranges of the stopword and content-word symbols for the collection
...
 
 
-
-
-// Phrase memory
-// We have to "remember" each phrase that we've expanded
+// Some useful comparison functions, defined below.
+int suffixCompare(const void *, const void *);
+int prefixCompare(const void *, const void *);
+int pointerCompare(const void *, const void *);
+
+
+// Functions for implementing "phrase memory".  These let us "remember"
+// each phrase that we've expanded without using too much memory.
 void initialisePhraseMemory();
 void rememberThisPhrase(cellindex index, cellcount length);
...
 
 
-int main (int argc, char * argv[]) {
-
-  // Command-line arguments
-  // argv[1] is the phindex directory
-  // argv[2] is the maximum array symbol length (optional)
-  // argv[3] is the mode, where 1 is stopword mode (optional)
-  if (argc < 2) {
-    cerr << "Usage: " << argv[0] << " phind-directory mode [verbosity]" << endl;
-    exit(1);
-  }
-
-  // collection directory
-  strcpy(collection, argv[1]);
-
-  // mode parameter
-  phraseMode = atoi(argv[2]);
-  assert((phraseMode == STOPWORDS) || (phraseMode == ANYPHRASE));
-
-  // optional verbosity parameter
-  if (argc == 4) {
-    verbosity = atoi(argv[3]);
-    assert (verbosity >= 0);
-  }
-
-  if (verbosity) {
-    cout << "suffix2: the simpler phrase extraction program" << endl;
-  }
-
-  if (verbosity > 1) {
-    if (phraseMode == STOPWORDS) {
-      cout << "Stopwords mode: no phrase may begin or end with a stopword" << endl;
-    } else {
-      cout << "AllPhrase mode: extract every phrase that occurs more than once" << endl;
-    }
-  }
-
-  // Read the statistics file
-  readStatistics();
-
-  // Read the numbers file
-  readNumbers();
-
-  // Create the suffix & prefix arrays
-  suffixArray = new symbol *[inputLength];
-  prefixArray = new symbol *[inputLength];
-  suffixCheck = new check[inputLength];
-  if (suffixCheck == NULL) {
-    cerr << "Suffix2 error: not enough memory to hold " << inputLength << " symbols." << endl;
-    exit(2);
-  }
-
-  // Initialise prefix and suffix arrays
-  for (cellcount j = 0; j < inputLength; j++) {
-    suffixArray[j] = &symbols[j];
-    prefixArray[j] = &symbols[j];
-  }
-  qsort(suffixArray, inputLength, sizeof(symbol *), suffixCompare);
-  qsort(prefixArray, inputLength, sizeof(symbol *), prefixCompare);
-
-
-  // Create the document arrays
-  if (numberOfDocuments == 0) {
-    cerr << "There are no documents in this collection!" << endl;
-    exit(1);
-  }
-  if (verbosity > 1) {
-    cout << "Allocating document arrays for " << numberOfDocuments << " documents" << endl;
-  }
-
-  // The document frequency array is used to count the number of times
-  // each phrase occurs in each document.  The number of documents in
-  // which a phrase occurs is stored in df.
-  frequency *documentFrequency = new frequency[numberOfDocuments];
-  frequency df;
-
-  // documentArray will be searched in order to discover which document
-  // each phrase occurs in.
-  documentArray = new symbol *[numberOfDocuments];
-
-  // Discover all the DOCUMENTSTART symbols and store as a phrase
-  cellindex d = 0;
-  while (*suffixArray[d] != DOCUMENTSTART) {
-    d++;
-  }
-  Phrase p(suffixArray[d], 1, SUFFIX);
-  p.findFirstAndLastSuffix(d, inputLength-1);
-
-  // Insert the document locations time (as pointers) into documentArray
-  for (cellcount i = 0; i < p.suffixFrequency; i++) {
-    documentArray[i] = suffixArray[i + p.firstSuffixIndex];
-  }
-
-  // Sort the document array into ascending order of raw pointer value
-  qsort(documentArray, numberOfDocuments, sizeof(symbol *), pointerCompare);
-
-
-  // Extract phrases
-  //
-  // We will make several passes over the data, in each case considering
-  // a set of input phrases and generating a set of output phrases, which
-  // we will expand in later passes.
-  //
-  // The input phrases in the first pass will be the vocabulary.
-  // In later passes, the input phrases will be the output phrases of the
-  // previous pass.
-  //
-  // In each pass we will consider each input phrase in turn.  If we
-  // have seen it before, we will ignore it.  Otherwise, we will expand
-  // it and add its expansions to the set of output phrases.
-
-  // Store the phrase data in the phrases file
-  char phraseDataName[FILENAME_MAX];
-  sprintf(phraseDataName, "%s/phrases", collection);
-  ofstream phraseData(phraseDataName, ios::out);
-  if (!phraseData) {
-    cout << "File " << phraseDataName << " could not be opened\n";
-    exit(1);
-  }
-
-  // Count the number of phrases output
-  unsigned long int phraseCounter = 0;
-
-  // Set up the phrase expansion memory.
-  // We need this so that we don't expand a phrase more than once
-  initialisePhraseMemory();
-
-  // The current pass number
-  int phrasePass = 1;
-
-
-  // PASS NUMBER 1
-  if (verbosity > 1) {
-    cout << "Starting pass " << phrasePass << endl;
-  }
-
-  ofstream outPhrase;
-  char     outPhraseName[FILENAME_MAX];
-  unsigned long int outPhraseCounter = 0;
-
-  // On the first pass, simply work through the vocabulary
-  sprintf(outPhraseName, "%s/outPhrase.1", collection);
-  outPhrase.open(outPhraseName, ios::out);
-  if (!outPhrase) {
-    cerr << "File " << outPhraseName << " could not be opened\n";
-    exit(1);
-  }
-
-  // Iterate over the different symbols by working through the suffix array
-  vector<Phrase> result;
-  cellindex ij = 0;
-  char *tmpString;
-
-  while (ij < inputLength) {
-
-    // make a new phrase of length 1
-    p = Phrase(suffixArray[ij], 1, SUFFIX);
-    p.findFirstAndLastSuffix(ij, inputLength-1);
-
-    // cout << "cell " << ij << " - " << p.toString() << endl;
-
-    // We ignore this symbol if it occurs only once, if it is a delimiter,
-    // or if we are in stopwords mode and it is a stopword
-    //
-    // We could imagine a new mode/command-line option, which is like
-    // STOPWORDS but without this restriction.  This would let you browse
-    // from "the" to "the AGRIS" for example, but not from "AGRIS" to
-    // "the AGRIS" (where the is a stopword and AGRIS a content word).
-    // The system used to work like this; it is easy to implement, but
-    // it explodes the size of the indexes.  So: would it be useful?
-    if (!((p.suffixFrequency <= 1) ||
-      // (*suffixArray[ij] != 23054) ||
-      (*suffixArray[ij] <= LASTDELIMITER) ||
-      ((phraseMode == STOPWORDS) && (*suffixArray[ij] <= lastStopSymbol)))) {
-
-      // Get minimal expansions of the phrase
-      getExpansions(p, result);
-
-      if (!result.empty()) {
-
-    // Remember that we have expanded this phrase
-    rememberThisPhrase(ij, 1);
-
-    // write the phrase text
-    tmpString = p.toString();
-    phraseData << ij << "-1:" << tmpString << ":" << p.suffixFrequency << ":"
-           << result.size() << ":";
-    delete [] tmpString;
-
-    // write the results
-    for (cellcount k = 0; k < result.size(); k++) {
-      if (k) {
-        phraseData << ",";
-      }
-      phraseData << result[k].firstSuffixIndex << "-" << result[k].length;
-      outPhrase << result[k].firstSuffixIndex << " " << result[k].length << endl;
-      outPhraseCounter++;
-    }
-    result.clear();
-
-    // Write the documents in which this phrase occurs
-    df = getDocumentOccurrances(p, documentFrequency);
-    phraseData << ":" << df << ":";
-
-    // write the documents
-    for (cellcount m = 0, first = 1; m < numberOfDocuments; m++) {
-      if (documentFrequency[m]) {
-        if (first) {
-          first = 0;
-        } else {
-          phraseData << ";";
-        }
-        // Output the document number.  Note that here we've numbered the
-        // N documents from 0 to N-1, but later they'll be 1-N.  Thus we
-        // add 1 to the document id when we output it.
-        phraseData << "d" << (m+1);
-        // Next, output the frequency with which the document occurs, but
-        // only if it is > 1.
-        if (documentFrequency[m] > 1) {
-          phraseData << "," << documentFrequency[m];
-        }
-      }
-    }
-
-    phraseData << endl;
-    phraseCounter++;
-
-    // feedback
-    if (verbosity) {
-      if (phraseCounter % 1000 == 0) {
-        tmpString = p.toString();
-        cout << "phrase " << phraseCounter << ": "
-         << "cell " << p.firstSuffixIndex << " - " << tmpString << endl;
-        delete [] tmpString;
-      }
-    }
-      }
-    }
-   ij = p.lastSuffixIndex + 1;
-  }
-  outPhrase.close();
-
-  // REMAINING PASSES
-  // The previous outPhrase file forms the input to each new pass
-  cellcount start, length;
-  while (outPhraseCounter > 0) {
-
-    // Start a new pass
-    phrasePass++;
-    if (verbosity) {
-      cout << "Starting pass " << phrasePass << endl;
-    }
-
-    // Open the input file
-    char inPhraseName[FILENAME_MAX];
-    sprintf(inPhraseName, "%s/outPhrase.%d", collection, phrasePass - 1);
-    ifstream inPhrase (inPhraseName, ios::in);
-    if (!inPhrase) {
-      cerr << "File " << inPhraseName << " could not be opened\n";
-      exit(1);
-    }
-
-    // Open the output file
-    sprintf(outPhraseName, "%s/outPhrase.%d", collection, phrasePass);
-    outPhrase.open(outPhraseName, ios::out);
-    if (!outPhrase) {
-      cerr << "File " << outPhraseName << " could not be opened\n";
-      exit(1);
-    }
-    outPhraseCounter = 0;
-
-    // Process each phrase
-    while(inPhrase >> start >> length) {
-
-      // Ignore the phrase if we have expanded it before
-      if (isPhraseStored(start, length)) {
-    continue;
-      }
-
-      // Remember that we have examined this phrase
-      rememberThisPhrase(start, length);
-
-      // Find the phrase in the suffixarray
-      p = Phrase(suffixArray[start], length, SUFFIX);
-      p.findFirstAndLastSuffix(start, inputLength-1);
-
-      // cout << "index " << start << ", length " << length << " - "  <<  p.toString() << endl;
-
-
-      // Ignore the phrase if it only occurs once
-      if (p.suffixFrequency < 2) {
-    continue;
-      }
-
-
-      // Write the phrase text  tmpString = p.toString();
-      tmpString = p.toString();
-      phraseData << start << "-" << length << ":" << tmpString << ":"
-         << p.suffixFrequency << ":";
-      delete [] tmpString;
-
-
-      // Expand the phrase, if it is fewer than 8 words long
-      if (length <= 8) {
-
-    // Get the minimal expansions for this phrase
-    getExpansions(p, result);
-
-    // write the results
-    phraseData << result.size() << ":";
-
-    for (cellcount i = 0; i < result.size(); i++) {
-      if (i) {
-        phraseData << ",";
-      }
-      phraseData << result[i].firstSuffixIndex << "-" << result[i].length;
-      outPhrase << result[i].firstSuffixIndex << " " << result[i].length << endl;
-      outPhraseCounter++;
-    }
-    result.clear();
-
-      } else {
-    // phrase is too long to expand further
-    phraseData << "0:";
-      }
-
-
-      // Write the documents in which this phrase occurs
-      df = getDocumentOccurrances(p, documentFrequency);
-      phraseData << ":" << df << ":";
-
-      // write the documents
-      for (cellcount i = 0, first = 1; i < numberOfDocuments; i++) {
-    if (documentFrequency[i]) {
-      if (first) {
-        first = 0;
-      } else {
-        phraseData << ";";
-      }
-      // Output the document number.  Note that here we've numbered the
-      // N documents from 0 to N-1, but later they'll be 1-N.  Thus we
-      // add 1 to the document id when we output it.
-      phraseData << "d" << (i+1);
-      // Next, output the frequency with which the document occurs, but
-      // only if it is > 1.
-      if (documentFrequency[i] > 1) {
-        phraseData << "," << documentFrequency[i];
-      }
-    }
-      }
-
-      phraseData << endl;
-      phraseCounter++;
-
-      // feedback
-      if (verbosity) {
-    if (phraseCounter % 1000 == 0) {
-      tmpString = p.toString();
-      cout << "phrase " << phraseCounter << ": "<< "start " << start
-           << ", length " << length << " - " << tmpString << endl;
-      delete [] tmpString;
-    }
-      }
-
-    }
-
-    inPhrase.close();
-    outPhrase.close();
-  }
-
-  phraseData.close();
-  deletePhraseMemory();
-
-  delete [] documentFrequency;
-  delete [] symbols;
-  delete [] suffixArray;
-  delete [] prefixArray;
-  delete [] suffixCheck;
-  delete [] documentArray;
-
-
-
-  cout << endl << "Done: " << phraseCounter << " phrases in " << phraseDataName << endl;
-  return 0;
-}
-
-
-
-
-
 // Get a phrase's expansions
 //
...
 
   // 2. Ensure maximality: expand each initial candidate both right and left
-  for (vector<Phrase>::iterator i = candidates.begin(); i != candidates.end(); i++) {
+  for (vector<Phrase>::iterator i = candidates.begin(); i != candidates.end(); ++i) {
     // We should be able to optimise this given we've already expanded in one direction
     i->expandWhileUniquePrefixExtension();
...
     return;
 
-  for (cellcount j = 0; j < inputLength; j++) {
+  // Initialise the candidates, check array, and various variables.
+  sort(candidates.begin(), candidates.end(), isShorter);
+  for (cellcount j = 0; j < inputLength; j++)
     suffixCheck[j] = 0;
-  }
-  sort(candidates.begin(), candidates.end(), isShorter);
-
-  unsigned minimum_length = p.length + 1;
-  if (candidates.begin()->length > minimum_length)
-    minimum_length = candidates.begin()->length;
+  unsigned minimum_length = candidates.begin()->length;
 
   // Try to add each candidate to the results set, ignoring the non-minimal
-  for (vector<Phrase>::iterator candidate = candidates.begin(); candidate != candidates.end(); candidate++) {
-
-    // cerr << "* candidate of length " << candidate->length << ": (" << candidate->toString() << ")\n";
-
-    // Make a copy of candidate that we will mutilate while performing sub-phrase checks
-    Phrase temp_phrase(candidate->forward, candidate->length, SUFFIX);
+  for (vector<Phrase>::iterator candidate = candidates.begin();
+       candidate != candidates.end(); candidate++) {
+
+    // Make a copy of candidate to mutilate while performing sub-phrase checks
+    Phrase temp_phrase(*candidate);
     bool shorter_found = false;
 
+    // Check for shorter and shorter versions of the temporary phrase
     while (temp_phrase.length >= minimum_length && !shorter_found) {
       temp_phrase.ensureSuffixFound();
...
 
     if (!shorter_found) {
-      // cerr << "NOT FOUND! " << candidate->length << ": adding (" << candidate->toString() << ") : "
-      //      << candidate->firstSuffixIndex << "-" << candidate->lastSuffixIndex << "\n";
       results.push_back(*candidate);
       candidate->ensureSuffixFound();
-      for (cellcount k = candidate->firstSuffixIndex; k <= candidate->lastSuffixIndex; k++) {
+      for (cellcount k = candidate->firstSuffixIndex; k <= candidate->lastSuffixIndex; ++k)
     suffixCheck[k] = candidate->length;
-      }
     }
   }
...
 // Given a phrase, what documents does it occur in?
 
-cellcount getDocumentOccurrances(Phrase &p, cellcount *frequency) {
-
-  // cout << "searching for \""<< p.toString() << "\" in documents "
+cellcount getDocumentOccurrances(const Phrase &p, cellcount *frequency) {
+
+  // cout << "searching for \""<< p << "\" in documents "
   //      << 0 << "-" << numberOfDocuments - 1 << endl;
 
...
 }
 
+
 bool isLongPhraseStored(cellindex index, cellcount length) {
 
...
 }
 
+
 void deleteLongPhraseMemory() {
   // remove the hash & other files
...
 
 
-
-
 // Read the collection statistics file
+//
 void readStatistics() {
 
...
 
 
-
-
-
+int main (int argc, char * argv[]) {
+
+  // Command-line arguments
+  // argv[1] is the phindex directory
+  // argv[2] is the maximum array symbol length (optional)
+  // argv[3] is the mode, where 1 is stopword mode (optional)
+  if (argc < 2) {
+    cerr << "Usage: " << argv[0] << " phind-directory mode [verbosity]" << endl;
+    exit(1);
+  }
+
+  // collection directory
+  strcpy(collection, argv[1]);
+
+  // mode parameter
+  phraseMode = atoi(argv[2]);
+  assert((phraseMode == STOPWORDS) || (phraseMode == ANYPHRASE));
+
+  // optional verbosity parameter
+  if (argc == 4) {
+    verbosity = atoi(argv[3]);
+    assert (verbosity >= 0);
+  }
+
+  if (verbosity) {
+    cout << "suffix2: the simpler phrase extraction program" << endl;
+  }
+
+  if (verbosity > 1) {
+    if (phraseMode == STOPWORDS) {
+      cout << "Stopwords mode: no phrase may begin or end with a stopword" << endl;
+    } else {
+      cout << "AllPhrase mode: extract every phrase that occurs more than once" << endl;
+    }
+  }
+
+  // Read the statistics file
+  readStatistics();
+
+  // Read the numbers file
+  readNumbers();
+
+  // Create the suffix & prefix arrays
+  suffixArray = new symbol *[inputLength];
+  prefixArray = new symbol *[inputLength];
+  suffixCheck = new check[inputLength];
+  if (suffixCheck == NULL) {
+    cerr << "Suffix2 error: not enough memory to hold " << inputLength << " symbols." << endl;
+    exit(2);
+  }
+
+  // Initialise prefix and suffix arrays
+  for (cellcount j = 0; j < inputLength; j++) {
+    suffixArray[j] = &symbols[j];
+    prefixArray[j] = &symbols[j];
+  }
+  qsort(suffixArray, inputLength, sizeof(symbol *), suffixCompare);
+  qsort(prefixArray, inputLength, sizeof(symbol *), prefixCompare);
+
+
+  // Create the document arrays
+  if (numberOfDocuments == 0) {
+    cerr << "There are no documents in this collection!" << endl;
+    exit(1);
+  }
+  if (verbosity > 1) {
+    cout << "Allocating document arrays for " << numberOfDocuments << " documents" << endl;
+  }
+
+  // The document frequency array is used to count the number of times
+  // each phrase occurs in each document.  The number of documents in
+  // which a phrase occurs is stored in df.
+  frequency *documentFrequency = new frequency[numberOfDocuments];
+  frequency df;
+
+  // documentArray will be searched in order to discover which document
+  // each phrase occurs in.
+  documentArray = new symbol *[numberOfDocuments];
+
+  // Discover all the DOCUMENTSTART symbols and store as a phrase
+  cellindex d = 0;
+  while (*suffixArray[d] != DOCUMENTSTART) {
+    d++;
+  }
+  Phrase p(suffixArray[d], 1, SUFFIX);
+  p.findFirstAndLastSuffix(d, inputLength-1);
+
+  // Insert the document locations time (as pointers) into documentArray
+  for (cellcount i = 0; i < p.suffixFrequency; i++) {
+    documentArray[i] = suffixArray[i + p.firstSuffixIndex];
+  }
+
+  // Sort the document array into ascending order of raw pointer value
+  qsort(documentArray, numberOfDocuments, sizeof(symbol *), pointerCompare);
+
+
+  // Extract phrases
+  //
+  // We will make several passes over the data, in each case considering
+  // a set of input phrases and generating a set of output phrases, which
+  // we will expand in later passes.
+  //
+  // The input phrases in the first pass will be the vocabulary.
+  // In later passes, the input phrases will be the output phrases of the
+  // previous pass.
+  //
+  // In each pass we will consider each input phrase in turn.  If we
+  // have seen it before, we will ignore it.  Otherwise, we will expand
+  // it and add its expansions to the set of output phrases.
+
+  // Store the phrase data in the phrases file
+  char phraseDataName[FILENAME_MAX];
+  sprintf(phraseDataName, "%s/phrases", collection);
+  ofstream phraseData(phraseDataName, ios::out);
+  if (!phraseData) {
+    cout << "File " << phraseDataName << " could not be opened\n";
+    exit(1);
+  }
+
+  // Count the number of phrases output
+  unsigned long int phraseCounter = 0;
+
+  // Set up the phrase expansion memory.
+  // We need this so that we don't expand a phrase more than once
+  initialisePhraseMemory();
+
+  // The current pass number
+  int phrasePass = 1;
+
+
+  // PASS NUMBER 1
+  if (verbosity > 1) {
+    cout << "Starting pass " << phrasePass << endl;
+  }
+
+  ofstream outPhrase;
+  char     outPhraseName[FILENAME_MAX];
+  unsigned long int outPhraseCounter = 0;
+
+  // On the first pass, simply work through the vocabulary
+  sprintf(outPhraseName, "%s/outPhrase.1", collection);
+  outPhrase.open(outPhraseName, ios::out);
+  if (!outPhrase) {
+    cerr << "File " << outPhraseName << " could not be opened\n";
+    exit(1);
+  }
+
+  // Iterate over the different symbols by working through the suffix array
+  vector<Phrase> result;
+  cellindex ij = 0;
+  char *tmpString;
+
+  while (ij < inputLength) {
+
+    // make a new phrase of length 1
+    p = Phrase(suffixArray[ij], 1, SUFFIX);
+    p.findFirstAndLastSuffix(ij, inputLength-1);
+
+    // We ignore this symbol if it occurs only once, if it is a delimiter,
+    // or if we are in stopwords mode and it is a stopword
+    //
+    // We could imagine a new mode/command-line option, which is like
+    // STOPWORDS but without this restriction.  This would let you browse
+    // from "the" to "the AGRIS" for example, but not from "AGRIS" to
+    // "the AGRIS" (where the is a stopword and AGRIS a content word).
+    // The system used to work like this; it is easy to implement, but
+    // it explodes the size of the indexes.  So: would it be useful?
+    if (!((p.suffixFrequency <= 1) ||
+      (*suffixArray[ij] <= LASTDELIMITER) ||
+      ((phraseMode == STOPWORDS) && (*suffixArray[ij] <= lastStopSymbol)))) {
+
+      // Get minimal expansions of the phrase
+      getExpansions(p, result);
+
+      if (!result.empty()) {
+
+    // Remember that we have expanded this phrase
+    rememberThisPhrase(ij, 1);
+
+    // write the phrase text
+    phraseData << ij << "-1:" << p << ":" << p.suffixFrequency << ":"
+           << result.size() << ":";
+
+    // write the results
+    for (cellcount k = 0; k < result.size(); k++) {
+      if (k) {
+        phraseData << ",";
+      }
+      phraseData << result[k].firstSuffixIndex << "-" << result[k].length;
+      outPhrase << result[k].firstSuffixIndex << " " << result[k].length << endl;
+      outPhraseCounter++;
+    }
+    result.clear();
+
+    // Write the documents in which this phrase occurs
+    df = getDocumentOccurrances(p, documentFrequency);
+    phraseData << ":" << df << ":";
+
+    // write the documents
+    for (cellcount m = 0, first = 1; m < numberOfDocuments; m++) {
+      if (documentFrequency[m]) {
+        if (first) {
+          first = 0;
+        } else {
+          phraseData << ";";
+        }
+        // Output the document number.  Note that here we've numbered the
+        // N documents from 0 to N-1, but later they'll be 1-N.  Thus we
+        // add 1 to the document id when we output it.
+        phraseData << "d" << (m+1);
+        // Next, output the frequency with which the document occurs, but
+        // only if it is > 1.
+        if (documentFrequency[m] > 1) {
+          phraseData << "," << documentFrequency[m];
+        }
+      }
+    }
+
+    phraseData << endl;
+    phraseCounter++;
+
+    // feedback
+    if (verbosity) {
+      if (phraseCounter % 1000 == 0) {
+        cout << "phrase " << phraseCounter << ": "
+         << "cell " << p.firstSuffixIndex << " - " << p << endl;
+      }
+    }
+      }
+    }
+   ij = p.lastSuffixIndex + 1;
+  }
+  outPhrase.close();
+
+  // REMAINING PASSES
+  // The previous outPhrase file forms the input to each new pass
+  cellcount start, length;
+  while (outPhraseCounter > 0) {
+
+    // Start a new pass
+    phrasePass++;
+    if (verbosity) {
+      cout << "Starting pass " << phrasePass << endl;
+    }
+
+    // Open the input file
+    char inPhraseName[FILENAME_MAX];
+    sprintf(inPhraseName, "%s/outPhrase.%d", collection, phrasePass - 1);
+    ifstream inPhrase (inPhraseName, ios::in);
+    if (!inPhrase) {
+      cerr << "File " << inPhraseName << " could not be opened\n";
+      exit(1);
+    }
+
+    // Open the output file
+    sprintf(outPhraseName, "%s/outPhrase.%d", collection, phrasePass);
+    outPhrase.open(outPhraseName, ios::out);
+    if (!outPhrase) {
+      cerr << "File " << outPhraseName << " could not be opened\n";
+      exit(1);
+    }
+    outPhraseCounter = 0;
+
+    // Process each phrase
+    while(inPhrase >> start >> length) {
+
+      // Ignore the phrase if we have expanded it before
+      if (isPhraseStored(start, length))
+    continue;
+
+      // Remember that we have examined this phrase
+      rememberThisPhrase(start, length);
+
+      // Find the phrase in the suffixarray
+      p = Phrase(suffixArray[start], length, SUFFIX);
+      p.findFirstAndLastSuffix(start, inputLength-1);
+
+      // Ignore the phrase if it only occurs once
+      if (p.suffixFrequency < 2)
+    continue;
+
+      // Write the phrase text;
+      phraseData << start << "-" << length << ":" << p << ":" << p.suffixFrequency << ":";
+
+      // Expand the phrase, if it is fewer than 8 words long
+      if (length <= 8) {
+
+    // Get the minimal expansions for this phrase
+    getExpansions(p, result);
+
+    // write the results
+    phraseData << result.size() << ":";
+
+    for (cellcount i = 0; i < result.size(); i++) {
+      if (i) {
+        phraseData << ",";
+      }
+      phraseData << result[i].firstSuffixIndex << "-" << result[i].length;
+      outPhrase << result[i].firstSuffixIndex << " " << result[i].length << endl;
+      outPhraseCounter++;
+    }
+    result.clear();
+
+      } else {
+    // phrase is too long to expand further
+    phraseData << "0:";
+      }
+
+      // Write the documents in which this phrase occurs
+      df = getDocumentOccurrances(p, documentFrequency);
+      phraseData << ":" << df << ":";
+
+      // write the documents
+      for (cellcount i = 0, first = 1; i < numberOfDocuments; i++) {
+    if (documentFrequency[i]) {
+      if (first) {
+        first = 0;
+      } else {
+        phraseData << ";";
+      }
+      // Output the document number.  Note that here we've numbered the
+      // N documents from 0 to N-1, but later they'll be 1-N.  Thus we
+      // add 1 to the document id when we output it.
+      phraseData << "d" << (i+1);
+      // Next, output the frequency with which the document occurs, but
+      // only if it is > 1.
+      if (documentFrequency[i] > 1) {
+        phraseData << "," << documentFrequency[i];
+      }
+    }
+      }
+
+      phraseData << endl;
+      phraseCounter++;
+
+      // feedback
+      if (verbosity) {
+    if (phraseCounter % 1000 == 0) {
+      cout << "phrase " << phraseCounter << ": "<< "start " << start
+           << ", length " << length << " - " << p << endl;
+    }
+      }
+
+    }
+
+    inPhrase.close();
+    outPhrase.close();
+  }
+
+  phraseData.close();
+  deletePhraseMemory();
+
+  delete [] documentFrequency;
+  delete [] symbols;
+  delete [] suffixArray;
+  delete [] prefixArray;
+  delete [] suffixCheck;
+  delete [] documentArray;
+
+
+
+  cout << endl << "Done: " << phraseCounter << " phrases in " << phraseDataName << endl;
+  return 0;
+}
+
+
+
+
+
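
The relocated main() above keeps the same multi-pass control flow: pass 1 walks the suffix array and expands single symbols, and every later pass reads the previous pass's outPhrase.N file and expands those phrases until a pass produces no new output. A compressed sketch of just that outer loop, under the same file-naming scheme; the per-phrase work is abstracted behind a caller-supplied function, since it is not reproduced here:

  #include <fstream>
  #include <cstdio>

  // Stand-in for the real per-phrase work, which builds a Phrase from the
  // suffix array, calls getExpansions(), and writes the phrase record.
  // It returns how many expansion lines it appended to outPhrase.
  typedef unsigned long (*ExpandFn)(unsigned long start, unsigned long length,
                                    std::ofstream &outPhrase);

  void runRemainingPasses(const char *collection, unsigned long outPhraseCounter,
                          ExpandFn expandOne) {
    int phrasePass = 1;                      // pass 1 has already been run
    while (outPhraseCounter > 0) {           // stop when a pass emits nothing new
      phrasePass++;

      char inName[FILENAME_MAX], outName[FILENAME_MAX];
      sprintf(inName,  "%s/outPhrase.%d", collection, phrasePass - 1);
      sprintf(outName, "%s/outPhrase.%d", collection, phrasePass);

      std::ifstream inPhrase(inName);
      std::ofstream outPhrase(outName);
      outPhraseCounter = 0;

      unsigned long start, length;
      while (inPhrase >> start >> length) {
        // The real loop also skips phrases it has already remembered and
        // phrases longer than 8 words before expanding.
        outPhraseCounter += expandOne(start, length, outPhrase);
      }
    }
  }

Each outPhrase.N file is just whitespace-separated "start length" pairs, which is why the loop can read it back with operator>>.
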