Changeset 2694 for trunk/gsdl/src


Timestamp: 2001-08-12T10:28:09+12:00
Author: paynter
Message:

Various improvements to the Phrase code, including new copy constructors,
inlining frequently used code, and an operator<< function.

Location: trunk/gsdl/src/phind/generate
Files: 3 edited

Legend:

In the diffs below, lines beginning with '+' were added in this changeset, lines beginning with '-' were removed, unmarked lines are unchanged context, and '...' marks skipped unchanged code.
  • trunk/gsdl/src/phind/generate/phrase.cpp (diff from r2674 to r2694)

 
 
+Phrase::Phrase(const Phrase &p) {
+  forward = p.forward;
+  back = p.back;
+  length = p.length;
+
+  suffixFound = p.suffixFound;
+  prefixFound = p.prefixFound;
+
+  firstSuffix = p.firstSuffix;
+  lastSuffix  = p.lastSuffix;
+  firstSuffixIndex = p.firstSuffixIndex;
+  lastSuffixIndex  = p.lastSuffixIndex;
+  suffixFrequency  = p.suffixFrequency;
+
+  firstPrefix = p.firstPrefix;
+  lastPrefix  = p.lastPrefix;
+  firstPrefixIndex = p.firstPrefixIndex;
+  lastPrefixIndex  = p.lastPrefixIndex;
+  prefixFrequency  = p.prefixFrequency;
+
+  uniqueSuffixExtension = p.uniqueSuffixExtension;
+  uniquePrefixExtension = p.uniquePrefixExtension;
+}
+
+
 // Empty the contents of a phrase
 
...
   firstSuffix = firstPrefix = NULL;
   lastSuffix = lastPrefix = NULL;
-  suffixFrequency = prefixFrequency = 0;
+
+  firstSuffixIndex = lastSuffixIndex = suffixFrequency = 0;
+  firstPrefixIndex = lastPrefixIndex = prefixFrequency = 0;
+
   uniqueSuffixExtension = uniquePrefixExtension = -1;
 
...
 int Phrase::clearSuffix() {
   suffixFound = 0;
-  firstSuffix = NULL;
-  lastSuffix = NULL;
-  suffixFrequency = 0;
+  firstSuffix = lastSuffix = NULL;
+  firstSuffixIndex = lastSuffixIndex = suffixFrequency = 0;
   uniqueSuffixExtension = -1;
   return 0;
...
 int Phrase::clearPrefix() {
   prefixFound = 0;
-  firstPrefix = NULL;
-  lastPrefix = NULL;
-  prefixFrequency = 0;
+  firstPrefix = lastPrefix = NULL;
+  firstPrefixIndex = lastPrefixIndex = prefixFrequency = 0;
   uniquePrefixExtension = -1;
   return 0;
...
 
 
+// Output a phrase to a stream
+std::ostream &operator<<(std::ostream &stream, const Phrase &phrase)
+{
+  assert(phrase.forward);
+  symbol *s = phrase.forward;
+
+  stream << "s" << *s++;
+  for (cellcount i = 1; i < phrase.length; i++)
+    stream << " s" << *s++;
+
+  return stream;
+}
+
 
 // Convert the phrase to a string
-char *Phrase::toString() {
-
+// Note that you have to delete the memory yourself.
+char *Phrase::toString()
+{
   assert(forward);
 
...
 
 
-// Ensure that the phrase has been found in the suffix & prefix arrays
-
-int Phrase::ensureSuffixFound() {
-  if (!suffixFound) {
-    findFirstAndLastSuffix();
-  }
-  return 0;
-}
-
-int Phrase::ensurePrefixFound() {
-  if (!prefixFound) {
-    findFirstAndLastPrefix();
-  }
-  return 0;
-}
-
-
 // Calculate a set of initial suffix/prefix candidates
 //
...
 // and add them to the end of the results vector
 
-int Phrase::initialSuffixCandidates(vector<Phrase> &results) {
+void Phrase::initialSuffixCandidates(vector<Phrase> &results) {
 
   ensureSuffixFound();
...
     // Move onto the next expansion
     i = next.lastSuffixIndex + 1;
-
-  }
-  return 0;
-}
-
-
-int Phrase::initialPrefixCandidates(vector<Phrase> &results) {
+  }
+}
+
+
+void Phrase::initialPrefixCandidates(vector<Phrase> &results) {
 
   ensurePrefixFound();
...
     // Move onto the next expansion
     i = next.lastPrefixIndex + 1;
-
-  }
-  return 0;
+  }
 }
 
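
The new copy constructor and operator<< added above change how calling code can handle Phrase objects; suffix2.cpp below switches its output from p.toString() to streaming p directly for exactly this reason. A minimal usage sketch, assuming the Phrase interface shown in the phrase.h diff that follows (the words buffer and its length are illustrative placeholders, not repository code):

  #include <iostream>
  #include "phrase.h"

  void printPhrase(symbol *words, cellcount n) {
    Phrase p(words, n, SUFFIX);     // "partial" constructor from an array of words
    Phrase q(p);                    // new copy constructor: copies the cached suffix/prefix state too

    std::cout << q << std::endl;    // new operator<<: writes "s<symbol> s<symbol> ..." to the stream

    char *text = q.toString();      // toString() still allocates; the caller owns the buffer...
    std::cout << text << std::endl;
    delete [] text;                 // ...and must delete [] it, as the new comment warns
  }

Streaming avoids the allocate/print/delete cycle that toString() requires, which is what lets suffix2.cpp drop most of its tmpString bookkeeping.
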
  • trunk/gsdl/src/phind/generate/phrase.h (diff from r2674 to r2694)

 public:
 
-  // The phrase itself is stored with two pointers: forward
-  // points to its first cell, back points to its last.
-  // The length is always stored in length.
-  // If one of these is set, all must be set, and it must
-  // be true that (forward + length - 1) = back
+  // The phrase itself is stored with two pointers: forward points to
+  // its first cell, back points to its last.  The length is always
+  // stored in length.  If one of these is set, all must be set, and
+  // it must be true that (forward + length - 1) = back.
   symbol *forward;
   symbol *back;
...
 
   // Constructor functions
-  // First argument is an array of words, second is the length of
-  // the phrase, third is the direction (SUFFIX or PREFIX) in
-  // which the words should be read (defaults to forwards).
+  Phrase();
+  Phrase(const Phrase &p);
+
+  // A "partial" constructor: the first argument is an array of words,
+  // second is its length, third is the direction (SUFFIX or PREFIX)
+  // in which the words should be read (defaults to SUFFIX).
   Phrase(symbol *words, cellcount size, int direction);
 
-  // An empty phrase can be created without arguments, but is
-  // good for nothing and may not be used with any public functions.
-  // We therefore only use it internally.
-  Phrase();
-
-  // Represent the phrase as a string
+  // Represent the phrase as an array of characters.
+  // You will have to call "delete []" on the array returned.
   char *toString();
 
   // Find an initial set of candidate phrases in the suffix/prefix array
-  int initialSuffixCandidates(vector<Phrase> &results);
-  int initialPrefixCandidates(vector<Phrase> &results);
+  void initialSuffixCandidates(vector<Phrase> &results);
+  void initialPrefixCandidates(vector<Phrase> &results);
 
   // Does the phrase have a unique extension?
...
 
   // Make sure the phrase location in the suffix/prefix array is known
-  int ensureSuffixFound();
-  int ensurePrefixFound();
+  inline void ensureSuffixFound() {
+    if (!suffixFound)
+      findFirstAndLastSuffix();
+  }
+  inline void ensurePrefixFound() {
+    if (!prefixFound)
+      findFirstAndLastPrefix();
+  }
+
+  // Output a phrase to a stream
+  friend std::ostream &operator<<(std::ostream &stream, const Phrase &phrase);
 
 private:
...
 
 #endif
+
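
phrase.h now declares initialSuffixCandidates() and initialPrefixCandidates() as void (matching the phrase.cpp change above) and moves ensureSuffixFound()/ensurePrefixFound() into the header as inline, lazy lookups. A sketch of the calling pattern these signatures imply, assuming phrase.h compiles on its own and that vector<Phrase> is whatever vector type the header itself uses:

  #include "phrase.h"

  void collectInitialCandidates(Phrase &p, vector<Phrase> &out) {
    p.ensureSuffixFound();            // inline: finds the suffix range only if not already known
                                      // (initialSuffixCandidates() calls this itself; shown here
                                      // only to illustrate the inline guard)
    p.initialSuffixCandidates(out);   // appends initial candidates from the suffix array
    p.initialPrefixCandidates(out);   // likewise for the prefix direction
  }

The old int versions of these four methods always returned 0, so the void signatures simply drop a value that no caller used.
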
  • trunk/gsdl/src/phind/generate/suffix2.cpp (diff from r2673 to r2694)

 symbol   *symbols;
 symbol  **suffixArray;
+symbol  **prefixArray;
 check    *suffixCheck;
-symbol  **prefixArray;
+
 
 // How many documents are in this collection?
...
 symbol  **documentArray;
 
+
 // Do we accept any phrase, or do we eliminate those ending with stopwords ?
 int phraseMode = ANYPHRASE; //STOPWORDS;
 
+
 // The filestem of the collection's phindex directory
 char collection[FILENAME_MAX];
 
-int suffixCompare(const void *, const void *);
-int prefixCompare(const void *, const void *);
-int pointerCompare(const void *, const void *);
-
-int readNumbers();
-void readStatistics();
-
-void getExpansions(Phrase &p, vector<Phrase> &results);
-cellcount getDocumentOccurrances(Phrase &p, cellcount *frequency);
 
 // The ranges of the stopword and content-word symbols for the collection
...
 
 
-
-
-// Phrase memory
-// We have to "remember" each phrase that we've expanded
+// Some useful comparison functions, defined below.
+int suffixCompare(const void *, const void *);
+int prefixCompare(const void *, const void *);
+int pointerCompare(const void *, const void *);
+
+
+// Functions for implementing "phrase memory".  These let us "remember"
+// each phrase that we've expanded without using too much memory.
 void initialisePhraseMemory();
 void rememberThisPhrase(cellindex index, cellcount length);
...
 
 
-int main (int argc, char * argv[]) {
-
-  // Command-line arguments
-  // argv[1] is the phindex directory
-  // argv[2] is the maximum array symbol length (optional)
-  // argv[3] is the mode, where 1 is stopword mode (optional)
-  if (argc < 2) {
-    cerr << "Usage: " << argv[0] << " phind-directory mode [verbosity]" << endl;
-    exit(1);
-  }
-
-  // collection directory
-  strcpy(collection, argv[1]);
-
-  // mode parameter
-  phraseMode = atoi(argv[2]);
-  assert((phraseMode == STOPWORDS) || (phraseMode == ANYPHRASE));
-
-  // optional verbosity parameter
-  if (argc == 4) {
-    verbosity = atoi(argv[3]);
-    assert (verbosity >= 0);
-  }
-
-  if (verbosity) {
-    cout << "suffix2: the simpler phrase extraction program" << endl;
-  }
-
-  if (verbosity > 1) {
-    if (phraseMode == STOPWORDS) {
-      cout << "Stopwords mode: no phrase may begin or end with a stopword" << endl;
-    } else {
-      cout << "AllPhrase mode: extract every phrase that occurs more than once" << endl;
-    }
-  }
-
-  // Read the statistics file
-  readStatistics();
-
-  // Read the numbers file
-  readNumbers();
-
-  // Create the suffix & prefix arrays
-  suffixArray = new symbol *[inputLength];
-  prefixArray = new symbol *[inputLength];
-  suffixCheck = new check[inputLength];
-  if (suffixCheck == NULL) {
-    cerr << "Suffix2 error: not enough memory to hold " << inputLength << " symbols." << endl;
-    exit(2);
-  }
-
-  // Initialise prefix and suffix arrays
-  for (cellcount j = 0; j < inputLength; j++) {
-    suffixArray[j] = &symbols[j];
-    prefixArray[j] = &symbols[j];
-  }
-  qsort(suffixArray, inputLength, sizeof(symbol *), suffixCompare);
-  qsort(prefixArray, inputLength, sizeof(symbol *), prefixCompare);
-
-
-  // Create the document arrays
-  if (numberOfDocuments == 0) {
-    cerr << "There are no documents in this collection!" << endl;
-    exit(1);
-  }
-  if (verbosity > 1) {
-    cout << "Allocating document arrays for " << numberOfDocuments << " documents" << endl;
-  }
-
-  // The document frequency array is used to count the number of times
-  // each phrase occurs in each document.  The number of documents in
-  // which a phrase occurs is stored in df.
-  frequency *documentFrequency = new frequency[numberOfDocuments];
-  frequency df;
-
-  // documentArray will be searched in order to discover which document
-  // each phrase occurs in.
-  documentArray = new symbol *[numberOfDocuments];
-
-  // Discover all the DOCUMENTSTART symbols and store as a phrase
-  cellindex d = 0;
-  while (*suffixArray[d] != DOCUMENTSTART) {
-    d++;
-  }
-  Phrase p(suffixArray[d], 1, SUFFIX);
-  p.findFirstAndLastSuffix(d, inputLength-1);
-
-  // Insert the document locations time (as pointers) into documentArray
-  for (cellcount i = 0; i < p.suffixFrequency; i++) {
-    documentArray[i] = suffixArray[i + p.firstSuffixIndex];
-  }
-
-  // Sort the document array into ascending order of raw pointer value
-  qsort(documentArray, numberOfDocuments, sizeof(symbol *), pointerCompare);
-
-
-  // Extract phrases
-  //
-  // We will make several passes over the data, in each case considering
-  // a set of input phrases and generating a set of output phrases, which
-  // we will expand in later passes.
-  //
-  // The input phrases in the first pass will be the vocabulary.
-  // In later passes, the input phrases will be the output phrases of the
-  // previous pass.
-  //
-  // In each pass we will consider each input phrase in turn.  If we
-  // have seen it before, we will ignore it.  Otherwise, we will expand
-  // it and add its expansions to the set of output phrases.
-
-  // Store the phrase data in the phrases file
-  char phraseDataName[FILENAME_MAX];
-  sprintf(phraseDataName, "%s/phrases", collection);
-  ofstream phraseData(phraseDataName, ios::out);
-  if (!phraseData) {
-    cout << "File " << phraseDataName << " could not be opened\n";
-    exit(1);
-  }
-
-  // Count the number of phrases output
-  unsigned long int phraseCounter = 0;
-
-  // Set up the phrase expansion memory.
-  // We need this so that we don't expand a phrase more than once
-  initialisePhraseMemory();
-
-  // The current pass number
-  int phrasePass = 1;
-
-
-  // PASS NUMBER 1
-  if (verbosity > 1) {
-    cout << "Starting pass " << phrasePass << endl;
-  }
-
-  ofstream outPhrase;
-  char     outPhraseName[FILENAME_MAX];
-  unsigned long int outPhraseCounter = 0;
-
-  // On the first pass, simply work through the vocabulary
-  sprintf(outPhraseName, "%s/outPhrase.1", collection);
-  outPhrase.open(outPhraseName, ios::out);
-  if (!outPhrase) {
-    cerr << "File " << outPhraseName << " could not be opened\n";
-    exit(1);
-  }
-
-  // Iterate over the different symbols by working through the suffix array
-  vector<Phrase> result;
-  cellindex ij = 0;
-  char *tmpString;
-
-  while (ij < inputLength) {
-
-    // make a new phrase of length 1
-    p = Phrase(suffixArray[ij], 1, SUFFIX);
-    p.findFirstAndLastSuffix(ij, inputLength-1);
-
-    // cout << "cell " << ij << " - " << p.toString() << endl;
-
-    // We ignore this symbol if it occurs only once, if it is a delimiter,
-    // or if we are in stopwords mode and it is a stopword
-    //
-    // We could imagine a new mode/command-line option, which is like
-    // STOPWORDS but without this restriction.  This would let you browse
-    // from "the" to "the AGRIS" for example, but not from "AGRIS" to
-    // "the AGRIS" (where the is a stopword and AGRIS a content word).
-    // The system used to work like this; it is easy to implement, but
-    // it explodes the size of the indexes.  So: would it be useful?
-    if (!((p.suffixFrequency <= 1) ||
-      // (*suffixArray[ij] != 23054) ||
-      (*suffixArray[ij] <= LASTDELIMITER) ||
-      ((phraseMode == STOPWORDS) && (*suffixArray[ij] <= lastStopSymbol)))) {
-
-      // Get minimal expansions of the phrase
-      getExpansions(p, result);
-
-      if (!result.empty()) {
-
-    // Remember that we have expanded this phrase
-    rememberThisPhrase(ij, 1);
-
-    // write the phrase text
-    tmpString = p.toString();
-    phraseData << ij << "-1:" << tmpString << ":" << p.suffixFrequency << ":"
-           << result.size() << ":";
-    delete [] tmpString;
-
-    // write the results
-    for (cellcount k = 0; k < result.size(); k++) {
-      if (k) {
-        phraseData << ",";
-      }
-      phraseData << result[k].firstSuffixIndex << "-" << result[k].length;
-      outPhrase << result[k].firstSuffixIndex << " " << result[k].length << endl;
-      outPhraseCounter++;
-    }
-    result.clear();
-
-    // Write the documents in which this phrase occurs
-    df = getDocumentOccurrances(p, documentFrequency);
-    phraseData << ":" << df << ":";
-
-    // write the documents
-    for (cellcount m = 0, first = 1; m < numberOfDocuments; m++) {
-      if (documentFrequency[m]) {
-        if (first) {
-          first = 0;
-        } else {
-          phraseData << ";";
-        }
-        // Output the document number.  Note that here we've numbered the
-        // N documents from 0 to N-1, but later they'll be 1-N.  Thus we
-        // add 1 to the document id when we output it.
-        phraseData << "d" << (m+1);
-        // Next, output the frequency with which the document occurs, but
-        // only if it is > 1.
-        if (documentFrequency[m] > 1) {
-          phraseData << "," << documentFrequency[m];
-        }
-      }
-    }
-
-    phraseData << endl;
-    phraseCounter++;
-
-    // feedback
-    if (verbosity) {
-      if (phraseCounter % 1000 == 0) {
-        tmpString = p.toString();
-        cout << "phrase " << phraseCounter << ": "
-         << "cell " << p.firstSuffixIndex << " - " << tmpString << endl;
-        delete [] tmpString;
-      }
-    }
-      }
-    }
-   ij = p.lastSuffixIndex + 1;
-  }
-  outPhrase.close();
-
-  // REMAINING PASSES
-  // The previous outPhrase file forms the input to each new pass
-  cellcount start, length;
-  while (outPhraseCounter > 0) {
-
-    // Start a new pass
-    phrasePass++;
-    if (verbosity) {
-      cout << "Starting pass " << phrasePass << endl;
-    }
-
-    // Open the input file
-    char inPhraseName[FILENAME_MAX];
-    sprintf(inPhraseName, "%s/outPhrase.%d", collection, phrasePass - 1);
-    ifstream inPhrase (inPhraseName, ios::in);
-    if (!inPhrase) {
-      cerr << "File " << inPhraseName << " could not be opened\n";
-      exit(1);
-    }
-
-    // Open the output file
-    sprintf(outPhraseName, "%s/outPhrase.%d", collection, phrasePass);
-    outPhrase.open(outPhraseName, ios::out);
-    if (!outPhrase) {
-      cerr << "File " << outPhraseName << " could not be opened\n";
-      exit(1);
-    }
-    outPhraseCounter = 0;
-
-    // Process each phrase
-    while(inPhrase >> start >> length) {
-
-      // Ignore the phrase if we have expanded it before
-      if (isPhraseStored(start, length)) {
-    continue;
-      }
-
-      // Remember that we have examined this phrase
-      rememberThisPhrase(start, length);
-
-      // Find the phrase in the suffixarray
-      p = Phrase(suffixArray[start], length, SUFFIX);
-      p.findFirstAndLastSuffix(start, inputLength-1);
-
-      // cout << "index " << start << ", length " << length << " - "  <<  p.toString() << endl;
-
-
-      // Ignore the phrase if it only occurs once
-      if (p.suffixFrequency < 2) {
-    continue;
-      }
-
-
-      // Write the phrase text  tmpString = p.toString();
-      tmpString = p.toString();
-      phraseData << start << "-" << length << ":" << tmpString << ":"
-         << p.suffixFrequency << ":";
-      delete [] tmpString;
-
-
-      // Expand the phrase, if it is fewer than 8 words long
-      if (length <= 8) {
-
-    // Get the minimal expansions for this phrase
-    getExpansions(p, result);
-
-    // write the results
-    phraseData << result.size() << ":";
-
-    for (cellcount i = 0; i < result.size(); i++) {
-      if (i) {
-        phraseData << ",";
-      }
-      phraseData << result[i].firstSuffixIndex << "-" << result[i].length;
-      outPhrase << result[i].firstSuffixIndex << " " << result[i].length << endl;
-      outPhraseCounter++;
-    }
-    result.clear();
-
-      } else {
-    // phrase is too long to expand further
-    phraseData << "0:";
-      }
-
-
-      // Write the documents in which this phrase occurs
-      df = getDocumentOccurrances(p, documentFrequency);
-      phraseData << ":" << df << ":";
-
-      // write the documents
-      for (cellcount i = 0, first = 1; i < numberOfDocuments; i++) {
-    if (documentFrequency[i]) {
-      if (first) {
-        first = 0;
-      } else {
-        phraseData << ";";
-      }
-      // Output the document number.  Note that here we've numbered the
-      // N documents from 0 to N-1, but later they'll be 1-N.  Thus we
-      // add 1 to the document id when we output it.
-      phraseData << "d" << (i+1);
-      // Next, output the frequency with which the document occurs, but
-      // only if it is > 1.
-      if (documentFrequency[i] > 1) {
-        phraseData << "," << documentFrequency[i];
-      }
-    }
-      }
-
-      phraseData << endl;
-      phraseCounter++;
-
-      // feedback
-      if (verbosity) {
-    if (phraseCounter % 1000 == 0) {
-      tmpString = p.toString();
-      cout << "phrase " << phraseCounter << ": "<< "start " << start
-           << ", length " << length << " - " << tmpString << endl;
-      delete [] tmpString;
-    }
-      }
-
-    }
-
-    inPhrase.close();
-    outPhrase.close();
-  }
-
-  phraseData.close();
-  deletePhraseMemory();
-
-  delete [] documentFrequency;
-  delete [] symbols;
-  delete [] suffixArray;
-  delete [] prefixArray;
-  delete [] suffixCheck;
-  delete [] documentArray;
-
-
-
-  cout << endl << "Done: " << phraseCounter << " phrases in " << phraseDataName << endl;
-  return 0;
-}
-
-
-
-
-
 // Get a phrase's expansions
 //
...
 
   // 2. Ensure maximality: expand each initial candidate both right and left
-  for (vector<Phrase>::iterator i = candidates.begin(); i != candidates.end(); i++) {
+  for (vector<Phrase>::iterator i = candidates.begin(); i != candidates.end(); ++i) {
     // We should be able to optimise this given we've already expanded in one direction
     i->expandWhileUniquePrefixExtension();
...
     return;
 
-  for (cellcount j = 0; j < inputLength; j++) {
+  // Initialise the candidates, check array, and various variables.
+  sort(candidates.begin(), candidates.end(), isShorter);
+  for (cellcount j = 0; j < inputLength; j++)
     suffixCheck[j] = 0;
-  }
-  sort(candidates.begin(), candidates.end(), isShorter);
-
-  unsigned minimum_length = p.length + 1;
-  if (candidates.begin()->length > minimum_length)
-    minimum_length = candidates.begin()->length;
+  unsigned minimum_length = candidates.begin()->length;
 
   // Try to add each candidate to the results set, ignoring the non-minimal
-  for (vector<Phrase>::iterator candidate = candidates.begin(); candidate != candidates.end(); candidate++) {
-
-    // cerr << "* candidate of length " << candidate->length << ": (" << candidate->toString() << ")\n";
-
-    // Make a copy of candidate that we will mutilate while performing sub-phrase checks
-    Phrase temp_phrase(candidate->forward, candidate->length, SUFFIX);
+  for (vector<Phrase>::iterator candidate = candidates.begin();
+       candidate != candidates.end(); candidate++) {
+
+    // Make a copy of candidate to mutilate while performing sub-phrase checks
+    Phrase temp_phrase(*candidate);
     bool shorter_found = false;
 
+    // Check for shorter and shorter versions of the temporary phrase
     while (temp_phrase.length >= minimum_length && !shorter_found) {
       temp_phrase.ensureSuffixFound();
...
 
     if (!shorter_found) {
-      // cerr << "NOT FOUND! " << candidate->length << ": adding (" << candidate->toString() << ") : "
-      //      << candidate->firstSuffixIndex << "-" << candidate->lastSuffixIndex << "\n";
       results.push_back(*candidate);
       candidate->ensureSuffixFound();
-      for (cellcount k = candidate->firstSuffixIndex; k <= candidate->lastSuffixIndex; k++) {
+      for (cellcount k = candidate->firstSuffixIndex; k <= candidate->lastSuffixIndex; ++k)
     suffixCheck[k] = candidate->length;
-      }
     }
   }
...
 // Given a phrase, what documents does it occur in?
 
-cellcount getDocumentOccurrances(Phrase &p, cellcount *frequency) {
-
-  // cout << "searching for \""<< p.toString() << "\" in documents "
+cellcount getDocumentOccurrances(const Phrase &p, cellcount *frequency) {
+
+  // cout << "searching for \""<< p << "\" in documents "
   //      << 0 << "-" << numberOfDocuments - 1 << endl;
 
...
 }
 
+
 bool isLongPhraseStored(cellindex index, cellcount length) {
 
...
 }
 
+
 void deleteLongPhraseMemory() {
   // remove the hash & other files
...
 
 
-
-
 // Read the collection statistics file
+//
 void readStatistics() {
 
...
 
 
-
-
-
+int main (int argc, char * argv[]) {
+
+  // Command-line arguments
+  // argv[1] is the phindex directory
+  // argv[2] is the maximum array symbol length (optional)
+  // argv[3] is the mode, where 1 is stopword mode (optional)
+  if (argc < 2) {
+    cerr << "Usage: " << argv[0] << " phind-directory mode [verbosity]" << endl;
+    exit(1);
+  }
+
+  // collection directory
+  strcpy(collection, argv[1]);
+
+  // mode parameter
+  phraseMode = atoi(argv[2]);
+  assert((phraseMode == STOPWORDS) || (phraseMode == ANYPHRASE));
+
+  // optional verbosity parameter
+  if (argc == 4) {
+    verbosity = atoi(argv[3]);
+    assert (verbosity >= 0);
+  }
+
+  if (verbosity) {
+    cout << "suffix2: the simpler phrase extraction program" << endl;
+  }
+
+  if (verbosity > 1) {
+    if (phraseMode == STOPWORDS) {
+      cout << "Stopwords mode: no phrase may begin or end with a stopword" << endl;
+    } else {
+      cout << "AllPhrase mode: extract every phrase that occurs more than once" << endl;
+    }
+  }
+
+  // Read the statistics file
+  readStatistics();
+
+  // Read the numbers file
+  readNumbers();
+
+  // Create the suffix & prefix arrays
+  suffixArray = new symbol *[inputLength];
+  prefixArray = new symbol *[inputLength];
+  suffixCheck = new check[inputLength];
+  if (suffixCheck == NULL) {
+    cerr << "Suffix2 error: not enough memory to hold " << inputLength << " symbols." << endl;
+    exit(2);
+  }
+
+  // Initialise prefix and suffix arrays
+  for (cellcount j = 0; j < inputLength; j++) {
+    suffixArray[j] = &symbols[j];
+    prefixArray[j] = &symbols[j];
+  }
+  qsort(suffixArray, inputLength, sizeof(symbol *), suffixCompare);
+  qsort(prefixArray, inputLength, sizeof(symbol *), prefixCompare);
+
+
+  // Create the document arrays
+  if (numberOfDocuments == 0) {
+    cerr << "There are no documents in this collection!" << endl;
+    exit(1);
+  }
+  if (verbosity > 1) {
+    cout << "Allocating document arrays for " << numberOfDocuments << " documents" << endl;
+  }
+
+  // The document frequency array is used to count the number of times
+  // each phrase occurs in each document.  The number of documents in
+  // which a phrase occurs is stored in df.
+  frequency *documentFrequency = new frequency[numberOfDocuments];
+  frequency df;
+
+  // documentArray will be searched in order to discover which document
+  // each phrase occurs in.
+  documentArray = new symbol *[numberOfDocuments];
+
+  // Discover all the DOCUMENTSTART symbols and store as a phrase
+  cellindex d = 0;
+  while (*suffixArray[d] != DOCUMENTSTART) {
+    d++;
+  }
+  Phrase p(suffixArray[d], 1, SUFFIX);
+  p.findFirstAndLastSuffix(d, inputLength-1);
+
+  // Insert the document locations time (as pointers) into documentArray
+  for (cellcount i = 0; i < p.suffixFrequency; i++) {
+    documentArray[i] = suffixArray[i + p.firstSuffixIndex];
+  }
+
+  // Sort the document array into ascending order of raw pointer value
+  qsort(documentArray, numberOfDocuments, sizeof(symbol *), pointerCompare);
+
+
+  // Extract phrases
+  //
+  // We will make several passes over the data, in each case considering
+  // a set of input phrases and generating a set of output phrases, which
+  // we will expand in later passes.
+  //
+  // The input phrases in the first pass will be the vocabulary.
+  // In later passes, the input phrases will be the output phrases of the
+  // previous pass.
+  //
+  // In each pass we will consider each input phrase in turn.  If we
+  // have seen it before, we will ignore it.  Otherwise, we will expand
+  // it and add its expansions to the set of output phrases.
+
+  // Store the phrase data in the phrases file
+  char phraseDataName[FILENAME_MAX];
+  sprintf(phraseDataName, "%s/phrases", collection);
+  ofstream phraseData(phraseDataName, ios::out);
+  if (!phraseData) {
+    cout << "File " << phraseDataName << " could not be opened\n";
+    exit(1);
+  }
+
+  // Count the number of phrases output
+  unsigned long int phraseCounter = 0;
+
+  // Set up the phrase expansion memory.
+  // We need this so that we don't expand a phrase more than once
+  initialisePhraseMemory();
+
+  // The current pass number
+  int phrasePass = 1;
+
+
+  // PASS NUMBER 1
+  if (verbosity > 1) {
+    cout << "Starting pass " << phrasePass << endl;
+  }
+
+  ofstream outPhrase;
+  char     outPhraseName[FILENAME_MAX];
+  unsigned long int outPhraseCounter = 0;
+
+  // On the first pass, simply work through the vocabulary
+  sprintf(outPhraseName, "%s/outPhrase.1", collection);
+  outPhrase.open(outPhraseName, ios::out);
+  if (!outPhrase) {
+    cerr << "File " << outPhraseName << " could not be opened\n";
+    exit(1);
+  }
+
+  // Iterate over the different symbols by working through the suffix array
+  vector<Phrase> result;
+  cellindex ij = 0;
+  char *tmpString;
+
+  while (ij < inputLength) {
+
+    // make a new phrase of length 1
+    p = Phrase(suffixArray[ij], 1, SUFFIX);
+    p.findFirstAndLastSuffix(ij, inputLength-1);
+
+    // We ignore this symbol if it occurs only once, if it is a delimiter,
+    // or if we are in stopwords mode and it is a stopword
+    //
+    // We could imagine a new mode/command-line option, which is like
+    // STOPWORDS but without this restriction.  This would let you browse
+    // from "the" to "the AGRIS" for example, but not from "AGRIS" to
+    // "the AGRIS" (where the is a stopword and AGRIS a content word).
+    // The system used to work like this; it is easy to implement, but
+    // it explodes the size of the indexes.  So: would it be useful?
+    if (!((p.suffixFrequency <= 1) ||
+      (*suffixArray[ij] <= LASTDELIMITER) ||
+      ((phraseMode == STOPWORDS) && (*suffixArray[ij] <= lastStopSymbol)))) {
+
+      // Get minimal expansions of the phrase
+      getExpansions(p, result);
+
+      if (!result.empty()) {
+
+    // Remember that we have expanded this phrase
+    rememberThisPhrase(ij, 1);
+
+    // write the phrase text
+    phraseData << ij << "-1:" << p << ":" << p.suffixFrequency << ":"
+           << result.size() << ":";
+
+    // write the results
+    for (cellcount k = 0; k < result.size(); k++) {
+      if (k) {
+        phraseData << ",";
+      }
+      phraseData << result[k].firstSuffixIndex << "-" << result[k].length;
+      outPhrase << result[k].firstSuffixIndex << " " << result[k].length << endl;
+      outPhraseCounter++;
+    }
+    result.clear();
+
+    // Write the documents in which this phrase occurs
+    df = getDocumentOccurrances(p, documentFrequency);
+    phraseData << ":" << df << ":";
+
+    // write the documents
+    for (cellcount m = 0, first = 1; m < numberOfDocuments; m++) {
+      if (documentFrequency[m]) {
+        if (first) {
+          first = 0;
+        } else {
+          phraseData << ";";
+        }
+        // Output the document number.  Note that here we've numbered the
+        // N documents from 0 to N-1, but later they'll be 1-N.  Thus we
+        // add 1 to the document id when we output it.
+        phraseData << "d" << (m+1);
+        // Next, output the frequency with which the document occurs, but
+        // only if it is > 1.
+        if (documentFrequency[m] > 1) {
+          phraseData << "," << documentFrequency[m];
+        }
+      }
+    }
+
+    phraseData << endl;
+    phraseCounter++;
+
+    // feedback
+    if (verbosity) {
+      if (phraseCounter % 1000 == 0) {
+        cout << "phrase " << phraseCounter << ": "
+         << "cell " << p.firstSuffixIndex << " - " << p << endl;
+      }
+    }
+      }
+    }
+   ij = p.lastSuffixIndex + 1;
+  }
+  outPhrase.close();
+
+  // REMAINING PASSES
+  // The previous outPhrase file forms the input to each new pass
+  cellcount start, length;
+  while (outPhraseCounter > 0) {
+
+    // Start a new pass
+    phrasePass++;
+    if (verbosity) {
+      cout << "Starting pass " << phrasePass << endl;
+    }
+
+    // Open the input file
+    char inPhraseName[FILENAME_MAX];
+    sprintf(inPhraseName, "%s/outPhrase.%d", collection, phrasePass - 1);
+    ifstream inPhrase (inPhraseName, ios::in);
+    if (!inPhrase) {
+      cerr << "File " << inPhraseName << " could not be opened\n";
+      exit(1);
+    }
+
+    // Open the output file
+    sprintf(outPhraseName, "%s/outPhrase.%d", collection, phrasePass);
+    outPhrase.open(outPhraseName, ios::out);
+    if (!outPhrase) {
+      cerr << "File " << outPhraseName << " could not be opened\n";
+      exit(1);
+    }
+    outPhraseCounter = 0;
+
+    // Process each phrase
+    while(inPhrase >> start >> length) {
+
+      // Ignore the phrase if we have expanded it before
+      if (isPhraseStored(start, length))
+    continue;
+
+      // Remember that we have examined this phrase
+      rememberThisPhrase(start, length);
+
+      // Find the phrase in the suffixarray
+      p = Phrase(suffixArray[start], length, SUFFIX);
+      p.findFirstAndLastSuffix(start, inputLength-1);
+
+      // Ignore the phrase if it only occurs once
+      if (p.suffixFrequency < 2)
+    continue;
+
+      // Write the phrase text;
+      phraseData << start << "-" << length << ":" << p << ":" << p.suffixFrequency << ":";
+
+      // Expand the phrase, if it is fewer than 8 words long
+      if (length <= 8) {
+
+    // Get the minimal expansions for this phrase
+    getExpansions(p, result);
+
+    // write the results
+    phraseData << result.size() << ":";
+
+    for (cellcount i = 0; i < result.size(); i++) {
+      if (i) {
+        phraseData << ",";
+      }
+      phraseData << result[i].firstSuffixIndex << "-" << result[i].length;
+      outPhrase << result[i].firstSuffixIndex << " " << result[i].length << endl;
+      outPhraseCounter++;
+    }
+    result.clear();
+
+      } else {
+    // phrase is too long to expand further
+    phraseData << "0:";
+      }
+
+      // Write the documents in which this phrase occurs
+      df = getDocumentOccurrances(p, documentFrequency);
+      phraseData << ":" << df << ":";
+
+      // write the documents
+      for (cellcount i = 0, first = 1; i < numberOfDocuments; i++) {
+    if (documentFrequency[i]) {
+      if (first) {
+        first = 0;
+      } else {
+        phraseData << ";";
+      }
+      // Output the document number.  Note that here we've numbered the
+      // N documents from 0 to N-1, but later they'll be 1-N.  Thus we
+      // add 1 to the document id when we output it.
+      phraseData << "d" << (i+1);
+      // Next, output the frequency with which the document occurs, but
+      // only if it is > 1.
+      if (documentFrequency[i] > 1) {
+        phraseData << "," << documentFrequency[i];
+      }
+    }
+      }
+
+      phraseData << endl;
+      phraseCounter++;
+
+      // feedback
+      if (verbosity) {
+    if (phraseCounter % 1000 == 0) {
+      cout << "phrase " << phraseCounter << ": "<< "start " << start
+           << ", length " << length << " - " << p << endl;
+    }
+      }
+
+    }
+
+    inPhrase.close();
+    outPhrase.close();
+  }
+
+  phraseData.close();
+  deletePhraseMemory();
+
+  delete [] documentFrequency;
+  delete [] symbols;
+  delete [] suffixArray;
+  delete [] prefixArray;
+  delete [] suffixCheck;
+  delete [] documentArray;
+
+
+
+  cout << endl << "Done: " << phraseCounter << " phrases in " << phraseDataName << endl;
+  return 0;
+}
+
+
+
+
+
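
The relocated main() above keeps the same multi-pass control flow: pass 1 walks the suffix array and expands single symbols, and every later pass reads the previous pass's outPhrase.N file and expands those phrases until a pass produces no new output. A compressed sketch of just that outer loop, under the same file-naming scheme; the per-phrase work is abstracted behind a caller-supplied function, since it is not reproduced here:

  #include <fstream>
  #include <cstdio>

  // Stand-in for the real per-phrase work, which builds a Phrase from the
  // suffix array, calls getExpansions(), and writes the phrase record.
  // It returns how many expansion lines it appended to outPhrase.
  typedef unsigned long (*ExpandFn)(unsigned long start, unsigned long length,
                                    std::ofstream &outPhrase);

  void runRemainingPasses(const char *collection, unsigned long outPhraseCounter,
                          ExpandFn expandOne) {
    int phrasePass = 1;                      // pass 1 has already been run
    while (outPhraseCounter > 0) {           // stop when a pass emits nothing new
      phrasePass++;

      char inName[FILENAME_MAX], outName[FILENAME_MAX];
      sprintf(inName,  "%s/outPhrase.%d", collection, phrasePass - 1);
      sprintf(outName, "%s/outPhrase.%d", collection, phrasePass);

      std::ifstream inPhrase(inName);
      std::ofstream outPhrase(outName);
      outPhraseCounter = 0;

      unsigned long start, length;
      while (inPhrase >> start >> length) {
        // The real loop also skips phrases it has already remembered and
        // phrases longer than 8 words before expanding.
        outPhraseCounter += expandOne(start, length, outPhrase);
      }
    }
  }

Each outPhrase.N file is just whitespace-separated "start length" pairs, which is why the loop can read it back with operator>>.
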