Changeset 2801


Ignore:
Timestamp:
2001-10-15T15:02:54+13:00 (23 years ago)
Author:
kjm18
Message:

New version of suffix, based on suffix2 (Gordon and Craig's simpler version),
with Kath's improvements.

Location:
trunk/gsdl/src/phind/generate
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/src/phind/generate/phrase.cpp

    r2704 r2801  
    153153  --length;
    154154  --back;
     155  if (phraseMode==STOPWORDS) {
     156    while (*back >=firstStopSymbol && *back <= lastStopSymbol) {
     157      --length;
     158      --back;
     159    }
     160  } 
    155161  clearSuffix();
    156162  clearPrefix();
     
    161167  --length;
    162168  ++forward;
     169  if (phraseMode==STOPWORDS) {
     170    while (*forward >=firstStopSymbol && *forward <= lastStopSymbol) {
     171      --length;
     172      ++forward;
     173    }
     174  }
    163175  clearSuffix();
    164176  clearPrefix();
     
    391403
    392404int Phrase::findFirstAndLastSuffix() {
     405 
    393406  return findFirstAndLastSuffix(0, inputLength-1);
    394407}
     
    399412  assert(begin <= end);
    400413
    401   // cout << "findFirstAndLastSuffix (" << begin << " - " << end << ") : " << toString() << endl;
    402  
    403414  // if we're only searching one cell, it is very easy
    404415  if (begin == end) {
     
    415426
    416427  do {
    417     // cout << "Find anywhere in " << begin << " - " << end << endl;
    418428    c = (end + begin) / 2;
    419429    cmp = compareSuffix(suffixArray[c], length);
     
    436446
    437447  do {
    438     // cout << "first-searching with " << begin << " - " << end << endl;
    439448    if (begin == end) {
    440449      c = begin;
     
    451460    // to find the first occurance, suffixArray[c] must be the same as the
    452461    // phrase, but suffixArray[c-1] must be different.
    453     cmp = compareSuffix(suffixArray[c-1], length);
    454     if (cmp == 0) {
    455       end = c - 1;
    456       assert(end >= begin);
    457       cmp = 1;
    458     } else {
    459       cmp = 0;
     462    if (c>0) {
     463      cmp = compareSuffix(suffixArray[c-1], length);
     464      if (cmp == 0) {
     465        end = c - 1;
     466        assert(end >= begin);
     467        cmp = 1;
     468      } else {
     469        cmp = 0;
     470      }
    460471    }
    461472      }
     
    473484 
    474485  do {
    475     // cout << "last-searching with " << begin << " - " << end << endl;
    476486    if (begin == end) {
    477487      c = begin;
     
    525535  assert(begin <= end);
    526536
    527   // cout << "findFirstAndLastPrefix (" << begin << " - " << end << ") : " << toString() << endl;
    528  
    529537  // if we're only searching one cell, it is very easy
    530538  if (begin == end) {
     
    541549
    542550  do {
    543     // cout << "Find anywhere in prefixarray " << begin << " - " << end << endl;
    544551    c = (end + begin) / 2;
    545552    cmp = comparePrefix(prefixArray[c], length);
     
    560567
    561568  do {
    562     // cout << "first-prefix-searching with " << begin << " - " << end << endl;
    563569    if (begin == end) {
    564570      c = begin;
     
    567573      c = (begin + end) / 2;
    568574      cmp = comparePrefix(prefixArray[c], length);
    569       // cout << "cmp: " << cmp << ", c: " << c << endl;
    570575      if (cmp == 1) {
    571576    // target phrase is lower than phrase at prefixArray[c]
     
    601606 
    602607  do {
    603     // cout << "last-prefix-searching with " << begin << " - " << end << endl;
    604608    if (begin == end) {
    605609      c = begin;
     
    829833}
    830834
    831 
    832 
    833 
    834 
    835 
    836 
    837 
    838 
    839 
    840 
    841 
    842835// Compare the length of two phrases
    843836//
  • trunk/gsdl/src/phind/generate/phrase.h

    r2694 r2801  
    126126  friend std::ostream &operator<<(std::ostream &stream, const Phrase &phrase);
    127127
     128  int uniqueSuffixExtension;
     129  int uniquePrefixExtension;
     130
    128131private:
    129132
    130133  // Does the phrase have a unique suffix/prefix extension?
    131134  // if yes, then 1; if no then 0; if unknown then -1;
    132   int uniqueSuffixExtension;
    133   int uniquePrefixExtension;
    134135
    135136  // reset a phrase
  • trunk/gsdl/src/phind/generate/suffix.cpp

    r2498 r2801  
    11/**********************************************************************
    22 *
    3  * suffix.cpp -- Extract the repeated phrases in the input using
    4  *               suffix and prefix arrays.
     3 * Suffix.cpp -- Extract the repeated phrases in the input with suffix
     4 *                and prefix arrays (cgn & gwp's simpler algorithm,
     5 *                 and kjm's improvements).
    56 *
    67 * Copyright 2000 Gordon W. Paynter
     
    6162symbol   *symbols;
    6263symbol  **suffixArray;
     64symbol  **prefixArray;
    6365check    *suffixCheck;
    64 symbol  **prefixArray;
    65 check    *prefixCheck;
    6666
    6767
     
    7070symbol  **documentArray;
    7171
     72
    7273// Do we accept any phrase, or do we eliminate those ending with stopwords ?
    7374int phraseMode = ANYPHRASE; //STOPWORDS;
    7475
     76
    7577// The filestem of the collection's phindex directory
    7678char collection[FILENAME_MAX];
    7779
    78 int suffixCompare(const void *, const void *);
    79 int prefixCompare(const void *, const void *);
    80 int pointerCompare(const void *, const void *);
    81 
    82 int readNumbers();
    83 void readStatistics();
    84 
    85 void getMinimalExpansions(Phrase &p, vector<Phrase> &results);
    86 cellcount getDocumentOccurrances(Phrase &p, cellcount *frequency);
    8780
    8881// The ranges of the stopword and content-word symbols for the collection
     
    9386
    9487
    95 
    96 
    97 // Phrase memory
    98 // We have to "remember" each phrase that we've expanded
     88// Some useful comparison functions, defined below.
     89int suffixCompare(const void *, const void *);
     90int prefixCompare(const void *, const void *);
     91int pointerCompare(const void *, const void *);
     92
     93
     94// Functions for implementing "phrase memory".  These let us "remember"
     95// each phrase that we've expanded without using too much memory.
    9996void initialisePhraseMemory();
    10097void rememberThisPhrase(cellindex index, cellcount length);
     
    107104
    108105
    109 int main (int argc, char * argv[]) {
    110 
    111   // Command-line arguments
    112   // argv[1] is the phindex directory
    113   // argv[2] is the maximum array symbol length (optional)
    114   // argv[3] is the mode, where 1 is stopword mode (optional)
    115   if (argc < 2) {
    116     cerr << "Usage: " << argv[0] << " phind-directory mode [verbosity]" << endl;
    117     exit(1);
    118   }
    119 
    120   // collection directory
    121   strcpy(collection, argv[1]);
    122 
    123   // mode parameter
    124   phraseMode = atoi(argv[2]);
    125   assert((phraseMode == STOPWORDS) || (phraseMode == ANYPHRASE));
    126 
    127   // optional verbosity parameter
    128   if (argc == 4) {
    129     verbosity = atoi(argv[3]);
    130     assert (verbosity >= 0);
    131   }
    132 
    133   if (verbosity) {
    134     cout << "Suffix phrase extraction program" << endl;
    135   }
    136 
    137   if (verbosity > 1) {
    138     if (phraseMode == STOPWORDS) {
    139       cout << "Stopwords mode: no phrase may begin or end with a stopword" << endl;
    140     } else {
    141       cout << "AllPhrase mode: extract every phrase that occurs more than once" << endl;
    142     }
    143   }
    144 
    145   // Read the statistics file
    146   readStatistics();
    147 
    148   // Read the numbers file
    149   readNumbers();
    150 
    151   // Create the suffix & prefix arrays
    152   suffixArray = new symbol *[inputLength];
    153   prefixArray = new symbol *[inputLength];
    154   suffixCheck = new check[inputLength];
    155   prefixCheck = new check[inputLength];
    156   if (prefixCheck == NULL) {
    157     cerr << "Suffix error: not enough memory to hold " << inputLength
    158      << " symbols." << endl;
    159     exit(2);
    160   } 
    161 
    162   // Initialise prefix and suffix arrays
    163   for (cellcount j = 0; j < inputLength; j++) {
    164     suffixArray[j] = &symbols[j];
    165     prefixArray[j] = &symbols[j];
    166   }
    167   qsort(suffixArray, inputLength, sizeof(symbol *), suffixCompare);
    168   qsort(prefixArray, inputLength, sizeof(symbol *), prefixCompare);
    169 
    170 
    171   // Create the document arrays
    172   if (numberOfDocuments == 0) {
    173     cerr << "There are no documents in this collection!" << endl;
    174     exit(1);
    175   }
    176   if (verbosity > 1) {
    177     cout << "Allocating document arrays for " << numberOfDocuments << " documents" << endl;
    178   }
    179 
    180   // The document frequecy array is used to count the number of times
    181   // each phrase occurs in each document.  The number of documents in
    182   // which a phrase occurs is stored in df.
    183   frequency *documentFrequency = new frequency[numberOfDocuments];
    184   frequency df;
    185 
    186   // documentArray will be searched in order to discover which document
    187   // each phrase occurs in.
    188   documentArray = new symbol *[numberOfDocuments]; 
    189 
    190   // Discover all the DOCUMENTSTART symbols and store as a phrase
    191   cellindex d = 0;
    192   while (*suffixArray[d] != DOCUMENTSTART) {
    193     d++;
    194   }
    195   Phrase p(suffixArray[d], 1, SUFFIX);
    196   p.findFirstAndLastSuffix(d, inputLength-1);
     106// Get a phrase's expansions
     107//
     108// Get the set of "minimal", "maximal", non-unique expansions of a
     109// phrase p, using the simpler algorithm that Craig and Gordon came up
     110// with at Google.
     111//
     112// Returns a vector of Expansions.
     113
     114void getExpansions(Phrase &p, vector<Phrase> &results) {
     115
     116  // 1. Get the initial candidates
     117  vector<Phrase> candidates;
     118  p.initialSuffixCandidates(candidates);
     119  int suffcands = candidates.size();
     120  p.initialPrefixCandidates(candidates);
     121
     122  if (candidates.size() == 0)
     123    return;
     124
     125  vector<Phrase>::iterator i;
     126  for (i = candidates.begin(); i != candidates.end(), suffcands>0; ++i, --suffcands) {
     127    i->expandWhileUniquePrefixExtension();
     128    i->ensureSuffixFound();
     129  }
     130  for (i; i != candidates.end(); ++i) {
     131    i->expandWhileUniqueSuffixExtension();
     132  }
     133
     134  // 3. Ensure minimality: ignore phrases whose subphrases are also found
     135  results.clear();
     136
     137  // Initialise the candidates, check array, and various variables.
     138  sort(candidates.begin(), candidates.end(), isShorter);
     139  for (cellcount j = 0; j < inputLength; j++)
     140    suffixCheck[j] = 0;
     141  unsigned minimum_length = candidates.begin()->length;
    197142 
    198   // Insert the document locations (as pointers) into documentArray
    199   for (cellcount i = 0; i < p.suffixFrequency; i++) {
    200     documentArray[i] = suffixArray[i + p.firstSuffixIndex];
    201   }
    202  
    203   // Sort the document array into ascending order of raw pointer value
    204   qsort(documentArray, numberOfDocuments, sizeof(symbol *), pointerCompare);
    205 
    206 
    207   // Extract phrases
    208   //
    209   // We will make several passesover the data, in each case considering
    210   // a set of input phrases and generating a set of output phrases, which
    211   // we will expancd in later passes.
    212   //
    213   // The input phrases in the first pass will be the vocabulary.
    214   // In later passes, the input phrases will be the output phrases of the
    215   // previous pass.
    216   //
    217   // In each pass we will consider each input phrase in turn.  If we
    218   // have seen it before, we will ignore it.  Otherwise, we will expand
    219   // it and add its expansions to the set of output phrases.
    220 
    221   // Store the phrase data in the phrases file
    222   char phraseDataName[FILENAME_MAX];
    223   sprintf(phraseDataName, "%s/phrases", collection);
    224   ofstream phraseData(phraseDataName, ios::out);
    225   if (!phraseData) {
    226     cout << "File " << phraseDataName << " could not be opened\n";
    227     exit(1);
    228   }
    229 
    230   // Count the number of phrases output
    231   unsigned long int phraseCounter = 0;
    232 
    233   // Set up the phrase expansion memory.
    234   // We need this so that we don't expand a phrase more than once
    235   initialisePhraseMemory();
    236 
    237   // The current pass numebr
    238   int phrasePass = 1;
    239 
    240 
    241   // PASS NUMBER 1
    242   if (verbosity > 1) {
    243     cout << "Starting pass " << phrasePass << endl;
    244   }
    245 
    246   ofstream outPhrase;
    247   char     outPhraseName[FILENAME_MAX];
    248   unsigned long int outPhraseCounter = 0;
    249 
    250   // On the first pass, simply work through the vocabulary
    251   sprintf(outPhraseName, "%s/outPhrase.1", collection);
    252   outPhrase.open(outPhraseName, ios::out);
    253   if (!outPhrase) {
    254     cerr << "File " << outPhraseName << " could not be opened\n";
    255     exit(1);
    256   }
    257 
    258   // Iterate over the different symbols by working through the suffix array
    259   vector<Phrase> result;
    260   cellindex ij = 0;
    261   char *tmpString;
    262 
    263   while (ij < inputLength) {
    264 
    265     // make a new phrase of length 1
    266     p = Phrase(suffixArray[ij], 1, SUFFIX);
    267     p.findFirstAndLastSuffix(ij, inputLength-1);
    268 
    269     // cout << "cell " << ij << " - " << p.toString() << endl;
    270 
    271     // We ignore this symbol if it occurs only once, if it is a delimiter,
    272     // of if we are in stopwords mode and it is a stopword
    273     //
    274     // We could imagine a new mode/command-line option, which is like
    275     // STOPWORDS but without this restrictrion.  This would let you browse
    276     // from "the" to "the AGRIS" for example, but not from "AGRIS" to
    277     // "the AGRIS" (where the is a stopword and AGRIS a content word).
    278     // The system used to work like this; it is easy to implement, but
    279     // it explodes the size of the indexes.  So: would it be useful? 
    280     if (!((p.suffixFrequency <= 1) ||
    281       // (*suffixArray[ij] != 23054) ||
    282       (*suffixArray[ij] <= LASTDELIMITER) ||
    283       ((phraseMode == STOPWORDS) && (*suffixArray[ij] <= lastStopSymbol)))) {
    284 
    285       // Get minimal expansions of the phrase
    286       getMinimalExpansions(p, result);
     143  // Try to add each candidate to the results set, ignoring the non-minimal
     144  for (vector<Phrase>::iterator candidate = candidates.begin();
     145       candidate != candidates.end(); candidate++) {
     146
     147    // Make a copy of candidate to mutilate while performing sub-phrase checks
     148    Phrase temp_phrase(*candidate);
     149    bool shorter_found = false;
     150   
     151    // Check for shorter and shorter versions of the tenporary phrase
     152    while (temp_phrase.length >= minimum_length && !shorter_found) {
     153      temp_phrase.ensureSuffixFound();
     154      if (suffixCheck[temp_phrase.firstSuffixIndex] == 0)
     155    temp_phrase.shortenByOneAtPrefix();
     156      else
     157    shorter_found = true;
     158
     159      // Possible efficiency here: we can finish if the prefix of c
     160      // and temp_phrase are the same for candidate->length symbols.
     161    }
    287162     
    288       if (!result.empty()) {
    289    
    290     // Remember that we have expanded this phrase
    291     rememberThisPhrase(ij, 1);
    292 
    293     // write the phrase text
    294     tmpString = p.toString();
    295     phraseData << ij << "-1:" << tmpString << ":" << p.suffixFrequency << ":"
    296            << result.size() << ":";
    297     delete [] tmpString;
    298 
    299     // write the results
    300     for (cellcount k = 0; k < result.size(); k++) {
    301       if (k) {
    302         phraseData << ",";
    303       }
    304       phraseData << result[k].firstSuffixIndex << "-" << result[k].length;
    305       outPhrase << result[k].firstSuffixIndex << " " << result[k].length << endl;
    306       outPhraseCounter++;
    307     }
    308     result.clear();
    309    
    310     // Write the documents in which this phrase occurs
    311     df = getDocumentOccurrances(p, documentFrequency);
    312     phraseData << ":" << df << ":";
    313 
    314     // write the documents
    315     for (cellcount m = 0, first = 1; m < numberOfDocuments; m++) {
    316       if (documentFrequency[m]) {
    317         if (first) {
    318           first = 0;
    319         } else {
    320           phraseData << ";";
    321         }
    322         // Output the document number.  Note that here we've numbered the
    323         // N documents from 0 to N-1, but later they'll be 1-N.  Thus we
    324         // add 1 to the document id when we output it.
    325         phraseData << "d" << (m+1);
    326         // Next, output the frequency with which the document occurs, but
    327         // only if it is > 1.
    328         if (documentFrequency[m] > 1) {
    329           phraseData << "," << documentFrequency[m];
    330         }
    331       }
    332     }
    333 
    334     phraseData << endl;
    335     phraseCounter++;
    336 
    337     // feedback
    338     if (verbosity) {
    339       if (phraseCounter % 1000 == 0) {
    340         tmpString = p.toString();
    341         cout << "phrase " << phraseCounter << ": "
    342          << "cell " << p.firstSuffixIndex << " - " << tmpString << endl;
    343         delete [] tmpString;
    344       }
    345     }
    346       }
    347     }
    348    ij = p.lastSuffixIndex + 1;
    349   }
    350   outPhrase.close();
    351 
    352   // REMAINING PASSES
    353   // The previous outPhrase file forms the input to each new pass
    354   cellcount start, length;
    355   while (outPhraseCounter > 0) {
    356 
    357     // Start a new pass
    358     phrasePass++;
    359     if (verbosity) {
    360       cout << "Starting pass " << phrasePass << endl;
    361     }
    362 
    363     // Open the input file
    364     char inPhraseName[FILENAME_MAX];
    365     sprintf(inPhraseName, "%s/outPhrase.%d", collection, phrasePass - 1);
    366     ifstream inPhrase (inPhraseName, ios::in);
    367     if (!inPhrase) {
    368       cerr << "File " << inPhraseName << " could not be opened\n";
    369       exit(1);
    370     }
    371 
    372     // Open the output file
    373     sprintf(outPhraseName, "%s/outPhrase.%d", collection, phrasePass);
    374     outPhrase.open(outPhraseName, ios::out);
    375     if (!outPhrase) {
    376       cerr << "File " << outPhraseName << " could not be opened\n";
    377       exit(1);
    378     }
    379     outPhraseCounter = 0;
    380 
    381     // Process each phrase
    382     while(inPhrase >> start >> length) {
    383 
    384       // Ignore the phrase if we have expanded it before
    385       if (isPhraseStored(start, length)) {
    386     continue;
    387       }
    388 
    389       // Remember that we have examined this phrase
    390       rememberThisPhrase(start, length);
    391 
    392       // Find the phrase in the suffixarray
    393       p = Phrase(suffixArray[start], length, SUFFIX);
    394       p.findFirstAndLastSuffix(start, inputLength-1);
    395 
    396       // cout << "index " << start << ", length " << length << " - "  <<  p.toString() << endl;
    397      
    398 
    399       // Ignore the phrase if it only occurs once
    400       if (p.suffixFrequency < 2) {
    401     continue;
    402       }
    403 
    404 
    405       // Write the phrase text  tmpString = p.toString();
    406       tmpString = p.toString();
    407       phraseData << start << "-" << length << ":" << tmpString << ":"
    408          << p.suffixFrequency << ":";
    409       delete [] tmpString;
    410    
    411 
    412       // Expand the phrase, if it is fewer than 8 words long
    413       if (length <= 8) {
    414 
    415     // Get the minimal expansions for this phrase
    416     getMinimalExpansions(p, result);
    417      
    418     // write the results
    419     phraseData << result.size() << ":";
    420 
    421     for (cellcount i = 0; i < result.size(); i++) {
    422       if (i) {
    423         phraseData << ",";
    424       }
    425       phraseData << result[i].firstSuffixIndex << "-" << result[i].length;
    426       outPhrase << result[i].firstSuffixIndex << " " << result[i].length << endl;
    427       outPhraseCounter++;
    428     }
    429     result.clear();
    430    
    431       } else {
    432     // phrase is too long to expand further
    433     phraseData << "0:";
    434       }
    435 
    436    
    437       // Write the documents in which this phrase occurs
    438       df = getDocumentOccurrances(p, documentFrequency);
    439       phraseData << ":" << df << ":";
    440 
    441       // write the documents
    442       for (cellcount i = 0, first = 1; i < numberOfDocuments; i++) {
    443     if (documentFrequency[i]) {
    444       if (first) {
    445         first = 0;
    446       } else {
    447         phraseData << ";";
    448       }
    449       // Output the document number.  Note that here we've numbered the
    450       // N documents from 0 to N-1, but later they'll be 1-N.  Thus we
    451       // add 1 to the document id when we output it.
    452       phraseData << "d" << (i+1);
    453       // Next, output the frequency with which the document occurs, but
    454       // only if it is > 1.
    455       if (documentFrequency[i] > 1) {
    456         phraseData << "," << documentFrequency[i];
    457       }
    458     }
    459       }
    460      
    461       phraseData << endl;
    462       phraseCounter++;
    463 
    464       // feedback
    465       if (verbosity) {
    466     if (phraseCounter % 1000 == 0) {
    467       tmpString = p.toString();
    468       cout << "phrase " << phraseCounter << ": "<< "start " << start
    469            << ", length " << length << " - " << tmpString << endl;
    470       delete [] tmpString;
    471     }
    472       }
    473 
    474     }
    475 
    476     inPhrase.close();
    477     outPhrase.close();
    478   }
    479    
    480   phraseData.close();
    481   deletePhraseMemory();
    482 
    483   delete [] documentFrequency;
    484   delete [] symbols;
    485   delete [] suffixArray;
    486   delete [] prefixArray;
    487   delete [] suffixCheck;
    488   delete [] prefixCheck;
    489   delete [] documentArray;
    490 
    491 
    492  
    493   cout << endl << "Done: " << phraseCounter << " phrases in " << phraseDataName << endl;
    494   return 0;
    495 }
    496 
    497 
    498 // Get Minimal Expansions
    499 //
    500 // Get the set of "minimal" expansions of a phrase p, using the
    501 // algorithm described in the documentation.
    502 //
    503 // Returns a vector of Expansions.
    504 
    505 void getMinimalExpansions(Phrase &p, vector<Phrase> &results) {
    506 
    507   // 1. Initialise the result and candiate vectors
    508   vector<Phrase> candidates;
    509   for (cellcount j = 0; j < inputLength; j++) {
    510     suffixCheck[j] = 0;
    511     prefixCheck[j] = 0;
    512   }
    513 
    514   // 2. Expand the phrase p
    515 
    516   // 2.1 Create the candidate set
    517   p.initialSuffixCandidates(candidates);
    518   p.initialPrefixCandidates(candidates);
    519 
    520   // 2.2 Sort the candidates by phrase length
    521   make_heap(candidates.begin(), candidates.end(), isLonger);
    522 
    523   // 3. While candidates is non-empty, confirm the phrases it
    524   //    contains, expanding them as required
    525   while (!candidates.empty()) {
    526 
    527     // 3.1 Get next candidate
    528     pop_heap(candidates.begin(), candidates.end(), isLonger);
    529     Phrase c = candidates.back();
    530     candidates.pop_back();
    531 
    532     // 3.2 If we know there are no unique right extensions
    533     //     (i.e. this is a phrase drawn from the suffix array)
    534     if (!c.hasUniqueSuffixExtension()) {
    535      
    536       c.ensurePrefixFound();
    537 
    538       // 3.2.1 Ignore candidate if we have used a subphrase instead
    539       if (suffixCheck[c.firstSuffixIndex] || prefixCheck[c.firstPrefixIndex]) {
    540     // cout << "ignoring" << endl;
    541       }
    542 
    543       // 3.2.2 If candidate has a unique left (prefix) extension,
    544       //       Then extend it and add it back into Candidates.
    545       else if (c.hasUniquePrefixExtension()) {
    546     // cout << "expanding prefix " << c.toString() << "=> ";
    547     c.expandUniquePrefixExtensionByOne();
    548     candidates.push_back(c);
    549     push_heap(candidates.begin(), candidates.end(), isLonger);
    550      }
    551    
    552       // 3.2.3 If candidate has no unique left (prefix) extension,
    553       //       Then add it to the list of results.
    554       else {
    555     // cout << "no unique prefix, add to results" << endl;
    556     results.push_back(c);
    557     for (cellcount i = c.firstSuffixIndex; i <= c.lastSuffixIndex; i++) {
    558       suffixCheck[i] = c.length;
    559     }
    560     for (cellcount ik = c.firstPrefixIndex; ik <= c.lastPrefixIndex; ik++) {
    561       prefixCheck[ik] = c.length;
    562     }
    563       }
    564     }
    565 
    566     // 3.3 If we know there are no unique left extensions,
    567     //     Then fdo the same as for 3.2 but exchange suffix & prefix
    568     else if (!c.hasUniquePrefixExtension()) {
    569      
    570       c.ensureSuffixFound();
    571 
    572       // 3.3.1
    573       if (suffixCheck[c.firstSuffixIndex] || prefixCheck[c.firstPrefixIndex]) {
    574 
    575       }
    576 
    577       // 3.3.2
    578       else if (c.hasUniqueSuffixExtension()) {
    579     c.expandUniqueSuffixExtensionByOne();
    580     candidates.push_back(c);
    581     push_heap(candidates.begin(), candidates.end(), isLonger);
    582       }
    583    
    584       // 3.3.3
    585       else {
    586     results.push_back(c);
    587     for (cellcount i = c.firstSuffixIndex; i <= c.lastSuffixIndex; i++) {
    588       suffixCheck[i] = c.length;
    589     }
    590     for (cellcount ijk = c.firstPrefixIndex; ijk <= c.lastPrefixIndex; ijk++) {
    591       prefixCheck[ijk] = c.length;
    592     }
    593 
    594       }
     163    if (!shorter_found) {
     164      results.push_back(*candidate);
     165      candidate->ensureSuffixFound();
     166      for (cellcount k = candidate->firstSuffixIndex; k <= candidate->lastSuffixIndex; ++k)
     167    suffixCheck[k] = candidate->length;
    595168    }
    596169  }
     
    751324// Given a phrase, what documents does it occur in?
    752325
    753 cellcount getDocumentOccurrances(Phrase &p, cellcount *frequency) {
    754 
    755   // cout << "searching for \""<< p.toString() << "\" in documents "
     326cellcount getDocumentOccurrances(const Phrase &p, cellcount *frequency) {
     327
     328  // cout << "searching for \""<< p << "\" in documents "
    756329  //      << 0 << "-" << numberOfDocuments - 1 << endl;
    757330
     
    1038611}
    1039612
     613
    1040614bool isLongPhraseStored(cellindex index, cellcount length) {
    1041615
     
    1079653}
    1080654
     655
    1081656void deleteLongPhraseMemory() {
    1082657  // remove the hash & other files
     
    1090665
    1091666
    1092 
    1093 
    1094667// Read the collection statistics file
     668//
    1095669void readStatistics() {
    1096670
     
    1129703}
    1130704
    1131 
    1132 
    1133 
    1134 
     705cellcount getContentCount(symbol firstContent) {
     706
     707  cellcount content=0;
     708  for (cellcount i=0; i<inputLength; i++) {
     709    if (symbols[i]>=firstContent) content++;
     710  }
     711
     712  return content;
     713}
     714
     715int main (int argc, char * argv[]) {
     716
     717  // Command-line arguments
     718  // argv[1] is the phindex directory
     719  // argv[2] is the maximum array symbol length (optional)
     720  // argv[3] is the mode, where 1 is stopword mode (optional)
     721  if (argc < 2) {
     722    cerr << "Usage: " << argv[0] << " phind-directory mode [verbosity]" << endl;
     723    exit(1);
     724  }
     725
     726  // collection directory
     727  strcpy(collection, argv[1]);
     728
     729  // mode parameter
     730  phraseMode = atoi(argv[2]);
     731  assert((phraseMode == STOPWORDS) || (phraseMode == ANYPHRASE));
     732
     733  // optional verbosity parameter
     734  if (argc == 4) {
     735    verbosity = atoi(argv[3]);
     736    assert (verbosity >= 0);
     737  }
     738
     739  if (verbosity) {
     740    cout << "suffix: the phrase extraction program" << endl;
     741  }
     742
     743  if (verbosity > 1) {
     744    if (phraseMode == STOPWORDS) {
     745      cout << "Stopwords mode: no phrase may begin or end with a stopword" << endl;
     746    } else {
     747      cout << "AllPhrase mode: extract every phrase that occurs more than once" << endl;
     748    }
     749  }
     750
     751  // Read the statistics file
     752  readStatistics();
     753
     754  // Read the numbers file
     755  readNumbers();
     756
     757  if (numberOfDocuments == 0) {
     758    cerr << "There are no documents in this collection!" << endl;
     759    exit(1);
     760  }
     761
     762  symbol firstContent;
     763  if (phraseMode==STOPWORDS) firstContent=firstContentSymbol;
     764  else firstContent = firstStopSymbol;
     765
     766  cellcount contentLength = 0;
     767  contentLength = getContentCount(firstContent);
     768
     769  // Create the suffix & prefix arrays
     770  suffixArray = new symbol *[contentLength];
     771  prefixArray = new symbol *[contentLength];
     772
     773  cellcount here=0;
     774  // Initialise prefix and suffix arrays, only use the needed suffixes
     775  for (cellcount j = 0; j < inputLength; j++) {
     776    if (symbols[j]>=firstContent) {
     777      suffixArray[here] = &symbols[j];
     778      prefixArray[here] = &symbols[j];
     779      here++;
     780    }   
     781  }
     782  qsort(suffixArray, contentLength, sizeof(symbol *), suffixCompare);
     783  qsort(prefixArray, contentLength, sizeof(symbol *), prefixCompare);
     784
     785  suffixCheck = new check[contentLength];
     786  if (suffixCheck == NULL) {
     787    cerr << "Suffix error: not enough memory to hold " << inputLength << " symbols." << endl;
     788    exit(2);
     789  } 
     790  for (cellcount j = 0; j < contentLength; j++)
     791    suffixCheck[j] = 0;
     792
     793  cout <<"\ngenerating the phrase hierarchy\n\n";
     794 
     795  // Create the document arrays
     796  if (verbosity > 1) {
     797    cout << "Allocating document arrays for " << numberOfDocuments << " documents" << endl;
     798  }
     799
      800  // The document frequency array is used to count the number of times
     801  // each phrase occurs in each document.  The number of documents in
     802  // which a phrase occurs is stored in df.
     803  frequency *documentFrequency = new frequency[numberOfDocuments];
     804  frequency df;
     805
     806  // documentArray will be searched in order to discover which document
     807  // each phrase occurs in.
     808  documentArray = new symbol *[numberOfDocuments]; 
     809
     810  // just scan through the input text to find the doc starts
     811  cellindex d = 0;
     812  for (cellindex i=0; i<inputLength; i++) {
     813    if (symbols[i] == DOCUMENTSTART) {
     814      documentArray[d] = &symbols[i];
     815      d++;
     816    }
     817  }
     818
     819  // the phrases stuff is expecting inputLength to be the length of the
     820  // suffix array, so change it.
     821  inputLength = contentLength;
     822
     823  // Extract phrases
     824  //
      825  // We will make several passes over the data, in each case considering
     826  // a set of input phrases and generating a set of output phrases, which
      827  // we will expand in later passes.
     828  //
     829  // The input phrases in the first pass will be the vocabulary.
     830  // In later passes, the input phrases will be the output phrases of the
     831  // previous pass.
     832  //
     833  // In each pass we will consider each input phrase in turn.  If we
     834  // have seen it before, we will ignore it.  Otherwise, we will expand
     835  // it and add its expansions to the set of output phrases.
     836
     837  // Store the phrase data in the phrases file
     838  char phraseDataName[FILENAME_MAX];
     839  sprintf(phraseDataName, "%s/phrases", collection);
     840  ofstream phraseData(phraseDataName, ios::out);
     841  if (!phraseData) {
     842    cout << "File " << phraseDataName << " could not be opened\n";
     843    exit(1);
     844  }
     845
     846  // Count the number of phrases output
     847  unsigned long int phraseCounter = 0;
     848
     849  // Set up the phrase expansion memory.
     850  // We need this so that we don't expand a phrase more than once
     851  initialisePhraseMemory();
     852
      853  // The current pass number
     854  int phrasePass = 1;
     855
     856
     857  // PASS NUMBER 1
     858  if (verbosity > 1) {
     859    cout << "Starting pass " << phrasePass << endl;
     860  }
     861
     862  ofstream outPhrase;
     863  char     outPhraseName[FILENAME_MAX];
     864  unsigned long int outPhraseCounter = 0;
     865
     866  // On the first pass, simply work through the vocabulary
     867  sprintf(outPhraseName, "%s/outPhrase.1", collection);
     868  outPhrase.open(outPhraseName, ios::out);
     869  if (!outPhrase) {
     870    cerr << "File " << outPhraseName << " could not be opened\n";
     871    exit(1);
     872  }
     873
     874  // Iterate over the different symbols by working through the suffix array
     875  vector<Phrase> result;
     876  cellindex ij = 0;
     877  char *tmpString;
     878
     879  Phrase p;
     880  while (ij < inputLength) {
     881
     882    // make a new phrase of length 1
     883    p = Phrase(suffixArray[ij], 1, SUFFIX);
     884    p.findFirstAndLastSuffix(ij, inputLength-1);
     885
     886    // We ignore this symbol if it occurs only once, if it is a delimiter,
      887    // or if we are in stopwords mode and it is a stopword
     888    // - in this new version, only need to check freq
     889    // We could imagine a new mode/command-line option, which is like
      890    // STOPWORDS but without this restriction.  This would let you browse
     891    // from "the" to "the AGRIS" for example, but not from "AGRIS" to
     892    // "the AGRIS" (where the is a stopword and AGRIS a content word).
     893    // The system used to work like this; it is easy to implement, but
     894    // it explodes the size of the indexes.  So: would it be useful? 
     895    if (p.suffixFrequency > 1) {
     896      // Get minimal expansions of the phrase
     897      getExpansions(p, result);
     898     
     899      if (!result.empty()) {
     900   
     901    // Remember that we have expanded this phrase
     902    rememberThisPhrase(ij, 1);
     903
     904    // write the phrase text
     905    phraseData << ij << "-1:" << p << ":" << p.suffixFrequency << ":"
     906           << result.size() << ":";
     907
     908    // write the results
     909    for (cellcount k = 0; k < result.size(); k++) {
     910      if (k) {
     911        phraseData << ",";
     912      }
     913      phraseData << result[k].firstSuffixIndex << "-" << result[k].length;
     914      outPhrase << result[k].firstSuffixIndex << " " << result[k].length << endl;
     915      outPhraseCounter++;
     916    }
     917    result.clear();
     918   
     919    // Write the documents in which this phrase occurs
     920    df = getDocumentOccurrances(p, documentFrequency);
     921    phraseData << ":" << df << ":";
     922
     923    // write the documents
     924    for (cellcount m = 0, first = 1; m < numberOfDocuments; m++) {
     925      if (documentFrequency[m]) {
     926        if (first) {
     927          first = 0;
     928        } else {
     929          phraseData << ";";
     930        }
     931        // Output the document number.  Note that here we've numbered the
     932        // N documents from 0 to N-1, but later they'll be 1-N.  Thus we
     933        // add 1 to the document id when we output it.
     934        phraseData << "d" << (m+1);
     935        // Next, output the frequency with which the document occurs, but
     936        // only if it is > 1.
     937        if (documentFrequency[m] > 1) {
     938          phraseData << "," << documentFrequency[m];
     939        }
     940      }
     941    }
     942
     943    phraseData << endl;
     944    phraseCounter++;
     945
     946    // feedback
     947    if (verbosity) {
     948      if (phraseCounter % 1000 == 0) {
     949        cout << "phrase " << phraseCounter << ": "
     950         << "cell " << p.firstSuffixIndex << " - " << p << endl;
     951      }
     952    }
     953      }
     954    }
     955   ij = p.lastSuffixIndex + 1;
     956  }
     957  outPhrase.close();
     958
     959  // REMAINING PASSES
     960  // The previous outPhrase file forms the input to each new pass
     961  cellcount start, length;
     962  while (outPhraseCounter > 0) {
     963
     964    // Start a new pass
     965    phrasePass++;
     966    if (verbosity) {
     967      cout << "Starting pass " << phrasePass << endl;
     968    }
     969
     970    // Open the input file
     971    char inPhraseName[FILENAME_MAX];
     972    sprintf(inPhraseName, "%s/outPhrase.%d", collection, phrasePass - 1);
     973    ifstream inPhrase (inPhraseName, ios::in);
     974    if (!inPhrase) {
     975      cerr << "File " << inPhraseName << " could not be opened\n";
     976      exit(1);
     977    }
     978
     979    // Open the output file
     980    sprintf(outPhraseName, "%s/outPhrase.%d", collection, phrasePass);
     981    outPhrase.open(outPhraseName, ios::out);
     982    if (!outPhrase) {
     983      cerr << "File " << outPhraseName << " could not be opened\n";
     984      exit(1);
     985    }
     986    outPhraseCounter = 0;
     987
     988    // Process each phrase
     989    while(inPhrase >> start >> length) {
     990
     991      // Ignore the phrase if we have expanded it before
     992      if (isPhraseStored(start, length))
     993    continue;
     994
     995      // Remember that we have examined this phrase
     996      rememberThisPhrase(start, length);
     997
      998      // Find the phrase in the suffix array
     999      p = Phrase(suffixArray[start], length, SUFFIX);
     1000      p.findFirstAndLastSuffix(start, inputLength-1);
     1001
     1002      // Ignore the phrase if it only occurs once
     1003      if (p.suffixFrequency < 2)
     1004    continue;
     1005
     1006      // Write the phrase text;
     1007      phraseData << start << "-" << length << ":" << p << ":" << p.suffixFrequency << ":";
     1008   
     1009      // Expand the phrase, if it is fewer than 8 words long
     1010      if (length <= 8) {
     1011
     1012    // Get the minimal expansions for this phrase
     1013    getExpansions(p, result);
     1014     
     1015    // write the results
     1016    phraseData << result.size() << ":";
     1017
     1018    for (cellcount i = 0; i < result.size(); i++) {
     1019      if (i) {
     1020        phraseData << ",";
     1021      }
     1022      phraseData << result[i].firstSuffixIndex << "-" << result[i].length;
     1023      outPhrase << result[i].firstSuffixIndex << " " << result[i].length << endl;
     1024      outPhraseCounter++;
     1025    }
     1026    result.clear();
     1027   
     1028      } else {
     1029    // phrase is too long to expand further
     1030    phraseData << "0:";
     1031      }
     1032
     1033      // Write the documents in which this phrase occurs
     1034      df = getDocumentOccurrances(p, documentFrequency);
     1035      phraseData << ":" << df << ":";
     1036
     1037      // write the documents
     1038      for (cellcount i = 0, first = 1; i < numberOfDocuments; i++) {
     1039    if (documentFrequency[i]) {
     1040      if (first) {
     1041        first = 0;
     1042      } else {
     1043        phraseData << ";";
     1044      }
     1045      // Output the document number.  Note that here we've numbered the
     1046      // N documents from 0 to N-1, but later they'll be 1-N.  Thus we
     1047      // add 1 to the document id when we output it.
     1048      phraseData << "d" << (i+1);
     1049      // Next, output the frequency with which the document occurs, but
     1050      // only if it is > 1.
     1051      if (documentFrequency[i] > 1) {
     1052        phraseData << "," << documentFrequency[i];
     1053      }
     1054    }
     1055      }
     1056     
     1057      phraseData << endl;
     1058      phraseCounter++;
     1059
     1060      // feedback
     1061      if (verbosity) {
     1062    if (phraseCounter % 1000 == 0) {
     1063      cout << "phrase " << phraseCounter << ": "<< "start " << start
     1064           << ", length " << length << " - " << p << endl;
     1065    }
     1066      }
     1067
     1068    }
     1069
     1070    inPhrase.close();
     1071    outPhrase.close();
     1072  }
     1073   
     1074  phraseData.close();
     1075  deletePhraseMemory();
     1076
     1077  delete [] documentFrequency;
     1078  delete [] symbols;
     1079  delete [] suffixArray;
     1080  delete [] prefixArray;
     1081  delete [] suffixCheck;
     1082  delete [] documentArray;
     1083
     1084
     1085 
     1086  cout << endl << "Done: " << phraseCounter << " phrases in " << phraseDataName << endl;
     1087  return 0;
     1088}
     1089
     1090
     1091
     1092
     1093
Note: See TracChangeset for help on using the changeset viewer.