/********************************************************************** * * Suffix.cpp -- Extract the repeated phrases in the input with suffix * and prefix arrays (cgn & gwp's simpler algorithm, * and kjm's improvements). * * Copyright 2000 Gordon W. Paynter * Copyright 2000 The New Zealand Digital Library Project * * A component of the Greenstone digital library software from the * New Zealand Digital Library Project at the University of Waikato, * New Zealand. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of * the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * *********************************************************************/ #include #include #include #include #include #if defined(GSDL_USE_IOS_H) # include # include #else # include # include #endif #if defined(GSDL_USE_STL_H) # if defined(GSDL_USE_ALGO_H) # include # else # include # endif # include #else # include # include #endif #include "suffix.h" #include "phrase.h" #include "check.h" #include "mglong.h" // Global variables declared in suffix.h cellcount inputLength; symbol *symbols; symbol **suffixArray; symbol **prefixArray; // How many documents are in this collection? cellcount numberOfDocuments; symbol **documentArray; // Do we accept any phrase, or do we eliminate those ending with stopwords ? int phraseMode = ANYPHRASE; //STOPWORDS; // What is the minimum phrase frequency for a phrase to be included in the hierarchy int minOccurs = 2; // The filestem of the collection's phindex directory char collection[FILENAME_MAX]; // The ranges of the stopword and content-word symbols for the collection symbol firstStopSymbol = 0; symbol lastStopSymbol = 0; symbol firstContentSymbol = 0; symbol lastContentSymbol = 0; // Some useful comparison functions, defined below. int suffixCompare(const void *, const void *); int prefixCompare(const void *, const void *); int pointerCompare(const void *, const void *); // Functions for implementing "phrase memory". These let us "remember" // each phrase that we've expanded without using too much memory. void initialisePhraseMemory(); void rememberThisPhrase(cellindex index, cellcount length); bool isPhraseStored(cellindex index, cellcount length); void deletePhraseMemory(); // how much output do we want? int verbosity = 1; // Get a phrase's expansions // // Get the set of "minimal", "maximal", non-unique expansions of a // phrase p, using the simpler algorithm that Craig and Gordon came up // with at Google. // // Returns a vector of Expansions. void getExpansions(Phrase &p, vector &results) { // 1. Get the initial candidates vector candidates; p.initialSuffixCandidates(candidates); int suffcands = candidates.size(); p.initialPrefixCandidates(candidates); if (candidates.size() == 0) return; vector::iterator i; for (i = candidates.begin(); i != candidates.end(), suffcands>0; ++i, --suffcands) { i->expandWhileUniquePrefixExtension(); i->ensureSuffixFound(); } for (i; i != candidates.end(); ++i) { i->expandWhileUniqueSuffixExtension(); } // 3. Ensure minimality: ignore phrases whose subphrases are also found results.clear(); // Initialise the candidates, check array, and various variables. sort(candidates.begin(), candidates.end(), isShorter); unsigned minimum_length = candidates.begin()->length; clearSuffixCheck(); // Try to add each candidate to the results set, ignoring the non-minimal for (vector::iterator candidate = candidates.begin(); candidate != candidates.end(); ++candidate) { // Make a copy of candidate to mutilate while performing sub-phrase checks Phrase temp_phrase(*candidate); bool shorter_found = false; // Check for shorter and shorter versions of the temporary phrase while (temp_phrase.length >= minimum_length && !shorter_found) { temp_phrase.ensureSuffixFound(); if (getSuffixCheck(temp_phrase.firstSuffixIndex)==0) temp_phrase.shortenByOneAtPrefix(); else shorter_found = true; // Possible efficiency here: we can finish if the prefix of c // and temp_phrase are the same for candidate->length symbols. } // If no shorter phrase is found, use this one if (!shorter_found) { results.push_back(*candidate); candidate->ensureSuffixFound(); setSuffixCheck(candidate->firstSuffixIndex, candidate->lastSuffixIndex); } } } // suffixCompare // // Compare two pointers into a suffix array. We use this in the // qsort function, so the input are pointers to pointers. // // Return -1 if (a < b), otherwise (a > b) so return +1, int suffixCompare(const void *cpa, const void *cpb) { // Cast then dereference pointers to suffix array elements symbol *pa = (symbol *) cpa; symbol *pb = (symbol *) cpb; pa = (symbol *) *pa; pb = (symbol *) *pb; // If the two elements are the same, examine the next one while (*pa == *pb) { *pa++; *pb++; } // Make the copmparison and return if ( *pa < *pb) { return -1; } else { return +1; } } // prefixCompare // // Compare two pointers into a prefix array. We use this in the // qsort function, so the input are pointers to pointers. // // Return -1 if (a > b), otherwise (a < b) so return +1, int prefixCompare(const void *cpa, const void *cpb) { // Cast then dereference pointers to prefix array elements symbol *pa = (symbol *) cpa; symbol *pb = (symbol *) cpb; pa = (symbol *) *pa; pb = (symbol *) *pb; // If the two elements are the same, examine the next one while (*pa == *pb) { *pa--; *pb--; } // Make the copmparison and return if ( *pa > *pb) { return -1; } else { return +1; } } // simpleCompare // // Compare two pointers based on the memory location they point to. int pointerCompare( const void *pa, const void *pb ) { symbol **a = (symbol **) pa; symbol **b = (symbol **) pb; if (*a < *b) { return -1; } else if (*a > *b) { return 1; } else { return 0; } } // Read the clauses.numbers file into the "symbols" array. // // Each number in the file is a symbol number; it is essential that // the first symbol (and no others) be COLLECTIONSTART and the last // symbol (and no others) be COLLECTIONEND. // // Return the number of numbers in the array. int readNumbers() { char filename[FILENAME_MAX]; sprintf(filename, "%s/clauses.numbers", collection); if (verbosity) { cout << "Reading numbers file: " << filename << endl; } // Open the numbers file ifstream inFile1(filename, ios::in); if (!inFile1) { cerr << "File " << filename << " could not be opened\n"; exit(1); } // Count the number of symbols inputLength = 0; symbol word; while (inFile1 >> word) { ++inputLength; } inFile1.close(); // Allocate the symbbols array if (verbosity > 1) { cout << "Allocating symbol arrays for " << inputLength << " symbols" << endl; } symbols = new symbol[inputLength]; if (symbols == NULL) { cerr << "Suffix error: not enough memory to hold " << inputLength << " symbols." << endl; exit(2); } // Read the numbers file into the numbers array if (verbosity > 2) { cout << "Reading the numbers" << endl; } ifstream inFile2(filename, ios::in); if (!inFile2) { cerr << "File " << filename << " could not be opened\n"; exit(1); } cellcount next = 0; numberOfDocuments = 0; while (inFile2 >> word) { symbols[next++] = word; if (word == DOCUMENTSTART) { ++numberOfDocuments; } } inFile2.close(); // Make sure the numbers file is intact assert(symbols[0] == COLLECTIONSTART); assert(symbols[next-1] == COLLECTIONEND); return inputLength; } // Get Document Occurrance statistics // // Given a phrase, what documents does it occur in? cellcount getDocumentOccurrances(const Phrase &p, cellcount *frequency) { // cout << "searching for \""<< p << "\" in documents " // << 0 << "-" << numberOfDocuments - 1 << endl; // The number of documents in which this phrase occurs cellcount df = 0; // Initialise the document frequency array // for (cellindex i = 0; i < numberOfDocuments; ++i) { // frequency[i] = 0; //} memset(frequency, 0, sizeof(cellcount)*numberOfDocuments); // variables used to facilitate the search cellindex begin; cellindex end; cellindex d; symbol *target; bool found; // search for the document in which each occurence of the phrase is found for (cellcount j = p.firstSuffixIndex; j <= p.lastSuffixIndex; ++j) { // cout << "looking for phrase at suffixArray[" << j << "]\n"; target = suffixArray[j]; begin = 0; end = numberOfDocuments - 1; found = false; // Search for the occurence of a document delimiter that target // occurs immediately after. // We do this by performing a binary chop search on documentArray. while (!found) { // cout << "searching for " << (cellindex) target << " in " // << begin << " - " << end << endl; assert (begin <= end); // If the beginning and end of the interval are the same, // then we've found the correct document if (begin == end) { if (frequency[begin] == 0) { ++df; } ++frequency[begin]; found = true; } // Otherwise, examine a new document midway through the begin-end // interval and see if it is the one. else { d = (begin + end) / 2; if (target > documentArray[d]) { // If target addrss is greater than this, but thisi sthe last document, // then this must be the one we want. Or, if target is greater than // this one but less then the next, this must be the one we wnat. if ((d == numberOfDocuments - 1) || (target < documentArray[d+1])) { if (frequency[d] == 0) { ++df; } ++frequency[d]; found = true; } else { // otherwise we know to search later in the document set begin = d + 1; } } else { // search earlier in the document set end = d - 1; } } } } return df; } // phraseExpansionMemory : Which phrases have we expanded? // // A set of utilities for keeping track of which phrases we have expanded. // We don't want to expand a phrase more than once, after all. // // This REALLY ought to be in its own class, but it works so that's okay. // // Phrases are identified by their firstSuffixPosition and length. // // Functions provided are: // void initialisePhraseMemory() // void rememberThisPhrase(index, length) // bool isPhraseStored(index, length) // void deletePhraseMemory() // // Internally, we will have two separate cases: // // Phrases of length 1-8: // unsigned char phraseMemory[inputLength] // is an array where each cell "remembers" the corresponding index in the // suffixArray, and each of the 8 bits of the cell correspond to the phrases // of length 1, 2... 8. // Eventually, we will make this disk-based (i.e. store the array in a file). // // Phrases of length 9+: // file hashTableFile // file listOfEntries // The first file is a hash table; each phrase maps to one of its cells, which // contains either 0 (empty, no occurence) or a number which is an entry number // in the second file. This file contains a "list" of entries. Each consists of // three numbers: the suffixArray index of the phrase, the length of the phrase, // and the entry number of the next phrase with the same hash. // unsigned char *phraseMemory; void initialiseLongPhraseMemory(); void rememberThisLongPhrase(cellindex index, cellcount length); bool isLongPhraseStored(cellindex index, cellcount length); void deleteLongPhraseMemory(); void initialisePhraseMemory() { phraseMemory = new unsigned char[inputLength]; // to begin with, everything is empty // for (cellcount i = 0; i < inputLength; ++i) { // phraseMemory[i] = 0; //} memset(phraseMemory, 0, sizeof(char)*inputLength); // intialise the hashTable of long phrases initialiseLongPhraseMemory(); } void rememberThisPhrase(cellindex index, cellcount length) { // if the phrase is very long, use the file-based system if (length > 8) { rememberThisLongPhrase(index, length); return; } // create a char with just the bit corresponding to length set unsigned char newbit = 1; for (cellcount i = 1; i < length; ++i) { newbit <<= 1; } // set that bit in the memory array at position index phraseMemory[index] |= newbit; } bool isPhraseStored(cellindex index, cellcount length) { // if the phrase is very long, use the file-based system if (length > 8) { return isLongPhraseStored(index, length); } // create a char with just the bit corresponding to length set unsigned char newbit = 1; for (cellcount i = 1; i < length; ++i) { newbit <<= 1; } // retrurn true if that bit is set in memory arrayat position index return (phraseMemory[index] & newbit); } void deletePhraseMemory() { delete []phraseMemory; deleteLongPhraseMemory(); } // Files etc used to store "long" equavlents of the above fstream hashTableFile; char hashTableFileName[FILENAME_MAX]; fstream listOfEntries; char listOfEntriesName[FILENAME_MAX]; cellindex nextEntryNumber; const cellcount bigPrime = 7919; void initialiseLongPhraseMemory() { cellindex example = 0; sprintf(hashTableFileName, "%s/hashTable", collection); sprintf(listOfEntriesName, "%s/hashLists", collection); // create the new hashtable if (verbosity > 1) { cout << "Initialising hashTable: " << hashTableFileName << endl; } hashTableFile.open(hashTableFileName, ios::in | ios::out); for (cellcount i = 0; i < bigPrime; ++i) { hashTableFile.write((char *) &example, sizeof(example)); } // create the list of phrases if (verbosity > 1) { cout << "Initialising list of hashtable entries: " << listOfEntriesName << endl; } listOfEntries.open(listOfEntriesName, ios::in | ios::out); listOfEntries.write((char *) &example, sizeof(example)); listOfEntries.write((char *) &example, sizeof(example)); listOfEntries.write((char *) &example, sizeof(example)); nextEntryNumber = 1; } void rememberThisLongPhrase(cellindex index, cellcount length) { // cout << "rememberThisLongPhrase(" << index << ", " << length << ")\n"; cellindex hashOffset = ((index + length) % bigPrime) * sizeof(cellindex); cellindex pointer; cellindex zero = 0; cellindex readp = 0; cellindex readi = 0; cellindex readl = 0; hashTableFile.seekg(hashOffset); hashTableFile.read((char *) &pointer, sizeof(cellindex)); if (pointer == 0) { // There is no entry at all in the hash table for this entry // so create one pointer = nextEntryNumber++; hashTableFile.seekg(hashOffset); hashTableFile.write((char *) &pointer, sizeof(cellindex)); listOfEntries.seekp(pointer * sizeof(cellindex) * 3); listOfEntries.write((char *) &zero, sizeof(cellindex)); listOfEntries.write((char *) &index, sizeof(cellindex)); listOfEntries.write((char *) &length, sizeof(cellindex)); } else { // There is a list starting at this hash value, so the phrase may // be already remembered, or it might need to be appended while (pointer != 0) { // Read the entry pointed to by pointer listOfEntries.seekg(pointer * sizeof(cellindex) * 3); listOfEntries.read((char *) &readp, sizeof(cellindex)); listOfEntries.read((char *) &readi, sizeof(cellindex)); listOfEntries.read((char *) &readl, sizeof(cellindex)); // cout << "read " << pointer << ", " << readp << ", " << readi << ", " << readl << endl; if ((readi == index) && (readl = length)) { // we've found that we've already stored it return; } else if (readp == 0) { // we're reached the end of the list. Add a new entry. listOfEntries.seekp(pointer * sizeof(cellindex) * 3); listOfEntries.write((char *) &nextEntryNumber, sizeof(cellindex)); pointer = nextEntryNumber++; listOfEntries.seekp(pointer * sizeof(cellindex) * 3); listOfEntries.write((char *) &zero, sizeof(cellindex)); listOfEntries.write((char *) &index, sizeof(cellindex)); listOfEntries.write((char *) &length, sizeof(cellindex)); return; } else { // go on to the next node pointer = readp; } } } } bool isLongPhraseStored(cellindex index, cellcount length) { // cout << "isLongPhraseExpanded(" << index << ", " << length << ")\n"; cellindex hashOffset = ((index + length) % bigPrime) * sizeof(cellindex); cellindex pointer; cellindex readp = 0; cellindex readi = 0; cellindex readl = 0; // Find the phrase in the hashFile hashTableFile.seekg(hashOffset); hashTableFile.read((char *) &pointer, sizeof(cellindex)); if (pointer == 0) { // There is no entry at all in the hash table for this entry // so nothing is stored return false; } else { // There is a list starting at this hash value, so the phrase may // be already remembered in that list while (pointer != 0) { // Read the entry pointed to by pointer listOfEntries.seekg(pointer * sizeof(cellindex) * 3); listOfEntries.read((char *) &readp, sizeof(cellindex)); listOfEntries.read((char *) &readi, sizeof(cellindex)); listOfEntries.read((char *) &readl, sizeof(cellindex)); if ((readi == index) && (readl = length)) { // we've found the phrase stored here return true; } else { // go on to the next node pointer = readp; } } } return false; } void deleteLongPhraseMemory() { // remove the hash & other files hashTableFile.close(); listOfEntries.close(); remove(hashTableFileName); remove(listOfEntriesName); } // Read the collection statistics file // void readStatistics() { // open the statistics file char filename[FILENAME_MAX]; sprintf(filename, "%s/clauses.stats", collection); // Open the file ifstream inFile(filename, ios::in); if (!inFile) { cerr << "File " << filename << " could not be opened\n"; exit(1); } // Read the numbers file into the numbers array char key[1000]; symbol value; while (inFile >> key >> value){ if (strcmp(key, "first_stopword") == 0) { firstStopSymbol = value; } else if (strcmp(key, "last_stopword") == 0) { lastStopSymbol = value; } else if (strcmp(key, "first_contentword") == 0) { firstContentSymbol = value; } else if (strcmp(key, "last_contentword") == 0) { lastContentSymbol = value; } } inFile.close(); // Make sure we have the information we need if (!(firstStopSymbol && lastStopSymbol && firstContentSymbol && lastContentSymbol)) { cerr << "Statistics file incomplete" << endl; exit(1); } } cellcount getContentCount(symbol firstContent) { cellcount content=0; for (cellcount i=0; i=firstContent) ++content; } return content; } int main (int argc, char * argv[]) { // Command-line arguments // argv[1] is the phindex directory // argv[2] is the mode, where 1 is stopword mode // argv[3] is the min_occurs, - minimum occurrence frequency for a phrase to be included in the hierarchy // argv[4] is opitonal verbosity if (argc < 4) { cerr << "Usage: " << argv[0] << " phind-directory mode min-phrase-freq [verbosity]" << endl; exit(1); } // collection directory strcpy(collection, argv[1]); // mode parameter phraseMode = atoi(argv[2]); assert((phraseMode == STOPWORDS) || (phraseMode == ANYPHRASE)); minOccurs = atoi(argv[3]); assert((minOccurs > 0)); // optional verbosity parameter if (argc == 5) { verbosity = atoi(argv[4]); assert (verbosity >= 0); } if (verbosity) { cout << "suffix: the phrase extraction program" << endl; } if (verbosity > 1) { if (phraseMode == STOPWORDS) { cout << "Stopwords mode: no phrase may begin or end with a stopword" << endl; } else { cout << "AllPhrase mode: extract every phrase that occurs more than once" << endl; } } // Read the statistics file readStatistics(); // Read the numbers file readNumbers(); if (numberOfDocuments == 0) { cerr << "There are no documents in this collection!" << endl; exit(0); // not necessarily an error... } symbol firstContent; if (phraseMode==STOPWORDS) firstContent=firstContentSymbol; else firstContent = firstStopSymbol; // Allocate memory for the suffix & prefix arrays cellcount contentLength = 0; contentLength = getContentCount(firstContent); suffixArray = new symbol *[contentLength]; prefixArray = new symbol *[contentLength]; if (prefixArray == NULL) { cerr << "Suffix: not enough memory to hold " << inputLength << " symbols." << endl; exit(2); } allocateSuffixCheck(contentLength); // Initialise prefix and suffix arrays, only use the needed suffixes for (cellcount j = 0, here = 0; j < inputLength; ++j) { if (symbols[j]>=firstContent) { suffixArray[here] = &symbols[j]; prefixArray[here] = &symbols[j]; ++here; } } qsort(suffixArray, contentLength, sizeof(symbol *), suffixCompare); qsort(prefixArray, contentLength, sizeof(symbol *), prefixCompare); // Create the document arrays if (verbosity > 1) { cout << "Allocating document arrays for " << numberOfDocuments << " documents" << endl; } // The document frequecy array is used to count the number of times // each phrase occurs in each document. The number of documents in // which a phrase occurs is stored in df. frequency *documentFrequency = new frequency[numberOfDocuments]; frequency df; // documentArray will be searched in order to discover which document // each phrase occurs in. documentArray = new symbol *[numberOfDocuments]; if (documentArray == NULL) { cerr << "Suffix: out of memory allocating document arrays." << endl; exit(2); } // just scan through the input text to find the doc starts cellindex d = 0; for (cellindex i=0; i 1) { cout << "Starting pass " << phrasePass << endl; } ofstream outPhrase; char outPhraseName[FILENAME_MAX]; mg_u_long outPhraseCounter = 0; // unsigned long int outPhraseCounter = 0; // On the first pass, simply work through the vocabulary sprintf(outPhraseName, "%s/outPhrase.1", collection); outPhrase.open(outPhraseName, ios::out); if (!outPhrase) { cerr << "File " << outPhraseName << " could not be opened\n"; exit(1); } // Iterate over the different symbols by working through the suffix array vector result; cellindex ij = 0; char *tmpString; Phrase p; while (ij < inputLength) { // make a new phrase of length 1 p = Phrase(suffixArray[ij], 1, SUFFIX); p.findFirstAndLastSuffix(ij, inputLength-1); // We ignore this symbol if it occurs only once, if it is a delimiter, // of if we are in stopwords mode and it is a stopword // - in this new version, only need to check freq // We could imagine a new mode/command-line option, which is like // STOPWORDS but without this restrictrion. This would let you browse // from "the" to "the AGRIS" for example, but not from "AGRIS" to // "the AGRIS" (where the is a stopword and AGRIS a content word). // The system used to work like this; it is easy to implement, but // it explodes the size of the indexes. So: would it be useful? if (p.suffixFrequency >= minOccurs) { // Get minimal expansions of the phrase getExpansions(p, result); if (!result.empty()) { // Remember that we have expanded this phrase rememberThisPhrase(ij, 1); // write the phrase text phraseData << ij << "-1:" << p << ":" << p.suffixFrequency << ":" << result.size() << ":"; // write the results for (cellcount k = 0; k < result.size(); ++k) { if (k) { phraseData << ","; } phraseData << result[k].firstSuffixIndex << "-" << result[k].length; outPhrase << result[k].firstSuffixIndex << " " << result[k].length << endl; ++outPhraseCounter; } result.clear(); // Write the documents in which this phrase occurs df = getDocumentOccurrances(p, documentFrequency); phraseData << ":" << df << ":"; // write the documents for (cellcount m = 0, first = 1; m < numberOfDocuments; ++m) { if (documentFrequency[m]) { if (first) { first = 0; } else { phraseData << ";"; } // Output the document number. Note that here we've numbered the // N documents from 0 to N-1, but later they'll be 1-N. Thus we // add 1 to the document id when we output it. phraseData << "d" << (m+1); // Next, output the frequency with which the document occurs, but // only if it is > 1. if (documentFrequency[m] > 1) { phraseData << "," << documentFrequency[m]; } } } phraseData << endl; ++phraseCounter; // feedback if (verbosity) { if (phraseCounter % 1000 == 0) { cout << "phrase " << phraseCounter << ": " << "cell " << p.firstSuffixIndex << " - " << p << endl; } } } } ij = p.lastSuffixIndex + 1; } outPhrase.close(); // REMAINING PASSES // The previous outPhrase file forms the input to each new pass cellcount start, length; while (outPhraseCounter > 0) { // Start a new pass ++phrasePass; if (verbosity) { cout << "Starting pass " << phrasePass << endl; } // Open the input file char inPhraseName[FILENAME_MAX]; sprintf(inPhraseName, "%s/outPhrase.%d", collection, phrasePass - 1); ifstream inPhrase (inPhraseName, ios::in); if (!inPhrase) { cerr << "File " << inPhraseName << " could not be opened\n"; exit(1); } // Open the output file sprintf(outPhraseName, "%s/outPhrase.%d", collection, phrasePass); outPhrase.open(outPhraseName, ios::out); if (!outPhrase) { cerr << "File " << outPhraseName << " could not be opened\n"; exit(1); } outPhraseCounter = 0; // Process each phrase while(inPhrase >> start >> length) { // Ignore the phrase if we have expanded it before if (isPhraseStored(start, length)) continue; // Remember that we have examined this phrase rememberThisPhrase(start, length); // Find the phrase in the suffixarray p = Phrase(suffixArray[start], length, SUFFIX); p.findFirstAndLastSuffix(start, inputLength-1); // Ignore the phrase if it only occurs once if (p.suffixFrequency < minOccurs) continue; // Write the phrase text; phraseData << start << "-" << length << ":" << p << ":" << p.suffixFrequency << ":"; // Expand the phrase, if it is fewer than 8 words long if (length <= 8) { // Get the minimal expansions for this phrase getExpansions(p, result); // write the results phraseData << result.size() << ":"; for (cellcount i = 0; i < result.size(); ++i) { if (i) { phraseData << ","; } phraseData << result[i].firstSuffixIndex << "-" << result[i].length; outPhrase << result[i].firstSuffixIndex << " " << result[i].length << endl; ++outPhraseCounter; } result.clear(); } else { // phrase is too long to expand further phraseData << "0:"; } // Write the documents in which this phrase occurs df = getDocumentOccurrances(p, documentFrequency); phraseData << ":" << df << ":"; // write the documents for (cellcount i = 0, first = 1; i < numberOfDocuments; ++i) { if (documentFrequency[i]) { if (first) { first = 0; } else { phraseData << ";"; } // Output the document number. Note that here we've numbered the // N documents from 0 to N-1, but later they'll be 1-N. Thus we // add 1 to the document id when we output it. phraseData << "d" << (i+1); // Next, output the frequency with which the document occurs, but // only if it is > 1. if (documentFrequency[i] > 1) { phraseData << "," << documentFrequency[i]; } } } phraseData << endl; ++phraseCounter; // feedback if (verbosity) { if (phraseCounter % 1000 == 0) { cout << "phrase " << phraseCounter << ": "<< "start " << start << ", length " << length << " - " << p << endl; } } } inPhrase.close(); outPhrase.close(); } phraseData.close(); deletePhraseMemory(); delete [] documentFrequency; delete [] symbols; delete [] suffixArray; delete [] prefixArray; delete [] suffixCheck; delete [] documentArray; cout << endl << "Done: " << phraseCounter << " phrases in " << phraseDataName << endl; return 0; }