Changeset 2694 for trunk/gsdl/src
- Timestamp:
- 2001-08-12T10:28:09+12:00 (23 years ago)
- Location:
- trunk/gsdl/src/phind/generate
- Files:
-
- 3 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/src/phind/generate/phrase.cpp
r2674 r2694 67 67 68 68 69 Phrase::Phrase(const Phrase &p) { 70 forward = p.forward; 71 back = p.back; 72 length = p.length; 73 74 suffixFound = p.suffixFound; 75 prefixFound = p.prefixFound; 76 77 firstSuffix = p.firstSuffix; 78 lastSuffix = p.lastSuffix; 79 firstSuffixIndex = p.firstSuffixIndex; 80 lastSuffixIndex = p.lastSuffixIndex; 81 suffixFrequency = p.suffixFrequency; 82 83 firstPrefix = p.firstPrefix; 84 lastPrefix = p.lastPrefix; 85 firstPrefixIndex = p.firstPrefixIndex; 86 lastPrefixIndex = p.lastPrefixIndex; 87 prefixFrequency = p.prefixFrequency; 88 89 uniqueSuffixExtension = p.uniqueSuffixExtension; 90 uniquePrefixExtension = p.uniquePrefixExtension; 91 } 92 93 69 94 // Empty the contents of a phrase 70 95 … … 77 102 firstSuffix = firstPrefix = NULL; 78 103 lastSuffix = lastPrefix = NULL; 79 suffixFrequency = prefixFrequency = 0; 104 105 firstSuffixIndex = lastSuffixIndex = suffixFrequency = 0; 106 firstPrefixIndex = lastPrefixIndex = prefixFrequency = 0; 107 80 108 uniqueSuffixExtension = uniquePrefixExtension = -1; 81 109 … … 88 116 int Phrase::clearSuffix() { 89 117 suffixFound = 0; 90 firstSuffix = NULL; 91 lastSuffix = NULL; 92 suffixFrequency = 0; 118 firstSuffix = lastSuffix = NULL; 119 firstSuffixIndex = lastSuffixIndex = suffixFrequency = 0; 93 120 uniqueSuffixExtension = -1; 94 121 return 0; … … 97 124 int Phrase::clearPrefix() { 98 125 prefixFound = 0; 99 firstPrefix = NULL; 100 lastPrefix = NULL; 101 prefixFrequency = 0; 126 firstPrefix = lastPrefix = NULL; 127 firstPrefixIndex = lastPrefixIndex = prefixFrequency = 0; 102 128 uniquePrefixExtension = -1; 103 129 return 0; … … 141 167 142 168 169 // Output a phrase to a stream 170 std::ostream &operator<<(std::ostream &stream, const Phrase &phrase) 171 { 172 assert(phrase.forward); 173 symbol *s = phrase.forward; 174 175 stream << "s" << *s++; 176 for (cellcount i = 1; i < phrase.length; i++) 177 stream << " s" << *s++; 178 179 return stream; 180 } 181 143 182 144 183 // Convert the phrase to a string 145 char *Phrase::toString() { 146 184 // Note thgat you have to delete the memory yourself. 185 char *Phrase::toString() 186 { 147 187 assert(forward); 148 188 … … 596 636 597 637 598 // Ensure that the phrase has been found in the suffix & prefix arrays599 600 int Phrase::ensureSuffixFound() {601 if (!suffixFound) {602 findFirstAndLastSuffix();603 }604 return 0;605 }606 607 int Phrase::ensurePrefixFound() {608 if (!prefixFound) {609 findFirstAndLastPrefix();610 }611 return 0;612 }613 614 615 638 // Calculate a set of initial suffix/prefix candidates 616 639 // … … 619 642 // and add them to the end of the results vector 620 643 621 intPhrase::initialSuffixCandidates(vector<Phrase> &results) {644 void Phrase::initialSuffixCandidates(vector<Phrase> &results) { 622 645 623 646 ensureSuffixFound(); … … 639 662 // Move onto the next expansion 640 663 i = next.lastSuffixIndex + 1; 641 642 } 643 return 0; 644 } 645 646 647 int Phrase::initialPrefixCandidates(vector<Phrase> &results) { 664 } 665 } 666 667 668 void Phrase::initialPrefixCandidates(vector<Phrase> &results) { 648 669 649 670 ensurePrefixFound(); … … 665 686 // Move onto the next expansion 666 687 i = next.lastPrefixIndex + 1; 667 668 } 669 return 0; 688 } 670 689 } 671 690 -
trunk/gsdl/src/phind/generate/phrase.h
r2674 r2694 50 50 public: 51 51 52 // The phrase itself is stored with two pointers: forward 53 // points to its first cell, back points to its last. 54 // The length is always stored in length. 55 // If one of these is set, all must be set, and it must 56 // be true that (forward + length - 1) = back 52 // The phrase itself is stored with two pointers: forward points to 53 // its first cell, back points to its last. The length is always 54 // stored in length. If one of these is set, all must be set, and 55 // it must be true that (forward + length - 1) = back. 57 56 symbol *forward; 58 57 symbol *back; … … 76 75 77 76 // Constructor functions 78 // First argument is an array of words, second is the length of 79 // the phrase, third is the direction (SUFFIX of PREFIX) in 80 // which the words should be read (defaults to forwards). 77 Phrase(); 78 Phrase(const Phrase &p); 79 80 // A "partial" constructor: the first argument is an array of words, 81 // second is its length, third is the direction (SUFFIX or PREFIX) 82 // in which the words should be read (defaults to SUFFIX). 81 83 Phrase(symbol *words, cellcount size, int direction); 82 84 83 // An empty phrase can be created without arguments, but is 84 // good for nothing and may not be used with any public fuctions. 85 // We therefore only use it internally. 86 Phrase(); 87 88 // Represent the phrase as a string 85 // Represent the phrase as an arracy of characters 86 // You will have to call "delete []" on the array returned. 89 87 char *toString(); 90 88 91 89 // Find an initial set of candidate phrases in the suffix/prefix array 92 intinitialSuffixCandidates(vector<Phrase> &results);93 intinitialPrefixCandidates(vector<Phrase> &results);90 void initialSuffixCandidates(vector<Phrase> &results); 91 void initialPrefixCandidates(vector<Phrase> &results); 94 92 95 93 // Does the phrase have a unique extension? … … 116 114 117 115 // Make sure the phrase location in the suffix/prefix array is known 118 int ensureSuffixFound(); 119 int ensurePrefixFound(); 116 inline void Phrase::ensureSuffixFound() { 117 if (!suffixFound) 118 findFirstAndLastSuffix(); 119 } 120 inline void Phrase::ensurePrefixFound() { 121 if (!prefixFound) 122 findFirstAndLastPrefix(); 123 } 124 125 // Output a phrase to a stream 126 friend std::ostream &operator<<(std::ostream &stream, const Phrase &phrase); 120 127 121 128 private: … … 152 159 153 160 #endif 161 -
trunk/gsdl/src/phind/generate/suffix2.cpp
r2673 r2694 61 61 symbol *symbols; 62 62 symbol **suffixArray; 63 symbol **prefixArray; 63 64 check *suffixCheck; 64 symbol **prefixArray; 65 65 66 66 67 // How many documents are in this collection? … … 68 69 symbol **documentArray; 69 70 71 70 72 // Do we accept any phrase, or do we eliminate those ending with stopwords ? 71 73 int phraseMode = ANYPHRASE; //STOPWORDS; 72 74 75 73 76 // The filestem of the collection's phindex directory 74 77 char collection[FILENAME_MAX]; 75 78 76 int suffixCompare(const void *, const void *);77 int prefixCompare(const void *, const void *);78 int pointerCompare(const void *, const void *);79 80 int readNumbers();81 void readStatistics();82 83 void getExpansions(Phrase &p, vector<Phrase> &results);84 cellcount getDocumentOccurrances(Phrase &p, cellcount *frequency);85 79 86 80 // The ranges of the stopword and content-word symbols for the collection … … 91 85 92 86 93 94 95 // Phrase memory 96 // We have to "remember" each phrase that we've expanded 87 // Some useful comparison functions, defined below. 88 int suffixCompare(const void *, const void *); 89 int prefixCompare(const void *, const void *); 90 int pointerCompare(const void *, const void *); 91 92 93 // Functions for implementing "phrase memory". These let us "remember" 94 // each phrase that we've expanded without using two much memory. 97 95 void initialisePhraseMemory(); 98 96 void rememberThisPhrase(cellindex index, cellcount length); … … 105 103 106 104 107 int main (int argc, char * argv[]) {108 109 // Command-line arguments110 // argv[1] is the phindex directory111 // argv[2] is the maximum array symbol length (optional)112 // argv[3] is the mode, where 1 is stopword mode (optional)113 if (argc < 2) {114 cerr << "Usage: " << argv[0] << " phind-directory mode [verbosity]" << endl;115 exit(1);116 }117 118 // collection directory119 strcpy(collection, argv[1]);120 121 // mode parameter122 phraseMode = atoi(argv[2]);123 assert((phraseMode == STOPWORDS) || (phraseMode == ANYPHRASE));124 125 // optional verbosity parameter126 if (argc == 4) {127 verbosity = atoi(argv[3]);128 assert (verbosity >= 0);129 }130 131 if (verbosity) {132 cout << "suffix2: the simpler phrase extraction program" << endl;133 }134 135 if (verbosity > 1) {136 if (phraseMode == STOPWORDS) {137 cout << "Stopwords mode: no phrase may begin or end with a stopword" << endl;138 } else {139 cout << "AllPhrase mode: extract every phrase that occurs more than once" << endl;140 }141 }142 143 // Read the statistics file144 readStatistics();145 146 // Read the numbers file147 readNumbers();148 149 // Create the suffix & prefix arrays150 suffixArray = new symbol *[inputLength];151 prefixArray = new symbol *[inputLength];152 suffixCheck = new check[inputLength];153 if (suffixCheck == NULL) {154 cerr << "Suffix2 error: not enough memory to hold " << inputLength << " symbols." << endl;155 exit(2);156 }157 158 // Initialise prefix and suffix arrays159 for (cellcount j = 0; j < inputLength; j++) {160 suffixArray[j] = &symbols[j];161 prefixArray[j] = &symbols[j];162 }163 qsort(suffixArray, inputLength, sizeof(symbol *), suffixCompare);164 qsort(prefixArray, inputLength, sizeof(symbol *), prefixCompare);165 166 167 // Create the document arrays168 if (numberOfDocuments == 0) {169 cerr << "There are no documents in this collection!" << endl;170 exit(1);171 }172 if (verbosity > 1) {173 cout << "Allocating document arrays for " << numberOfDocuments << " documents" << endl;174 }175 176 // The document frequecy array is used to count the number of times177 // each phrase occurs in each document. The number of documents in178 // which a phrase occurs is stored in df.179 frequency *documentFrequency = new frequency[numberOfDocuments];180 frequency df;181 182 // documentArray will be searched in order to discover which document183 // each phrase occurs in.184 documentArray = new symbol *[numberOfDocuments];185 186 // Discover all the DOCUMENTSTART symbols and store as a phrase187 cellindex d = 0;188 while (*suffixArray[d] != DOCUMENTSTART) {189 d++;190 }191 Phrase p(suffixArray[d], 1, SUFFIX);192 p.findFirstAndLastSuffix(d, inputLength-1);193 194 // Insert the document locations time (as pointers) into documentArray195 for (cellcount i = 0; i < p.suffixFrequency; i++) {196 documentArray[i] = suffixArray[i + p.firstSuffixIndex];197 }198 199 // Sort the document array into ascending order of raw pointer value200 qsort(documentArray, numberOfDocuments, sizeof(symbol *), pointerCompare);201 202 203 // Extract phrases204 //205 // We will make several passesover the data, in each case considering206 // a set of input phrases and generating a set of output phrases, which207 // we will expancd in later passes.208 //209 // The input phrases in the first pass will be the vocabulary.210 // In later passes, the input phrases will be the output phrases of the211 // previous pass.212 //213 // In each pass we will consider each input phrase in turn. If we214 // have seen it before, we will ignore it. Otherwise, we will expand215 // it and add its expansions to the set of output phrases.216 217 // Store the phrase data in the phrases file218 char phraseDataName[FILENAME_MAX];219 sprintf(phraseDataName, "%s/phrases", collection);220 ofstream phraseData(phraseDataName, ios::out);221 if (!phraseData) {222 cout << "File " << phraseDataName << " could not be opened\n";223 exit(1);224 }225 226 // Count the number of phrases output227 unsigned long int phraseCounter = 0;228 229 // Set up the phrase expansion memory.230 // We need this so that we don't expand a phrase more than once231 initialisePhraseMemory();232 233 // The current pass numebr234 int phrasePass = 1;235 236 237 // PASS NUMBER 1238 if (verbosity > 1) {239 cout << "Starting pass " << phrasePass << endl;240 }241 242 ofstream outPhrase;243 char outPhraseName[FILENAME_MAX];244 unsigned long int outPhraseCounter = 0;245 246 // On the first pass, simply work through the vocabulary247 sprintf(outPhraseName, "%s/outPhrase.1", collection);248 outPhrase.open(outPhraseName, ios::out);249 if (!outPhrase) {250 cerr << "File " << outPhraseName << " could not be opened\n";251 exit(1);252 }253 254 // Iterate over the different symbols by working through the suffix array255 vector<Phrase> result;256 cellindex ij = 0;257 char *tmpString;258 259 while (ij < inputLength) {260 261 // make a new phrase of length 1262 p = Phrase(suffixArray[ij], 1, SUFFIX);263 p.findFirstAndLastSuffix(ij, inputLength-1);264 265 // cout << "cell " << ij << " - " << p.toString() << endl;266 267 // We ignore this symbol if it occurs only once, if it is a delimiter,268 // of if we are in stopwords mode and it is a stopword269 //270 // We could imagine a new mode/command-line option, which is like271 // STOPWORDS but without this restrictrion. This would let you browse272 // from "the" to "the AGRIS" for example, but not from "AGRIS" to273 // "the AGRIS" (where the is a stopword and AGRIS a content word).274 // The system used to work like this; it is easy to implement, but275 // it explodes the size of the indexes. So: would it be useful?276 if (!((p.suffixFrequency <= 1) ||277 // (*suffixArray[ij] != 23054) ||278 (*suffixArray[ij] <= LASTDELIMITER) ||279 ((phraseMode == STOPWORDS) && (*suffixArray[ij] <= lastStopSymbol)))) {280 281 // Get minimal expansions of the phrase282 getExpansions(p, result);283 284 if (!result.empty()) {285 286 // Remember that we have expanded this phrase287 rememberThisPhrase(ij, 1);288 289 // write the phrase text290 tmpString = p.toString();291 phraseData << ij << "-1:" << tmpString << ":" << p.suffixFrequency << ":"292 << result.size() << ":";293 delete [] tmpString;294 295 // write the results296 for (cellcount k = 0; k < result.size(); k++) {297 if (k) {298 phraseData << ",";299 }300 phraseData << result[k].firstSuffixIndex << "-" << result[k].length;301 outPhrase << result[k].firstSuffixIndex << " " << result[k].length << endl;302 outPhraseCounter++;303 }304 result.clear();305 306 // Write the documents in which this phrase occurs307 df = getDocumentOccurrances(p, documentFrequency);308 phraseData << ":" << df << ":";309 310 // write the documents311 for (cellcount m = 0, first = 1; m < numberOfDocuments; m++) {312 if (documentFrequency[m]) {313 if (first) {314 first = 0;315 } else {316 phraseData << ";";317 }318 // Output the document number. Note that here we've numbered the319 // N documents from 0 to N-1, but later they'll be 1-N. Thus we320 // add 1 to the document id when we output it.321 phraseData << "d" << (m+1);322 // Next, output the frequency with which the document occurs, but323 // only if it is > 1.324 if (documentFrequency[m] > 1) {325 phraseData << "," << documentFrequency[m];326 }327 }328 }329 330 phraseData << endl;331 phraseCounter++;332 333 // feedback334 if (verbosity) {335 if (phraseCounter % 1000 == 0) {336 tmpString = p.toString();337 cout << "phrase " << phraseCounter << ": "338 << "cell " << p.firstSuffixIndex << " - " << tmpString << endl;339 delete [] tmpString;340 }341 }342 }343 }344 ij = p.lastSuffixIndex + 1;345 }346 outPhrase.close();347 348 // REMAINING PASSES349 // The previous outPhrase file forms the input to each new pass350 cellcount start, length;351 while (outPhraseCounter > 0) {352 353 // Start a new pass354 phrasePass++;355 if (verbosity) {356 cout << "Starting pass " << phrasePass << endl;357 }358 359 // Open the input file360 char inPhraseName[FILENAME_MAX];361 sprintf(inPhraseName, "%s/outPhrase.%d", collection, phrasePass - 1);362 ifstream inPhrase (inPhraseName, ios::in);363 if (!inPhrase) {364 cerr << "File " << inPhraseName << " could not be opened\n";365 exit(1);366 }367 368 // Open the output file369 sprintf(outPhraseName, "%s/outPhrase.%d", collection, phrasePass);370 outPhrase.open(outPhraseName, ios::out);371 if (!outPhrase) {372 cerr << "File " << outPhraseName << " could not be opened\n";373 exit(1);374 }375 outPhraseCounter = 0;376 377 // Process each phrase378 while(inPhrase >> start >> length) {379 380 // Ignore the phrase if we have expanded it before381 if (isPhraseStored(start, length)) {382 continue;383 }384 385 // Remember that we have examined this phrase386 rememberThisPhrase(start, length);387 388 // Find the phrase in the suffixarray389 p = Phrase(suffixArray[start], length, SUFFIX);390 p.findFirstAndLastSuffix(start, inputLength-1);391 392 // cout << "index " << start << ", length " << length << " - " << p.toString() << endl;393 394 395 // Ignore the phrase if it only occurs once396 if (p.suffixFrequency < 2) {397 continue;398 }399 400 401 // Write the phrase text tmpString = p.toString();402 tmpString = p.toString();403 phraseData << start << "-" << length << ":" << tmpString << ":"404 << p.suffixFrequency << ":";405 delete [] tmpString;406 407 408 // Expand the phrase, if it is fewer than 8 words long409 if (length <= 8) {410 411 // Get the minimal expansions for this phrase412 getExpansions(p, result);413 414 // write the results415 phraseData << result.size() << ":";416 417 for (cellcount i = 0; i < result.size(); i++) {418 if (i) {419 phraseData << ",";420 }421 phraseData << result[i].firstSuffixIndex << "-" << result[i].length;422 outPhrase << result[i].firstSuffixIndex << " " << result[i].length << endl;423 outPhraseCounter++;424 }425 result.clear();426 427 } else {428 // phrase is too long to expand further429 phraseData << "0:";430 }431 432 433 // Write the documents in which this phrase occurs434 df = getDocumentOccurrances(p, documentFrequency);435 phraseData << ":" << df << ":";436 437 // write the documents438 for (cellcount i = 0, first = 1; i < numberOfDocuments; i++) {439 if (documentFrequency[i]) {440 if (first) {441 first = 0;442 } else {443 phraseData << ";";444 }445 // Output the document number. Note that here we've numbered the446 // N documents from 0 to N-1, but later they'll be 1-N. Thus we447 // add 1 to the document id when we output it.448 phraseData << "d" << (i+1);449 // Next, output the frequency with which the document occurs, but450 // only if it is > 1.451 if (documentFrequency[i] > 1) {452 phraseData << "," << documentFrequency[i];453 }454 }455 }456 457 phraseData << endl;458 phraseCounter++;459 460 // feedback461 if (verbosity) {462 if (phraseCounter % 1000 == 0) {463 tmpString = p.toString();464 cout << "phrase " << phraseCounter << ": "<< "start " << start465 << ", length " << length << " - " << tmpString << endl;466 delete [] tmpString;467 }468 }469 470 }471 472 inPhrase.close();473 outPhrase.close();474 }475 476 phraseData.close();477 deletePhraseMemory();478 479 delete [] documentFrequency;480 delete [] symbols;481 delete [] suffixArray;482 delete [] prefixArray;483 delete [] suffixCheck;484 delete [] documentArray;485 486 487 488 cout << endl << "Done: " << phraseCounter << " phrases in " << phraseDataName << endl;489 return 0;490 }491 492 493 494 495 496 105 // Get a phrase's expansions 497 106 // … … 510 119 511 120 // 2. Ensure maximality: expand each initial candidate both right and left 512 for (vector<Phrase>::iterator i = candidates.begin(); i != candidates.end(); i++) {121 for (vector<Phrase>::iterator i = candidates.begin(); i != candidates.end(); ++i) { 513 122 // We should be able to optimise this given we've already expanded in one direction 514 123 i->expandWhileUniquePrefixExtension(); … … 521 130 return; 522 131 523 for (cellcount j = 0; j < inputLength; j++) { 132 // Initialise the candidates, check array, and various variables. 133 sort(candidates.begin(), candidates.end(), isShorter); 134 for (cellcount j = 0; j < inputLength; j++) 524 135 suffixCheck[j] = 0; 525 } 526 sort(candidates.begin(), candidates.end(), isShorter); 527 528 unsigned minimum_length = p.length + 1; 529 if (candidates.begin()->length > minimum_length) 530 minimum_length = candidates.begin()->length; 136 unsigned minimum_length = candidates.begin()->length; 531 137 532 138 // Try to add each candidate to the results set, ignoring the non-minimal 533 for (vector<Phrase>::iterator candidate = candidates.begin(); candidate != candidates.end(); candidate++) { 534 535 // cerr << "* candidate of length " << candidate->length << ": (" << candidate->toString() << ")\n"; 536 537 // Make a copy of candidate that we will mutilate while performing sub-phrase checks 538 Phrase temp_phrase(candidate->forward, candidate->length, SUFFIX); 139 for (vector<Phrase>::iterator candidate = candidates.begin(); 140 candidate != candidates.end(); candidate++) { 141 142 // Make a copy of candidate to mutilate while performing sub-phrase checks 143 Phrase temp_phrase(*candidate); 539 144 bool shorter_found = false; 540 145 146 // Check for shorter and shorter versions of the tenporary phrase 541 147 while (temp_phrase.length >= minimum_length && !shorter_found) { 542 148 temp_phrase.ensureSuffixFound(); … … 551 157 552 158 if (!shorter_found) { 553 // cerr << "NOT FOUND! " << candidate->length << ": adding (" << candidate->toString() << ") : "554 // << candidate->firstSuffixIndex << "-" << candidate->lastSuffixIndex << "\n";555 159 results.push_back(*candidate); 556 160 candidate->ensureSuffixFound(); 557 for (cellcount k = candidate->firstSuffixIndex; k <= candidate->lastSuffixIndex; k++) {161 for (cellcount k = candidate->firstSuffixIndex; k <= candidate->lastSuffixIndex; ++k) 558 162 suffixCheck[k] = candidate->length; 559 }560 163 } 561 164 } … … 716 319 // Given a phrase, what documents does it occur in? 717 320 718 cellcount getDocumentOccurrances( Phrase &p, cellcount *frequency) {719 720 // cout << "searching for \""<< p .toString()<< "\" in documents "321 cellcount getDocumentOccurrances(const Phrase &p, cellcount *frequency) { 322 323 // cout << "searching for \""<< p << "\" in documents " 721 324 // << 0 << "-" << numberOfDocuments - 1 << endl; 722 325 … … 1003 606 } 1004 607 608 1005 609 bool isLongPhraseStored(cellindex index, cellcount length) { 1006 610 … … 1044 648 } 1045 649 650 1046 651 void deleteLongPhraseMemory() { 1047 652 // remove the hash & other files … … 1055 660 1056 661 1057 1058 1059 662 // Read the collection statistics file 663 // 1060 664 void readStatistics() { 1061 665 … … 1095 699 1096 700 1097 1098 1099 701 int main (int argc, char * argv[]) { 702 703 // Command-line arguments 704 // argv[1] is the phindex directory 705 // argv[2] is the maximum array symbol length (optional) 706 // argv[3] is the mode, where 1 is stopword mode (optional) 707 if (argc < 2) { 708 cerr << "Usage: " << argv[0] << " phind-directory mode [verbosity]" << endl; 709 exit(1); 710 } 711 712 // collection directory 713 strcpy(collection, argv[1]); 714 715 // mode parameter 716 phraseMode = atoi(argv[2]); 717 assert((phraseMode == STOPWORDS) || (phraseMode == ANYPHRASE)); 718 719 // optional verbosity parameter 720 if (argc == 4) { 721 verbosity = atoi(argv[3]); 722 assert (verbosity >= 0); 723 } 724 725 if (verbosity) { 726 cout << "suffix2: the simpler phrase extraction program" << endl; 727 } 728 729 if (verbosity > 1) { 730 if (phraseMode == STOPWORDS) { 731 cout << "Stopwords mode: no phrase may begin or end with a stopword" << endl; 732 } else { 733 cout << "AllPhrase mode: extract every phrase that occurs more than once" << endl; 734 } 735 } 736 737 // Read the statistics file 738 readStatistics(); 739 740 // Read the numbers file 741 readNumbers(); 742 743 // Create the suffix & prefix arrays 744 suffixArray = new symbol *[inputLength]; 745 prefixArray = new symbol *[inputLength]; 746 suffixCheck = new check[inputLength]; 747 if (suffixCheck == NULL) { 748 cerr << "Suffix2 error: not enough memory to hold " << inputLength << " symbols." << endl; 749 exit(2); 750 } 751 752 // Initialise prefix and suffix arrays 753 for (cellcount j = 0; j < inputLength; j++) { 754 suffixArray[j] = &symbols[j]; 755 prefixArray[j] = &symbols[j]; 756 } 757 qsort(suffixArray, inputLength, sizeof(symbol *), suffixCompare); 758 qsort(prefixArray, inputLength, sizeof(symbol *), prefixCompare); 759 760 761 // Create the document arrays 762 if (numberOfDocuments == 0) { 763 cerr << "There are no documents in this collection!" << endl; 764 exit(1); 765 } 766 if (verbosity > 1) { 767 cout << "Allocating document arrays for " << numberOfDocuments << " documents" << endl; 768 } 769 770 // The document frequecy array is used to count the number of times 771 // each phrase occurs in each document. The number of documents in 772 // which a phrase occurs is stored in df. 773 frequency *documentFrequency = new frequency[numberOfDocuments]; 774 frequency df; 775 776 // documentArray will be searched in order to discover which document 777 // each phrase occurs in. 778 documentArray = new symbol *[numberOfDocuments]; 779 780 // Discover all the DOCUMENTSTART symbols and store as a phrase 781 cellindex d = 0; 782 while (*suffixArray[d] != DOCUMENTSTART) { 783 d++; 784 } 785 Phrase p(suffixArray[d], 1, SUFFIX); 786 p.findFirstAndLastSuffix(d, inputLength-1); 787 788 // Insert the document locations time (as pointers) into documentArray 789 for (cellcount i = 0; i < p.suffixFrequency; i++) { 790 documentArray[i] = suffixArray[i + p.firstSuffixIndex]; 791 } 792 793 // Sort the document array into ascending order of raw pointer value 794 qsort(documentArray, numberOfDocuments, sizeof(symbol *), pointerCompare); 795 796 797 // Extract phrases 798 // 799 // We will make several passesover the data, in each case considering 800 // a set of input phrases and generating a set of output phrases, which 801 // we will expancd in later passes. 802 // 803 // The input phrases in the first pass will be the vocabulary. 804 // In later passes, the input phrases will be the output phrases of the 805 // previous pass. 806 // 807 // In each pass we will consider each input phrase in turn. If we 808 // have seen it before, we will ignore it. Otherwise, we will expand 809 // it and add its expansions to the set of output phrases. 810 811 // Store the phrase data in the phrases file 812 char phraseDataName[FILENAME_MAX]; 813 sprintf(phraseDataName, "%s/phrases", collection); 814 ofstream phraseData(phraseDataName, ios::out); 815 if (!phraseData) { 816 cout << "File " << phraseDataName << " could not be opened\n"; 817 exit(1); 818 } 819 820 // Count the number of phrases output 821 unsigned long int phraseCounter = 0; 822 823 // Set up the phrase expansion memory. 824 // We need this so that we don't expand a phrase more than once 825 initialisePhraseMemory(); 826 827 // The current pass numebr 828 int phrasePass = 1; 829 830 831 // PASS NUMBER 1 832 if (verbosity > 1) { 833 cout << "Starting pass " << phrasePass << endl; 834 } 835 836 ofstream outPhrase; 837 char outPhraseName[FILENAME_MAX]; 838 unsigned long int outPhraseCounter = 0; 839 840 // On the first pass, simply work through the vocabulary 841 sprintf(outPhraseName, "%s/outPhrase.1", collection); 842 outPhrase.open(outPhraseName, ios::out); 843 if (!outPhrase) { 844 cerr << "File " << outPhraseName << " could not be opened\n"; 845 exit(1); 846 } 847 848 // Iterate over the different symbols by working through the suffix array 849 vector<Phrase> result; 850 cellindex ij = 0; 851 char *tmpString; 852 853 while (ij < inputLength) { 854 855 // make a new phrase of length 1 856 p = Phrase(suffixArray[ij], 1, SUFFIX); 857 p.findFirstAndLastSuffix(ij, inputLength-1); 858 859 // We ignore this symbol if it occurs only once, if it is a delimiter, 860 // of if we are in stopwords mode and it is a stopword 861 // 862 // We could imagine a new mode/command-line option, which is like 863 // STOPWORDS but without this restrictrion. This would let you browse 864 // from "the" to "the AGRIS" for example, but not from "AGRIS" to 865 // "the AGRIS" (where the is a stopword and AGRIS a content word). 866 // The system used to work like this; it is easy to implement, but 867 // it explodes the size of the indexes. So: would it be useful? 868 if (!((p.suffixFrequency <= 1) || 869 (*suffixArray[ij] <= LASTDELIMITER) || 870 ((phraseMode == STOPWORDS) && (*suffixArray[ij] <= lastStopSymbol)))) { 871 872 // Get minimal expansions of the phrase 873 getExpansions(p, result); 874 875 if (!result.empty()) { 876 877 // Remember that we have expanded this phrase 878 rememberThisPhrase(ij, 1); 879 880 // write the phrase text 881 phraseData << ij << "-1:" << p << ":" << p.suffixFrequency << ":" 882 << result.size() << ":"; 883 884 // write the results 885 for (cellcount k = 0; k < result.size(); k++) { 886 if (k) { 887 phraseData << ","; 888 } 889 phraseData << result[k].firstSuffixIndex << "-" << result[k].length; 890 outPhrase << result[k].firstSuffixIndex << " " << result[k].length << endl; 891 outPhraseCounter++; 892 } 893 result.clear(); 894 895 // Write the documents in which this phrase occurs 896 df = getDocumentOccurrances(p, documentFrequency); 897 phraseData << ":" << df << ":"; 898 899 // write the documents 900 for (cellcount m = 0, first = 1; m < numberOfDocuments; m++) { 901 if (documentFrequency[m]) { 902 if (first) { 903 first = 0; 904 } else { 905 phraseData << ";"; 906 } 907 // Output the document number. Note that here we've numbered the 908 // N documents from 0 to N-1, but later they'll be 1-N. Thus we 909 // add 1 to the document id when we output it. 910 phraseData << "d" << (m+1); 911 // Next, output the frequency with which the document occurs, but 912 // only if it is > 1. 913 if (documentFrequency[m] > 1) { 914 phraseData << "," << documentFrequency[m]; 915 } 916 } 917 } 918 919 phraseData << endl; 920 phraseCounter++; 921 922 // feedback 923 if (verbosity) { 924 if (phraseCounter % 1000 == 0) { 925 cout << "phrase " << phraseCounter << ": " 926 << "cell " << p.firstSuffixIndex << " - " << p << endl; 927 } 928 } 929 } 930 } 931 ij = p.lastSuffixIndex + 1; 932 } 933 outPhrase.close(); 934 935 // REMAINING PASSES 936 // The previous outPhrase file forms the input to each new pass 937 cellcount start, length; 938 while (outPhraseCounter > 0) { 939 940 // Start a new pass 941 phrasePass++; 942 if (verbosity) { 943 cout << "Starting pass " << phrasePass << endl; 944 } 945 946 // Open the input file 947 char inPhraseName[FILENAME_MAX]; 948 sprintf(inPhraseName, "%s/outPhrase.%d", collection, phrasePass - 1); 949 ifstream inPhrase (inPhraseName, ios::in); 950 if (!inPhrase) { 951 cerr << "File " << inPhraseName << " could not be opened\n"; 952 exit(1); 953 } 954 955 // Open the output file 956 sprintf(outPhraseName, "%s/outPhrase.%d", collection, phrasePass); 957 outPhrase.open(outPhraseName, ios::out); 958 if (!outPhrase) { 959 cerr << "File " << outPhraseName << " could not be opened\n"; 960 exit(1); 961 } 962 outPhraseCounter = 0; 963 964 // Process each phrase 965 while(inPhrase >> start >> length) { 966 967 // Ignore the phrase if we have expanded it before 968 if (isPhraseStored(start, length)) 969 continue; 970 971 // Remember that we have examined this phrase 972 rememberThisPhrase(start, length); 973 974 // Find the phrase in the suffixarray 975 p = Phrase(suffixArray[start], length, SUFFIX); 976 p.findFirstAndLastSuffix(start, inputLength-1); 977 978 // Ignore the phrase if it only occurs once 979 if (p.suffixFrequency < 2) 980 continue; 981 982 // Write the phrase text; 983 phraseData << start << "-" << length << ":" << p << ":" << p.suffixFrequency << ":"; 984 985 // Expand the phrase, if it is fewer than 8 words long 986 if (length <= 8) { 987 988 // Get the minimal expansions for this phrase 989 getExpansions(p, result); 990 991 // write the results 992 phraseData << result.size() << ":"; 993 994 for (cellcount i = 0; i < result.size(); i++) { 995 if (i) { 996 phraseData << ","; 997 } 998 phraseData << result[i].firstSuffixIndex << "-" << result[i].length; 999 outPhrase << result[i].firstSuffixIndex << " " << result[i].length << endl; 1000 outPhraseCounter++; 1001 } 1002 result.clear(); 1003 1004 } else { 1005 // phrase is too long to expand further 1006 phraseData << "0:"; 1007 } 1008 1009 // Write the documents in which this phrase occurs 1010 df = getDocumentOccurrances(p, documentFrequency); 1011 phraseData << ":" << df << ":"; 1012 1013 // write the documents 1014 for (cellcount i = 0, first = 1; i < numberOfDocuments; i++) { 1015 if (documentFrequency[i]) { 1016 if (first) { 1017 first = 0; 1018 } else { 1019 phraseData << ";"; 1020 } 1021 // Output the document number. Note that here we've numbered the 1022 // N documents from 0 to N-1, but later they'll be 1-N. Thus we 1023 // add 1 to the document id when we output it. 1024 phraseData << "d" << (i+1); 1025 // Next, output the frequency with which the document occurs, but 1026 // only if it is > 1. 1027 if (documentFrequency[i] > 1) { 1028 phraseData << "," << documentFrequency[i]; 1029 } 1030 } 1031 } 1032 1033 phraseData << endl; 1034 phraseCounter++; 1035 1036 // feedback 1037 if (verbosity) { 1038 if (phraseCounter % 1000 == 0) { 1039 cout << "phrase " << phraseCounter << ": "<< "start " << start 1040 << ", length " << length << " - " << p << endl; 1041 } 1042 } 1043 1044 } 1045 1046 inPhrase.close(); 1047 outPhrase.close(); 1048 } 1049 1050 phraseData.close(); 1051 deletePhraseMemory(); 1052 1053 delete [] documentFrequency; 1054 delete [] symbols; 1055 delete [] suffixArray; 1056 delete [] prefixArray; 1057 delete [] suffixCheck; 1058 delete [] documentArray; 1059 1060 1061 1062 cout << endl << "Done: " << phraseCounter << " phrases in " << phraseDataName << endl; 1063 return 0; 1064 } 1065 1066 1067 1068 1069
Note:
See TracChangeset
for help on using the changeset viewer.