Changeset 2801
- Timestamp:
- 2001-10-15T15:02:54+13:00 (23 years ago)
- Location:
- trunk/gsdl/src/phind/generate
- Files:
-
- 3 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/src/phind/generate/phrase.cpp
r2704 r2801 153 153 --length; 154 154 --back; 155 if (phraseMode==STOPWORDS) { 156 while (*back >=firstStopSymbol && *back <= lastStopSymbol) { 157 --length; 158 --back; 159 } 160 } 155 161 clearSuffix(); 156 162 clearPrefix(); … … 161 167 --length; 162 168 ++forward; 169 if (phraseMode==STOPWORDS) { 170 while (*forward >=firstStopSymbol && *forward <= lastStopSymbol) { 171 --length; 172 ++forward; 173 } 174 } 163 175 clearSuffix(); 164 176 clearPrefix(); … … 391 403 392 404 int Phrase::findFirstAndLastSuffix() { 405 393 406 return findFirstAndLastSuffix(0, inputLength-1); 394 407 } … … 399 412 assert(begin <= end); 400 413 401 // cout << "findFirstAndLastSuffix (" << begin << " - " << end << ") : " << toString() << endl;402 403 414 // if we're only searching one cell, it is very easy 404 415 if (begin == end) { … … 415 426 416 427 do { 417 // cout << "Find anywhere in " << begin << " - " << end << endl;418 428 c = (end + begin) / 2; 419 429 cmp = compareSuffix(suffixArray[c], length); … … 436 446 437 447 do { 438 // cout << "first-searching with " << begin << " - " << end << endl;439 448 if (begin == end) { 440 449 c = begin; … … 451 460 // to find the first occurance, suffixArray[c] must be the same as the 452 461 // phrase, but suffixArray[c-1] must be different. 453 cmp = compareSuffix(suffixArray[c-1], length); 454 if (cmp == 0) { 455 end = c - 1; 456 assert(end >= begin); 457 cmp = 1; 458 } else { 459 cmp = 0; 462 if (c>0) { 463 cmp = compareSuffix(suffixArray[c-1], length); 464 if (cmp == 0) { 465 end = c - 1; 466 assert(end >= begin); 467 cmp = 1; 468 } else { 469 cmp = 0; 470 } 460 471 } 461 472 } … … 473 484 474 485 do { 475 // cout << "last-searching with " << begin << " - " << end << endl;476 486 if (begin == end) { 477 487 c = begin; … … 525 535 assert(begin <= end); 526 536 527 // cout << "findFirstAndLastPrefix (" << begin << " - " << end << ") : " << toString() << endl;528 529 537 // if we're only searching one cell, it is very easy 530 538 if (begin == end) { … … 541 549 542 550 do { 543 // cout << "Find anywhere in prefixarray " << begin << " - " << end << endl;544 551 c = (end + begin) / 2; 545 552 cmp = comparePrefix(prefixArray[c], length); … … 560 567 561 568 do { 562 // cout << "first-prefix-searching with " << begin << " - " << end << endl;563 569 if (begin == end) { 564 570 c = begin; … … 567 573 c = (begin + end) / 2; 568 574 cmp = comparePrefix(prefixArray[c], length); 569 // cout << "cmp: " << cmp << ", c: " << c << endl;570 575 if (cmp == 1) { 571 576 // target phrase is lower than phrase at prefixArray[c] … … 601 606 602 607 do { 603 // cout << "last-prefix-searching with " << begin << " - " << end << endl;604 608 if (begin == end) { 605 609 c = begin; … … 829 833 } 830 834 831 832 833 834 835 836 837 838 839 840 841 842 835 // Compare the length of two phrases 843 836 // -
trunk/gsdl/src/phind/generate/phrase.h
r2694 r2801 126 126 friend std::ostream &operator<<(std::ostream &stream, const Phrase &phrase); 127 127 128 int uniqueSuffixExtension; 129 int uniquePrefixExtension; 130 128 131 private: 129 132 130 133 // Does the phrase have a unique suffix/prefix extension? 131 134 // if yes, then 1; if no then 0; if unknown then -1; 132 int uniqueSuffixExtension;133 int uniquePrefixExtension;134 135 135 136 // reset a phrase -
trunk/gsdl/src/phind/generate/suffix.cpp
r2498 r2801 1 1 /********************************************************************** 2 2 * 3 * suffix.cpp -- Extract the repeated phrases in the input using 4 * suffix and prefix arrays. 3 * Suffix.cpp -- Extract the repeated phrases in the input with suffix 4 * and prefix arrays (cgn & gwp's simpler algorithm, 5 * and kjm's improvements). 5 6 * 6 7 * Copyright 2000 Gordon W. Paynter … … 61 62 symbol *symbols; 62 63 symbol **suffixArray; 64 symbol **prefixArray; 63 65 check *suffixCheck; 64 symbol **prefixArray;65 check *prefixCheck;66 66 67 67 … … 70 70 symbol **documentArray; 71 71 72 72 73 // Do we accept any phrase, or do we eliminate those ending with stopwords ? 73 74 int phraseMode = ANYPHRASE; //STOPWORDS; 74 75 76 75 77 // The filestem of the collection's phindex directory 76 78 char collection[FILENAME_MAX]; 77 79 78 int suffixCompare(const void *, const void *);79 int prefixCompare(const void *, const void *);80 int pointerCompare(const void *, const void *);81 82 int readNumbers();83 void readStatistics();84 85 void getMinimalExpansions(Phrase &p, vector<Phrase> &results);86 cellcount getDocumentOccurrances(Phrase &p, cellcount *frequency);87 80 88 81 // The ranges of the stopword and content-word symbols for the collection … … 93 86 94 87 95 96 97 // Phrase memory 98 // We have to "remember" each phrase that we've expanded 88 // Some useful comparison functions, defined below. 89 int suffixCompare(const void *, const void *); 90 int prefixCompare(const void *, const void *); 91 int pointerCompare(const void *, const void *); 92 93 94 // Functions for implementing "phrase memory". These let us "remember" 95 // each phrase that we've expanded without using too much memory. 99 96 void initialisePhraseMemory(); 100 97 void rememberThisPhrase(cellindex index, cellcount length); … … 107 104 108 105 109 int main (int argc, char * argv[]) { 110 111 // Command-line arguments 112 // argv[1] is the phindex directory 113 // argv[2] is the maximum array symbol length (optional) 114 // argv[3] is the mode, where 1 is stopword mode (optional) 115 if (argc < 2) { 116 cerr << "Usage: " << argv[0] << " phind-directory mode [verbosity]" << endl; 117 exit(1); 118 } 119 120 // collection directory 121 strcpy(collection, argv[1]); 122 123 // mode parameter 124 phraseMode = atoi(argv[2]); 125 assert((phraseMode == STOPWORDS) || (phraseMode == ANYPHRASE)); 126 127 // optional verbosity parameter 128 if (argc == 4) { 129 verbosity = atoi(argv[3]); 130 assert (verbosity >= 0); 131 } 132 133 if (verbosity) { 134 cout << "Suffix phrase extraction program" << endl; 135 } 136 137 if (verbosity > 1) { 138 if (phraseMode == STOPWORDS) { 139 cout << "Stopwords mode: no phrase may begin or end with a stopword" << endl; 140 } else { 141 cout << "AllPhrase mode: extract every phrase that occurs more than once" << endl; 142 } 143 } 144 145 // Read the statistics file 146 readStatistics(); 147 148 // Read the numbers file 149 readNumbers(); 150 151 // Create the suffix & prefix arrays 152 suffixArray = new symbol *[inputLength]; 153 prefixArray = new symbol *[inputLength]; 154 suffixCheck = new check[inputLength]; 155 prefixCheck = new check[inputLength]; 156 if (prefixCheck == NULL) { 157 cerr << "Suffix error: not enough memory to hold " << inputLength 158 << " symbols." << endl; 159 exit(2); 160 } 161 162 // Initialise prefix and suffix arrays 163 for (cellcount j = 0; j < inputLength; j++) { 164 suffixArray[j] = &symbols[j]; 165 prefixArray[j] = &symbols[j]; 166 } 167 qsort(suffixArray, inputLength, sizeof(symbol *), suffixCompare); 168 qsort(prefixArray, inputLength, sizeof(symbol *), prefixCompare); 169 170 171 // Create the document arrays 172 if (numberOfDocuments == 0) { 173 cerr << "There are no documents in this collection!" << endl; 174 exit(1); 175 } 176 if (verbosity > 1) { 177 cout << "Allocating document arrays for " << numberOfDocuments << " documents" << endl; 178 } 179 180 // The document frequecy array is used to count the number of times 181 // each phrase occurs in each document. The number of documents in 182 // which a phrase occurs is stored in df. 183 frequency *documentFrequency = new frequency[numberOfDocuments]; 184 frequency df; 185 186 // documentArray will be searched in order to discover which document 187 // each phrase occurs in. 188 documentArray = new symbol *[numberOfDocuments]; 189 190 // Discover all the DOCUMENTSTART symbols and store as a phrase 191 cellindex d = 0; 192 while (*suffixArray[d] != DOCUMENTSTART) { 193 d++; 194 } 195 Phrase p(suffixArray[d], 1, SUFFIX); 196 p.findFirstAndLastSuffix(d, inputLength-1); 106 // Get a phrase's expansions 107 // 108 // Get the set of "minimal", "maximal", non-unique expansions of a 109 // phrase p, using the simpler algorithm that Craig and Gordon came up 110 // with at Google. 111 // 112 // Returns a vector of Expansions. 113 114 void getExpansions(Phrase &p, vector<Phrase> &results) { 115 116 // 1. Get the initial candidates 117 vector<Phrase> candidates; 118 p.initialSuffixCandidates(candidates); 119 int suffcands = candidates.size(); 120 p.initialPrefixCandidates(candidates); 121 122 if (candidates.size() == 0) 123 return; 124 125 vector<Phrase>::iterator i; 126 for (i = candidates.begin(); i != candidates.end(), suffcands>0; ++i, --suffcands) { 127 i->expandWhileUniquePrefixExtension(); 128 i->ensureSuffixFound(); 129 } 130 for (i; i != candidates.end(); ++i) { 131 i->expandWhileUniqueSuffixExtension(); 132 } 133 134 // 3. Ensure minimality: ignore phrases whose subphrases are also found 135 results.clear(); 136 137 // Initialise the candidates, check array, and various variables. 138 sort(candidates.begin(), candidates.end(), isShorter); 139 for (cellcount j = 0; j < inputLength; j++) 140 suffixCheck[j] = 0; 141 unsigned minimum_length = candidates.begin()->length; 197 142 198 // Insert the document locations (as pointers) into documentArray 199 for (cellcount i = 0; i < p.suffixFrequency; i++) { 200 documentArray[i] = suffixArray[i + p.firstSuffixIndex]; 201 } 202 203 // Sort the document array into ascending order of raw pointer value 204 qsort(documentArray, numberOfDocuments, sizeof(symbol *), pointerCompare); 205 206 207 // Extract phrases 208 // 209 // We will make several passesover the data, in each case considering 210 // a set of input phrases and generating a set of output phrases, which 211 // we will expancd in later passes. 212 // 213 // The input phrases in the first pass will be the vocabulary. 214 // In later passes, the input phrases will be the output phrases of the 215 // previous pass. 216 // 217 // In each pass we will consider each input phrase in turn. If we 218 // have seen it before, we will ignore it. Otherwise, we will expand 219 // it and add its expansions to the set of output phrases. 220 221 // Store the phrase data in the phrases file 222 char phraseDataName[FILENAME_MAX]; 223 sprintf(phraseDataName, "%s/phrases", collection); 224 ofstream phraseData(phraseDataName, ios::out); 225 if (!phraseData) { 226 cout << "File " << phraseDataName << " could not be opened\n"; 227 exit(1); 228 } 229 230 // Count the number of phrases output 231 unsigned long int phraseCounter = 0; 232 233 // Set up the phrase expansion memory. 234 // We need this so that we don't expand a phrase more than once 235 initialisePhraseMemory(); 236 237 // The current pass numebr 238 int phrasePass = 1; 239 240 241 // PASS NUMBER 1 242 if (verbosity > 1) { 243 cout << "Starting pass " << phrasePass << endl; 244 } 245 246 ofstream outPhrase; 247 char outPhraseName[FILENAME_MAX]; 248 unsigned long int outPhraseCounter = 0; 249 250 // On the first pass, simply work through the vocabulary 251 sprintf(outPhraseName, "%s/outPhrase.1", collection); 252 outPhrase.open(outPhraseName, ios::out); 253 if (!outPhrase) { 254 cerr << "File " << outPhraseName << " could not be opened\n"; 255 exit(1); 256 } 257 258 // Iterate over the different symbols by working through the suffix array 259 vector<Phrase> result; 260 cellindex ij = 0; 261 char *tmpString; 262 263 while (ij < inputLength) { 264 265 // make a new phrase of length 1 266 p = Phrase(suffixArray[ij], 1, SUFFIX); 267 p.findFirstAndLastSuffix(ij, inputLength-1); 268 269 // cout << "cell " << ij << " - " << p.toString() << endl; 270 271 // We ignore this symbol if it occurs only once, if it is a delimiter, 272 // of if we are in stopwords mode and it is a stopword 273 // 274 // We could imagine a new mode/command-line option, which is like 275 // STOPWORDS but without this restrictrion. This would let you browse 276 // from "the" to "the AGRIS" for example, but not from "AGRIS" to 277 // "the AGRIS" (where the is a stopword and AGRIS a content word). 278 // The system used to work like this; it is easy to implement, but 279 // it explodes the size of the indexes. So: would it be useful? 280 if (!((p.suffixFrequency <= 1) || 281 // (*suffixArray[ij] != 23054) || 282 (*suffixArray[ij] <= LASTDELIMITER) || 283 ((phraseMode == STOPWORDS) && (*suffixArray[ij] <= lastStopSymbol)))) { 284 285 // Get minimal expansions of the phrase 286 getMinimalExpansions(p, result); 143 // Try to add each candidate to the results set, ignoring the non-minimal 144 for (vector<Phrase>::iterator candidate = candidates.begin(); 145 candidate != candidates.end(); candidate++) { 146 147 // Make a copy of candidate to mutilate while performing sub-phrase checks 148 Phrase temp_phrase(*candidate); 149 bool shorter_found = false; 150 151 // Check for shorter and shorter versions of the tenporary phrase 152 while (temp_phrase.length >= minimum_length && !shorter_found) { 153 temp_phrase.ensureSuffixFound(); 154 if (suffixCheck[temp_phrase.firstSuffixIndex] == 0) 155 temp_phrase.shortenByOneAtPrefix(); 156 else 157 shorter_found = true; 158 159 // Possible efficiency here: we can finish if the prefix of c 160 // and temp_phrase are the same for candidate->length symbols. 161 } 287 162 288 if (!result.empty()) { 289 290 // Remember that we have expanded this phrase 291 rememberThisPhrase(ij, 1); 292 293 // write the phrase text 294 tmpString = p.toString(); 295 phraseData << ij << "-1:" << tmpString << ":" << p.suffixFrequency << ":" 296 << result.size() << ":"; 297 delete [] tmpString; 298 299 // write the results 300 for (cellcount k = 0; k < result.size(); k++) { 301 if (k) { 302 phraseData << ","; 303 } 304 phraseData << result[k].firstSuffixIndex << "-" << result[k].length; 305 outPhrase << result[k].firstSuffixIndex << " " << result[k].length << endl; 306 outPhraseCounter++; 307 } 308 result.clear(); 309 310 // Write the documents in which this phrase occurs 311 df = getDocumentOccurrances(p, documentFrequency); 312 phraseData << ":" << df << ":"; 313 314 // write the documents 315 for (cellcount m = 0, first = 1; m < numberOfDocuments; m++) { 316 if (documentFrequency[m]) { 317 if (first) { 318 first = 0; 319 } else { 320 phraseData << ";"; 321 } 322 // Output the document number. Note that here we've numbered the 323 // N documents from 0 to N-1, but later they'll be 1-N. Thus we 324 // add 1 to the document id when we output it. 325 phraseData << "d" << (m+1); 326 // Next, output the frequency with which the document occurs, but 327 // only if it is > 1. 328 if (documentFrequency[m] > 1) { 329 phraseData << "," << documentFrequency[m]; 330 } 331 } 332 } 333 334 phraseData << endl; 335 phraseCounter++; 336 337 // feedback 338 if (verbosity) { 339 if (phraseCounter % 1000 == 0) { 340 tmpString = p.toString(); 341 cout << "phrase " << phraseCounter << ": " 342 << "cell " << p.firstSuffixIndex << " - " << tmpString << endl; 343 delete [] tmpString; 344 } 345 } 346 } 347 } 348 ij = p.lastSuffixIndex + 1; 349 } 350 outPhrase.close(); 351 352 // REMAINING PASSES 353 // The previous outPhrase file forms the input to each new pass 354 cellcount start, length; 355 while (outPhraseCounter > 0) { 356 357 // Start a new pass 358 phrasePass++; 359 if (verbosity) { 360 cout << "Starting pass " << phrasePass << endl; 361 } 362 363 // Open the input file 364 char inPhraseName[FILENAME_MAX]; 365 sprintf(inPhraseName, "%s/outPhrase.%d", collection, phrasePass - 1); 366 ifstream inPhrase (inPhraseName, ios::in); 367 if (!inPhrase) { 368 cerr << "File " << inPhraseName << " could not be opened\n"; 369 exit(1); 370 } 371 372 // Open the output file 373 sprintf(outPhraseName, "%s/outPhrase.%d", collection, phrasePass); 374 outPhrase.open(outPhraseName, ios::out); 375 if (!outPhrase) { 376 cerr << "File " << outPhraseName << " could not be opened\n"; 377 exit(1); 378 } 379 outPhraseCounter = 0; 380 381 // Process each phrase 382 while(inPhrase >> start >> length) { 383 384 // Ignore the phrase if we have expanded it before 385 if (isPhraseStored(start, length)) { 386 continue; 387 } 388 389 // Remember that we have examined this phrase 390 rememberThisPhrase(start, length); 391 392 // Find the phrase in the suffixarray 393 p = Phrase(suffixArray[start], length, SUFFIX); 394 p.findFirstAndLastSuffix(start, inputLength-1); 395 396 // cout << "index " << start << ", length " << length << " - " << p.toString() << endl; 397 398 399 // Ignore the phrase if it only occurs once 400 if (p.suffixFrequency < 2) { 401 continue; 402 } 403 404 405 // Write the phrase text tmpString = p.toString(); 406 tmpString = p.toString(); 407 phraseData << start << "-" << length << ":" << tmpString << ":" 408 << p.suffixFrequency << ":"; 409 delete [] tmpString; 410 411 412 // Expand the phrase, if it is fewer than 8 words long 413 if (length <= 8) { 414 415 // Get the minimal expansions for this phrase 416 getMinimalExpansions(p, result); 417 418 // write the results 419 phraseData << result.size() << ":"; 420 421 for (cellcount i = 0; i < result.size(); i++) { 422 if (i) { 423 phraseData << ","; 424 } 425 phraseData << result[i].firstSuffixIndex << "-" << result[i].length; 426 outPhrase << result[i].firstSuffixIndex << " " << result[i].length << endl; 427 outPhraseCounter++; 428 } 429 result.clear(); 430 431 } else { 432 // phrase is too long to expand further 433 phraseData << "0:"; 434 } 435 436 437 // Write the documents in which this phrase occurs 438 df = getDocumentOccurrances(p, documentFrequency); 439 phraseData << ":" << df << ":"; 440 441 // write the documents 442 for (cellcount i = 0, first = 1; i < numberOfDocuments; i++) { 443 if (documentFrequency[i]) { 444 if (first) { 445 first = 0; 446 } else { 447 phraseData << ";"; 448 } 449 // Output the document number. Note that here we've numbered the 450 // N documents from 0 to N-1, but later they'll be 1-N. Thus we 451 // add 1 to the document id when we output it. 452 phraseData << "d" << (i+1); 453 // Next, output the frequency with which the document occurs, but 454 // only if it is > 1. 455 if (documentFrequency[i] > 1) { 456 phraseData << "," << documentFrequency[i]; 457 } 458 } 459 } 460 461 phraseData << endl; 462 phraseCounter++; 463 464 // feedback 465 if (verbosity) { 466 if (phraseCounter % 1000 == 0) { 467 tmpString = p.toString(); 468 cout << "phrase " << phraseCounter << ": "<< "start " << start 469 << ", length " << length << " - " << tmpString << endl; 470 delete [] tmpString; 471 } 472 } 473 474 } 475 476 inPhrase.close(); 477 outPhrase.close(); 478 } 479 480 phraseData.close(); 481 deletePhraseMemory(); 482 483 delete [] documentFrequency; 484 delete [] symbols; 485 delete [] suffixArray; 486 delete [] prefixArray; 487 delete [] suffixCheck; 488 delete [] prefixCheck; 489 delete [] documentArray; 490 491 492 493 cout << endl << "Done: " << phraseCounter << " phrases in " << phraseDataName << endl; 494 return 0; 495 } 496 497 498 // Get Minimal Expansions 499 // 500 // Get the set of "minimal" expansions of a phrase p, using the 501 // algorithm described in the documentation. 502 // 503 // Returns a vector of Expansions. 504 505 void getMinimalExpansions(Phrase &p, vector<Phrase> &results) { 506 507 // 1. Initialise the result and candiate vectors 508 vector<Phrase> candidates; 509 for (cellcount j = 0; j < inputLength; j++) { 510 suffixCheck[j] = 0; 511 prefixCheck[j] = 0; 512 } 513 514 // 2. Expand the phrase p 515 516 // 2.1 Create the candidate set 517 p.initialSuffixCandidates(candidates); 518 p.initialPrefixCandidates(candidates); 519 520 // 2.2 Sort the candidates by phrase length 521 make_heap(candidates.begin(), candidates.end(), isLonger); 522 523 // 3. While candidates is non-empty, confirm the phrases it 524 // contains, expanding them as required 525 while (!candidates.empty()) { 526 527 // 3.1 Get next candidate 528 pop_heap(candidates.begin(), candidates.end(), isLonger); 529 Phrase c = candidates.back(); 530 candidates.pop_back(); 531 532 // 3.2 If we know there are no unique right extensions 533 // (i.e. this is a phrase drawn from the suffix array) 534 if (!c.hasUniqueSuffixExtension()) { 535 536 c.ensurePrefixFound(); 537 538 // 3.2.1 Ignore candidate if we have used a subphrase instead 539 if (suffixCheck[c.firstSuffixIndex] || prefixCheck[c.firstPrefixIndex]) { 540 // cout << "ignoring" << endl; 541 } 542 543 // 3.2.2 If candidate has a unique left (prefix) extension, 544 // Then extend it and add it back into Candidates. 545 else if (c.hasUniquePrefixExtension()) { 546 // cout << "expanding prefix " << c.toString() << "=> "; 547 c.expandUniquePrefixExtensionByOne(); 548 candidates.push_back(c); 549 push_heap(candidates.begin(), candidates.end(), isLonger); 550 } 551 552 // 3.2.3 If candidate has no unique left (prefix) extension, 553 // Then add it to the list of results. 554 else { 555 // cout << "no unique prefix, add to results" << endl; 556 results.push_back(c); 557 for (cellcount i = c.firstSuffixIndex; i <= c.lastSuffixIndex; i++) { 558 suffixCheck[i] = c.length; 559 } 560 for (cellcount ik = c.firstPrefixIndex; ik <= c.lastPrefixIndex; ik++) { 561 prefixCheck[ik] = c.length; 562 } 563 } 564 } 565 566 // 3.3 If we know there are no unique left extensions, 567 // Then fdo the same as for 3.2 but exchange suffix & prefix 568 else if (!c.hasUniquePrefixExtension()) { 569 570 c.ensureSuffixFound(); 571 572 // 3.3.1 573 if (suffixCheck[c.firstSuffixIndex] || prefixCheck[c.firstPrefixIndex]) { 574 575 } 576 577 // 3.3.2 578 else if (c.hasUniqueSuffixExtension()) { 579 c.expandUniqueSuffixExtensionByOne(); 580 candidates.push_back(c); 581 push_heap(candidates.begin(), candidates.end(), isLonger); 582 } 583 584 // 3.3.3 585 else { 586 results.push_back(c); 587 for (cellcount i = c.firstSuffixIndex; i <= c.lastSuffixIndex; i++) { 588 suffixCheck[i] = c.length; 589 } 590 for (cellcount ijk = c.firstPrefixIndex; ijk <= c.lastPrefixIndex; ijk++) { 591 prefixCheck[ijk] = c.length; 592 } 593 594 } 163 if (!shorter_found) { 164 results.push_back(*candidate); 165 candidate->ensureSuffixFound(); 166 for (cellcount k = candidate->firstSuffixIndex; k <= candidate->lastSuffixIndex; ++k) 167 suffixCheck[k] = candidate->length; 595 168 } 596 169 } … … 751 324 // Given a phrase, what documents does it occur in? 752 325 753 cellcount getDocumentOccurrances( Phrase &p, cellcount *frequency) {754 755 // cout << "searching for \""<< p .toString()<< "\" in documents "326 cellcount getDocumentOccurrances(const Phrase &p, cellcount *frequency) { 327 328 // cout << "searching for \""<< p << "\" in documents " 756 329 // << 0 << "-" << numberOfDocuments - 1 << endl; 757 330 … … 1038 611 } 1039 612 613 1040 614 bool isLongPhraseStored(cellindex index, cellcount length) { 1041 615 … … 1079 653 } 1080 654 655 1081 656 void deleteLongPhraseMemory() { 1082 657 // remove the hash & other files … … 1090 665 1091 666 1092 1093 1094 667 // Read the collection statistics file 668 // 1095 669 void readStatistics() { 1096 670 … … 1129 703 } 1130 704 1131 1132 1133 1134 705 cellcount getContentCount(symbol firstContent) { 706 707 cellcount content=0; 708 for (cellcount i=0; i<inputLength; i++) { 709 if (symbols[i]>=firstContent) content++; 710 } 711 712 return content; 713 } 714 715 int main (int argc, char * argv[]) { 716 717 // Command-line arguments 718 // argv[1] is the phindex directory 719 // argv[2] is the maximum array symbol length (optional) 720 // argv[3] is the mode, where 1 is stopword mode (optional) 721 if (argc < 2) { 722 cerr << "Usage: " << argv[0] << " phind-directory mode [verbosity]" << endl; 723 exit(1); 724 } 725 726 // collection directory 727 strcpy(collection, argv[1]); 728 729 // mode parameter 730 phraseMode = atoi(argv[2]); 731 assert((phraseMode == STOPWORDS) || (phraseMode == ANYPHRASE)); 732 733 // optional verbosity parameter 734 if (argc == 4) { 735 verbosity = atoi(argv[3]); 736 assert (verbosity >= 0); 737 } 738 739 if (verbosity) { 740 cout << "suffix: the phrase extraction program" << endl; 741 } 742 743 if (verbosity > 1) { 744 if (phraseMode == STOPWORDS) { 745 cout << "Stopwords mode: no phrase may begin or end with a stopword" << endl; 746 } else { 747 cout << "AllPhrase mode: extract every phrase that occurs more than once" << endl; 748 } 749 } 750 751 // Read the statistics file 752 readStatistics(); 753 754 // Read the numbers file 755 readNumbers(); 756 757 if (numberOfDocuments == 0) { 758 cerr << "There are no documents in this collection!" << endl; 759 exit(1); 760 } 761 762 symbol firstContent; 763 if (phraseMode==STOPWORDS) firstContent=firstContentSymbol; 764 else firstContent = firstStopSymbol; 765 766 cellcount contentLength = 0; 767 contentLength = getContentCount(firstContent); 768 769 // Create the suffix & prefix arrays 770 suffixArray = new symbol *[contentLength]; 771 prefixArray = new symbol *[contentLength]; 772 773 cellcount here=0; 774 // Initialise prefix and suffix arrays, only use the needed suffixes 775 for (cellcount j = 0; j < inputLength; j++) { 776 if (symbols[j]>=firstContent) { 777 suffixArray[here] = &symbols[j]; 778 prefixArray[here] = &symbols[j]; 779 here++; 780 } 781 } 782 qsort(suffixArray, contentLength, sizeof(symbol *), suffixCompare); 783 qsort(prefixArray, contentLength, sizeof(symbol *), prefixCompare); 784 785 suffixCheck = new check[contentLength]; 786 if (suffixCheck == NULL) { 787 cerr << "Suffix error: not enough memory to hold " << inputLength << " symbols." << endl; 788 exit(2); 789 } 790 for (cellcount j = 0; j < contentLength; j++) 791 suffixCheck[j] = 0; 792 793 cout <<"\ngenerating the phrase hierarchy\n\n"; 794 795 // Create the document arrays 796 if (verbosity > 1) { 797 cout << "Allocating document arrays for " << numberOfDocuments << " documents" << endl; 798 } 799 800 // The document frequecy array is used to count the number of times 801 // each phrase occurs in each document. The number of documents in 802 // which a phrase occurs is stored in df. 803 frequency *documentFrequency = new frequency[numberOfDocuments]; 804 frequency df; 805 806 // documentArray will be searched in order to discover which document 807 // each phrase occurs in. 808 documentArray = new symbol *[numberOfDocuments]; 809 810 // just scan through the input text to find the doc starts 811 cellindex d = 0; 812 for (cellindex i=0; i<inputLength; i++) { 813 if (symbols[i] == DOCUMENTSTART) { 814 documentArray[d] = &symbols[i]; 815 d++; 816 } 817 } 818 819 // the phrases stuff is expecting inputLength to be the length of the 820 // suffix array, so change it. 821 inputLength = contentLength; 822 823 // Extract phrases 824 // 825 // We will make several passesover the data, in each case considering 826 // a set of input phrases and generating a set of output phrases, which 827 // we will expancd in later passes. 828 // 829 // The input phrases in the first pass will be the vocabulary. 830 // In later passes, the input phrases will be the output phrases of the 831 // previous pass. 832 // 833 // In each pass we will consider each input phrase in turn. If we 834 // have seen it before, we will ignore it. Otherwise, we will expand 835 // it and add its expansions to the set of output phrases. 836 837 // Store the phrase data in the phrases file 838 char phraseDataName[FILENAME_MAX]; 839 sprintf(phraseDataName, "%s/phrases", collection); 840 ofstream phraseData(phraseDataName, ios::out); 841 if (!phraseData) { 842 cout << "File " << phraseDataName << " could not be opened\n"; 843 exit(1); 844 } 845 846 // Count the number of phrases output 847 unsigned long int phraseCounter = 0; 848 849 // Set up the phrase expansion memory. 850 // We need this so that we don't expand a phrase more than once 851 initialisePhraseMemory(); 852 853 // The current pass numebr 854 int phrasePass = 1; 855 856 857 // PASS NUMBER 1 858 if (verbosity > 1) { 859 cout << "Starting pass " << phrasePass << endl; 860 } 861 862 ofstream outPhrase; 863 char outPhraseName[FILENAME_MAX]; 864 unsigned long int outPhraseCounter = 0; 865 866 // On the first pass, simply work through the vocabulary 867 sprintf(outPhraseName, "%s/outPhrase.1", collection); 868 outPhrase.open(outPhraseName, ios::out); 869 if (!outPhrase) { 870 cerr << "File " << outPhraseName << " could not be opened\n"; 871 exit(1); 872 } 873 874 // Iterate over the different symbols by working through the suffix array 875 vector<Phrase> result; 876 cellindex ij = 0; 877 char *tmpString; 878 879 Phrase p; 880 while (ij < inputLength) { 881 882 // make a new phrase of length 1 883 p = Phrase(suffixArray[ij], 1, SUFFIX); 884 p.findFirstAndLastSuffix(ij, inputLength-1); 885 886 // We ignore this symbol if it occurs only once, if it is a delimiter, 887 // of if we are in stopwords mode and it is a stopword 888 // - in this new version, only need to check freq 889 // We could imagine a new mode/command-line option, which is like 890 // STOPWORDS but without this restrictrion. This would let you browse 891 // from "the" to "the AGRIS" for example, but not from "AGRIS" to 892 // "the AGRIS" (where the is a stopword and AGRIS a content word). 893 // The system used to work like this; it is easy to implement, but 894 // it explodes the size of the indexes. So: would it be useful? 895 if (p.suffixFrequency > 1) { 896 // Get minimal expansions of the phrase 897 getExpansions(p, result); 898 899 if (!result.empty()) { 900 901 // Remember that we have expanded this phrase 902 rememberThisPhrase(ij, 1); 903 904 // write the phrase text 905 phraseData << ij << "-1:" << p << ":" << p.suffixFrequency << ":" 906 << result.size() << ":"; 907 908 // write the results 909 for (cellcount k = 0; k < result.size(); k++) { 910 if (k) { 911 phraseData << ","; 912 } 913 phraseData << result[k].firstSuffixIndex << "-" << result[k].length; 914 outPhrase << result[k].firstSuffixIndex << " " << result[k].length << endl; 915 outPhraseCounter++; 916 } 917 result.clear(); 918 919 // Write the documents in which this phrase occurs 920 df = getDocumentOccurrances(p, documentFrequency); 921 phraseData << ":" << df << ":"; 922 923 // write the documents 924 for (cellcount m = 0, first = 1; m < numberOfDocuments; m++) { 925 if (documentFrequency[m]) { 926 if (first) { 927 first = 0; 928 } else { 929 phraseData << ";"; 930 } 931 // Output the document number. Note that here we've numbered the 932 // N documents from 0 to N-1, but later they'll be 1-N. Thus we 933 // add 1 to the document id when we output it. 934 phraseData << "d" << (m+1); 935 // Next, output the frequency with which the document occurs, but 936 // only if it is > 1. 937 if (documentFrequency[m] > 1) { 938 phraseData << "," << documentFrequency[m]; 939 } 940 } 941 } 942 943 phraseData << endl; 944 phraseCounter++; 945 946 // feedback 947 if (verbosity) { 948 if (phraseCounter % 1000 == 0) { 949 cout << "phrase " << phraseCounter << ": " 950 << "cell " << p.firstSuffixIndex << " - " << p << endl; 951 } 952 } 953 } 954 } 955 ij = p.lastSuffixIndex + 1; 956 } 957 outPhrase.close(); 958 959 // REMAINING PASSES 960 // The previous outPhrase file forms the input to each new pass 961 cellcount start, length; 962 while (outPhraseCounter > 0) { 963 964 // Start a new pass 965 phrasePass++; 966 if (verbosity) { 967 cout << "Starting pass " << phrasePass << endl; 968 } 969 970 // Open the input file 971 char inPhraseName[FILENAME_MAX]; 972 sprintf(inPhraseName, "%s/outPhrase.%d", collection, phrasePass - 1); 973 ifstream inPhrase (inPhraseName, ios::in); 974 if (!inPhrase) { 975 cerr << "File " << inPhraseName << " could not be opened\n"; 976 exit(1); 977 } 978 979 // Open the output file 980 sprintf(outPhraseName, "%s/outPhrase.%d", collection, phrasePass); 981 outPhrase.open(outPhraseName, ios::out); 982 if (!outPhrase) { 983 cerr << "File " << outPhraseName << " could not be opened\n"; 984 exit(1); 985 } 986 outPhraseCounter = 0; 987 988 // Process each phrase 989 while(inPhrase >> start >> length) { 990 991 // Ignore the phrase if we have expanded it before 992 if (isPhraseStored(start, length)) 993 continue; 994 995 // Remember that we have examined this phrase 996 rememberThisPhrase(start, length); 997 998 // Find the phrase in the suffixarray 999 p = Phrase(suffixArray[start], length, SUFFIX); 1000 p.findFirstAndLastSuffix(start, inputLength-1); 1001 1002 // Ignore the phrase if it only occurs once 1003 if (p.suffixFrequency < 2) 1004 continue; 1005 1006 // Write the phrase text; 1007 phraseData << start << "-" << length << ":" << p << ":" << p.suffixFrequency << ":"; 1008 1009 // Expand the phrase, if it is fewer than 8 words long 1010 if (length <= 8) { 1011 1012 // Get the minimal expansions for this phrase 1013 getExpansions(p, result); 1014 1015 // write the results 1016 phraseData << result.size() << ":"; 1017 1018 for (cellcount i = 0; i < result.size(); i++) { 1019 if (i) { 1020 phraseData << ","; 1021 } 1022 phraseData << result[i].firstSuffixIndex << "-" << result[i].length; 1023 outPhrase << result[i].firstSuffixIndex << " " << result[i].length << endl; 1024 outPhraseCounter++; 1025 } 1026 result.clear(); 1027 1028 } else { 1029 // phrase is too long to expand further 1030 phraseData << "0:"; 1031 } 1032 1033 // Write the documents in which this phrase occurs 1034 df = getDocumentOccurrances(p, documentFrequency); 1035 phraseData << ":" << df << ":"; 1036 1037 // write the documents 1038 for (cellcount i = 0, first = 1; i < numberOfDocuments; i++) { 1039 if (documentFrequency[i]) { 1040 if (first) { 1041 first = 0; 1042 } else { 1043 phraseData << ";"; 1044 } 1045 // Output the document number. Note that here we've numbered the 1046 // N documents from 0 to N-1, but later they'll be 1-N. Thus we 1047 // add 1 to the document id when we output it. 1048 phraseData << "d" << (i+1); 1049 // Next, output the frequency with which the document occurs, but 1050 // only if it is > 1. 1051 if (documentFrequency[i] > 1) { 1052 phraseData << "," << documentFrequency[i]; 1053 } 1054 } 1055 } 1056 1057 phraseData << endl; 1058 phraseCounter++; 1059 1060 // feedback 1061 if (verbosity) { 1062 if (phraseCounter % 1000 == 0) { 1063 cout << "phrase " << phraseCounter << ": "<< "start " << start 1064 << ", length " << length << " - " << p << endl; 1065 } 1066 } 1067 1068 } 1069 1070 inPhrase.close(); 1071 outPhrase.close(); 1072 } 1073 1074 phraseData.close(); 1075 deletePhraseMemory(); 1076 1077 delete [] documentFrequency; 1078 delete [] symbols; 1079 delete [] suffixArray; 1080 delete [] prefixArray; 1081 delete [] suffixCheck; 1082 delete [] documentArray; 1083 1084 1085 1086 cout << endl << "Done: " << phraseCounter << " phrases in " << phraseDataName << endl; 1087 return 0; 1088 } 1089 1090 1091 1092 1093
Note:
See TracChangeset
for help on using the changeset viewer.