Changeset 1882
- Timestamp:
- 2001-01-31T14:08:50+13:00 (23 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/src/phind/generate/suffix.cpp
r1873 r1882 45 45 46 46 // Global variables declared in suffix.h 47 cellcount symbol_array_length;48 47 cellcount inputLength; 49 48 … … 69 68 int pointerCompare(const void *, const void *); 70 69 71 int readNumbers( symbol *numbers);70 int readNumbers(); 72 71 void readStatistics(); 73 72 … … 103 102 // argv[3] is the mode, where 1 is stopword mode (optional) 104 103 if (argc < 2) { 105 cerr << "Usage: " << argv[0] << " collection-directory [max-array-size [mode]]" << endl;104 cerr << "Usage: " << argv[0] << " phind-directory mode [verbosity]" << endl; 106 105 exit(1); 107 106 } … … 110 109 strcpy(collection, argv[1]); 111 110 112 // Symbol length parameter 113 if (argc >= 3) { 114 symbol_array_length = atol(argv[2]); 115 assert(symbol_array_length); 116 } else { 117 symbol_array_length = 1000; 118 } 119 120 // Stopword mode parameter 111 // mode parameter 112 phraseMode = atoi(argv[2]); 113 assert((phraseMode == STOPWORDS) || (phraseMode == ANYPHRASE)); 114 115 // optional verbosity parameter 121 116 if (argc == 4) { 122 phraseMode = atoi(argv[3]); 123 assert(phraseMode == STOPWORDS); 124 } 117 verbosity = atoi(argv[3]); 118 assert (verbosity >= 0); 119 } 120 125 121 if (verbosity) { 122 cout << "Suffix phrase extraction program" << endl; 123 } 124 125 if (verbosity > 1) { 126 126 if (phraseMode == STOPWORDS) { 127 cout << "S TOPWORDSmode: no phrase may begin or end with a stopword" << endl;127 cout << "Stopwords mode: no phrase may begin or end with a stopword" << endl; 128 128 } else { 129 cout << "A LLPHRASEmode: extract every phrase that occurs more than once" << endl;129 cout << "AllPhrase mode: extract every phrase that occurs more than once" << endl; 130 130 } 131 131 } … … 135 135 136 136 // Read the numbers file 137 symbols = new (symbol)[symbol_array_length]; 138 readNumbers(symbols); 139 137 readNumbers(); 140 138 141 139 // Create the suffix & prefix arrays 142 if (verbosity) {143 cout << "Create suffix & prefix arrays for " << inputLength << " symbols" << endl;144 }145 140 suffixArray = new (symbol *)[inputLength]; 146 141 prefixArray = new (symbol *)[inputLength]; 147 142 suffixCheck = new (check)[inputLength]; 148 143 prefixCheck = new (check)[inputLength]; 144 if (prefixCheck == NULL) { 145 cerr << "Suffix error: not enough memory to hold " << inputLength 146 << " symbols." << endl; 147 exit(2); 148 } 149 150 // Initialise prefix and suffix arrays 149 151 for (cellcount j = 0; j < inputLength; j++) { 150 152 suffixArray[j] = &symbols[j]; … … 156 158 157 159 // Create the document arrays 158 if (verbosity) {159 cout << "Create document arrays for " << numberOfDocuments << " documents" << endl;160 }161 160 if (numberOfDocuments == 0) { 162 161 cerr << "There are no documents in this collection!" << endl; 163 162 exit(1); 163 } 164 if (verbosity > 1) { 165 cout << "Allocating document arrays for " << numberOfDocuments << " documents" << endl; 164 166 } 165 167 … … 226 228 227 229 // PASS NUMBER 1 228 if (verbosity ) {229 cout << endl<< "suffix: starting pass " << phrasePass << endl;230 if (verbosity > 1) { 231 cout << "Starting pass " << phrasePass << endl; 230 232 } 231 233 … … 348 350 phrasePass++; 349 351 if (verbosity) { 350 cout << endl <<"Starting pass " << phrasePass << endl;352 cout << "Starting pass " << phrasePass << endl; 351 353 } 352 354 … … 664 666 665 667 666 // Read the numbers file into an array of symbols.667 // 668 // Each number i s a symbol number; it is essential that the first669 // symbol (and no others) be 0 and the last symbol (and no others)670 // be 1.668 // Read the clauses.numbers file into the "symbols" array. 669 // 670 // Each number in the file is a symbol number; it is essential that 671 // the first symbol (and no others) be COLLECTIONSTART and the last 672 // symbol (and no others) be COLLECTIONEND. 671 673 // 672 674 // Return the number of numbers in the array. 673 675 674 int readNumbers( symbol *numbers) {676 int readNumbers() { 675 677 676 678 char filename[FILENAME_MAX]; 677 679 sprintf(filename, "%s/clauses.numbers", collection); 678 680 if (verbosity) { 679 cout << "Reading numbers f rom: " << filename << endl;681 cout << "Reading numbers file: " << filename << endl; 680 682 } 681 683 … … 687 689 } 688 690 691 // Count the number of symbols 692 inputLength = 0; 693 symbol word; 694 while (inFile >> word) { 695 inputLength++; 696 } 697 inFile.close(); 698 699 // Allocate the symbbols array 700 if (verbosity > 1) { 701 cout << "Allocating symbol arrays for " << inputLength << " symbols" << endl; 702 } 703 symbols = new (symbol)[inputLength]; 704 if (symbols == NULL) { 705 cerr << "Suffix error: not enough memory to hold " << inputLength 706 << " symbols." << endl; 707 exit(2); 708 } 709 689 710 // Read the numbers file into the numbers array 690 symbol word; 691 cellcount length = 0; 711 if (verbosity > 2) { 712 cout << "Reading the numbers" << endl; 713 } 714 inFile.open(filename, ios::in); 715 cellcount next = 0; 692 716 numberOfDocuments = 0; 693 while ( (inFile >> word) && (length < symbol_array_length)){694 numbers[length++] = word;717 while (inFile >> word) { 718 symbols[next++] = word; 695 719 if (word == DOCUMENTSTART) { 696 720 numberOfDocuments++; 697 721 } 698 722 } 699 700 // Make sure we were able to read all the numbers 701 if (length >= symbol_array_length) { 702 cerr << "Error: the symbol array is too a short to hold " << filename 703 << endl << "It is currently set to " << symbol_array_length 704 << " and can be adjusted at the command line." << endl; 705 exit(1); 706 } 723 inFile.close(); 707 724 708 725 // Make sure the numbers file is intact 709 assert(numbers[0] == COLLECTIONSTART); 710 assert(numbers[length-1] == COLLECTIONEND); 711 712 // Record the length of the Input file 713 inputLength = length; 714 715 return length; 726 assert(symbols[0] == COLLECTIONSTART); 727 assert(symbols[next-1] == COLLECTIONEND); 728 729 return inputLength; 716 730 } 717 731
Note:
See TracChangeset
for help on using the changeset viewer.