Changeset 2487 for trunk/gsdl/src/phind/generate/suffix.cpp
- Timestamp: 2001-06-01T14:51:29+12:00 (23 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/src/phind/generate/suffix.cpp
r1882 r2487 28 28 29 29 #include <assert.h> 30 #include <fstream.h>31 #include <iostream.h>32 30 #include <math.h> 33 31 #include <stdio.h> … … 35 33 #include <string.h> 36 34 37 #include <algo.h> 38 #include <heap.h> 39 #include <vector.h> 35 #if defined(GSDL_USE_IOS_H) 36 # include <fstream.h> 37 # include <iostream.h> 38 #else 39 # include <fstream> 40 # include <iostream> 41 #endif 42 43 #if defined(GSDL_USE_STL_H) 44 # if defined(GSDL_USE_ALGO_H) 45 # include <algo.h> 46 # else 47 # include <algorithm.h> 48 # endif 49 # include <vector.h> 50 #else 51 # include <algorithm> 52 # include <vector> 53 #endif 54 #include <stl_heap.h> 55 40 56 41 57 #include "suffix.h" 42 58 #include "phrase.h" 43 44 45 59 46 60 // Global variables declared in suffix.h … … 138 152 139 153 // Create the suffix & prefix arrays 140 suffixArray = new (symbol *)[inputLength];141 prefixArray = new (symbol *)[inputLength];142 suffixCheck = new (check)[inputLength];143 prefixCheck = new (check)[inputLength];154 suffixArray = new symbol *[inputLength]; 155 prefixArray = new symbol *[inputLength]; 156 suffixCheck = new check[inputLength]; 157 prefixCheck = new check[inputLength]; 144 158 if (prefixCheck == NULL) { 145 159 cerr << "Suffix error: not enough memory to hold " << inputLength … … 169 183 // each phrase occurs in each document. The number of documents in 170 184 // which a phrase occurs is stored in df. 171 frequency documentFrequency[numberOfDocuments];185 frequency *documentFrequency = new frequency[numberOfDocuments]; 172 186 frequency df; 173 187 174 188 // documentArray will be searched in order to discover which document 175 189 // each phrase occurs in. 
176 documentArray = new (symbol *)[numberOfDocuments];190 documentArray = new symbol *[numberOfDocuments]; 177 191 178 192 // Discover all the DOCUMENTSTART symbols and store as a phrase … … 250 264 // Iterate over the different symbols by working through the suffix array 251 265 vector<Phrase> result; 252 cellindex i = 0;266 cellindex ij = 0; 253 267 char *tmpString; 254 268 255 while (i < inputLength) {269 while (ij < inputLength) { 256 270 257 271 // make a new phrase of length 1 258 p = Phrase(suffixArray[i ], 1, SUFFIX);259 p.findFirstAndLastSuffix(i , inputLength-1);260 261 // cout << "cell " << i << " - " << p.toString() << endl;272 p = Phrase(suffixArray[ij], 1, SUFFIX); 273 p.findFirstAndLastSuffix(ij, inputLength-1); 274 275 // cout << "cell " << ij << " - " << p.toString() << endl; 262 276 263 277 // We ignore this symbol if it occurs only once, if it is a delimiter, … … 271 285 // it explodes the size of the indexes. So: would it be useful? 272 286 if (!((p.suffixFrequency <= 1) || 273 // (*suffixArray[i ] != 23054) ||274 (*suffixArray[i ] <= LASTDELIMITER) ||275 ((phraseMode == STOPWORDS) && (*suffixArray[i ] <= lastStopSymbol)))) {287 // (*suffixArray[ij] != 23054) || 288 (*suffixArray[ij] <= LASTDELIMITER) || 289 ((phraseMode == STOPWORDS) && (*suffixArray[ij] <= lastStopSymbol)))) { 276 290 277 291 // Get minimal expansions of the phrase … … 281 295 282 296 // Remember that we have expanded this phrase 283 rememberThisPhrase(i , 1);297 rememberThisPhrase(ij, 1); 284 298 285 299 // write the phrase text 286 300 tmpString = p.toString(); 287 phraseData << i << "-1:" << tmpString << ":" << p.suffixFrequency << ":"301 phraseData << ij << "-1:" << tmpString << ":" << p.suffixFrequency << ":" 288 302 << result.size() << ":"; 289 303 delete [] tmpString; 290 304 291 305 // write the results 292 for (cellcount i = 0; i < result.size(); i++) {293 if ( i) {306 for (cellcount k = 0; k < result.size(); k++) { 307 if (k) { 294 308 phraseData << ","; 295 309 } 
296 phraseData << result[ i].firstSuffixIndex << "-" << result[i].length;297 outPhrase << result[ i].firstSuffixIndex << " " << result[i].length << endl;310 phraseData << result[k].firstSuffixIndex << "-" << result[k].length; 311 outPhrase << result[k].firstSuffixIndex << " " << result[k].length << endl; 298 312 outPhraseCounter++; 299 313 } … … 305 319 306 320 // write the documents 307 for (cellcount i = 0, first = 1; i < numberOfDocuments; i++) {308 if (documentFrequency[ i]) {321 for (cellcount m = 0, first = 1; m < numberOfDocuments; m++) { 322 if (documentFrequency[m]) { 309 323 if (first) { 310 324 first = 0; … … 315 329 // N documents from 0 to N-1, but later they'll be 1-N. Thus we 316 330 // add 1 to the document id when we output it. 317 phraseData << "d" << ( i+1);331 phraseData << "d" << (m+1); 318 332 // Next, output the frequency with which the document occurs, but 319 333 // only if it is > 1. 320 if (documentFrequency[ i] > 1) {321 phraseData << "," << documentFrequency[ i];334 if (documentFrequency[m] > 1) { 335 phraseData << "," << documentFrequency[m]; 322 336 } 323 337 } … … 338 352 } 339 353 } 340 i = p.lastSuffixIndex + 1;354 ij = p.lastSuffixIndex + 1; 341 355 } 342 356 outPhrase.close(); … … 472 486 deletePhraseMemory(); 473 487 488 delete [] documentFrequency; 474 489 delete [] symbols; 475 490 delete [] suffixArray; … … 548 563 suffixCheck[i] = c.length; 549 564 } 550 for (cellcount i = c.firstPrefixIndex; i <= c.lastPrefixIndex; i++) {551 prefixCheck[i ] = c.length;565 for (cellcount ik = c.firstPrefixIndex; ik <= c.lastPrefixIndex; ik++) { 566 prefixCheck[ik] = c.length; 552 567 } 553 568 } … … 578 593 suffixCheck[i] = c.length; 579 594 } 580 for (cellcount i = c.firstPrefixIndex; i <= c.lastPrefixIndex; i++) {581 prefixCheck[i ] = c.length;595 for (cellcount ijk = c.firstPrefixIndex; ijk <= c.lastPrefixIndex; ijk++) { 596 prefixCheck[ijk] = c.length; 582 597 } 583 598 … … 701 716 cout << "Allocating symbol arrays for " << inputLength 
<< " symbols" << endl; 702 717 } 703 symbols = new (symbol)[inputLength];718 symbols = new symbol[inputLength]; 704 719 if (symbols == NULL) { 705 720 cerr << "Suffix error: not enough memory to hold " << inputLength … … 757 772 758 773 // search for the document in which each occurence of the phrase is found 759 for (cellcount i = p.firstSuffixIndex; i <= p.lastSuffixIndex; i++) {774 for (cellcount j = p.firstSuffixIndex; j <= p.lastSuffixIndex; j++) { 760 775 761 // cout << "looking for phrase at suffixArray[" << i<< "]\n";776 // cout << "looking for phrase at suffixArray[" << j << "]\n"; 762 777 763 target = suffixArray[ i];778 target = suffixArray[j]; 764 779 begin = 0; 765 780 end = numberOfDocuments - 1; … … 864 879 void initialisePhraseMemory() { 865 880 866 phraseMemory = new (unsigned char)[inputLength];881 phraseMemory = new unsigned char[inputLength]; 867 882 868 883 // to begin with, everything is empty
Note: See TracChangeset for help on using the changeset viewer.