Changeset 1882

Show
Ignore:
Timestamp:
31.01.2001 14:08:50 (19 years ago)
Author:
paynter
Message:

The length of the main symbols array is now calculated from the
clauses.numbers file, not passed in as a command-line parameter.
This is a little slower, as we have to make an extra pass over
the text, but oh-so-much-more convenient. Requires changes to
command line arguments (which now support verbosity).

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/src/phind/generate/suffix.cpp

    r1873 r1882  
    4545 
    4646// Global variables declared in suffix.h 
    47 cellcount symbol_array_length; 
    4847cellcount inputLength; 
    4948 
     
    6968int pointerCompare(const void *, const void *); 
    7069 
    71 int readNumbers(symbol *numbers); 
     70int readNumbers(); 
    7271void readStatistics(); 
    7372 
     
    103102  // argv[3] is the mode, where 1 is stopword mode (optional)  
    104103  if (argc < 2) { 
    105     cerr << "Usage: " << argv[0] << " collection-directory [max-array-size [mode]]" << endl; 
     104    cerr << "Usage: " << argv[0] << " phind-directory mode [verbosity]" << endl; 
    106105    exit(1); 
    107106  } 
     
    110109  strcpy(collection, argv[1]); 
    111110 
    112   // Symbol length parameter 
    113   if (argc >= 3) { 
    114     symbol_array_length = atol(argv[2]); 
    115     assert(symbol_array_length); 
    116   } else { 
    117     symbol_array_length = 1000; 
    118   } 
    119  
    120   // Stopword mode parameter 
     111  // mode parameter 
     112  phraseMode = atoi(argv[2]); 
     113  assert((phraseMode == STOPWORDS) || (phraseMode == ANYPHRASE)); 
     114 
     115  // optional verbosity parameter 
    121116  if (argc == 4) { 
    122     phraseMode = atoi(argv[3]); 
    123     assert(phraseMode == STOPWORDS); 
    124   } 
     117    verbosity = atoi(argv[3]); 
     118    assert (verbosity >= 0); 
     119  } 
     120 
    125121  if (verbosity) { 
     122    cout << "Suffix phrase extraction program" << endl; 
     123  } 
     124 
     125  if (verbosity > 1) { 
    126126    if (phraseMode == STOPWORDS) { 
    127       cout << "STOPWORDS mode: no phrase may begin or end with a stopword" << endl; 
     127      cout << "Stopwords mode: no phrase may begin or end with a stopword" << endl; 
    128128    } else { 
    129       cout << "ALLPHRASE mode: extract every phrase that occurs more than once" << endl; 
     129      cout << "AllPhrase mode: extract every phrase that occurs more than once" << endl; 
    130130    } 
    131131  } 
     
    135135 
    136136  // Read the numbers file 
    137   symbols = new (symbol)[symbol_array_length]; 
    138   readNumbers(symbols); 
    139  
     137  readNumbers(); 
    140138 
    141139  // Create the suffix & prefix arrays 
    142   if (verbosity) { 
    143     cout << "Create suffix & prefix arrays for " << inputLength << " symbols" << endl; 
    144   } 
    145140  suffixArray = new (symbol *)[inputLength]; 
    146141  prefixArray = new (symbol *)[inputLength]; 
    147142  suffixCheck = new (check)[inputLength]; 
    148143  prefixCheck = new (check)[inputLength]; 
     144  if (prefixCheck == NULL) { 
     145    cerr << "Suffix error: not enough memory to hold " << inputLength 
     146     << " symbols." << endl; 
     147    exit(2); 
     148  }   
     149 
     150  // Initialise prefix and suffix arrays 
    149151  for (cellcount j = 0; j < inputLength; j++) { 
    150152    suffixArray[j] = &symbols[j]; 
     
    156158 
    157159  // Create the document arrays 
    158   if (verbosity) { 
    159     cout << "Create document arrays for " << numberOfDocuments << " documents" << endl; 
    160   } 
    161160  if (numberOfDocuments == 0) { 
    162161    cerr << "There are no documents in this collection!" << endl; 
    163162    exit(1); 
     163  } 
     164  if (verbosity > 1) { 
     165    cout << "Allocating document arrays for " << numberOfDocuments << " documents" << endl; 
    164166  } 
    165167 
     
    226228 
    227229  // PASS NUMBER 1 
    228   if (verbosity) { 
    229     cout << endl<< "suffix: starting pass " << phrasePass << endl; 
     230  if (verbosity > 1) { 
     231    cout << "Starting pass " << phrasePass << endl; 
    230232  } 
    231233 
     
    348350    phrasePass++; 
    349351    if (verbosity) { 
    350       cout << endl << "Starting pass " << phrasePass << endl; 
     352      cout << "Starting pass " << phrasePass << endl; 
    351353    } 
    352354 
     
    664666 
    665667 
    666 // Read the numbers file into an array of symbols. 
    667 // 
    668 // Each number is a symbol number; it is essential that the first 
    669 // symbol (and no others) be 0 and the last symbol (and no others)  
    670 // be 1. 
     668// Read the clauses.numbers file into the "symbols" array. 
     669// 
     670// Each number in the file is a symbol number; it is essential that  
     671// the first symbol (and no others) be COLLECTIONSTART and the last 
     672// symbol (and no others) be COLLECTIONEND. 
    671673// 
    672674// Return the number of numbers in the array. 
    673675 
    674 int readNumbers(symbol *numbers) { 
     676int readNumbers() { 
    675677 
    676678  char filename[FILENAME_MAX]; 
    677679  sprintf(filename, "%s/clauses.numbers", collection); 
    678680  if (verbosity) { 
    679     cout << "Reading numbers from: " << filename << endl; 
     681    cout << "Reading numbers file: " << filename << endl; 
    680682  } 
    681683 
     
    687689  } 
    688690 
     691  // Count the number of symbols 
     692  inputLength = 0; 
     693  symbol word; 
     694  while (inFile >> word) { 
     695    inputLength++; 
     696  } 
     697  inFile.close(); 
     698 
     699  // Allocate the symbbols array 
     700  if (verbosity > 1) { 
     701    cout << "Allocating symbol arrays for " << inputLength << " symbols" << endl; 
     702  } 
     703  symbols = new (symbol)[inputLength]; 
     704  if (symbols == NULL) { 
     705    cerr << "Suffix error: not enough memory to hold " << inputLength 
     706     << " symbols." << endl; 
     707    exit(2); 
     708  } 
     709 
    689710  // Read the numbers file into the numbers array 
    690   symbol word; 
    691   cellcount length = 0; 
     711  if (verbosity > 2) { 
     712    cout << "Reading the numbers" << endl; 
     713  } 
     714  inFile.open(filename, ios::in); 
     715  cellcount next = 0; 
    692716  numberOfDocuments = 0; 
    693   while ((inFile >> word) && (length < symbol_array_length)){ 
    694     numbers[length++] = word; 
     717  while (inFile >> word) { 
     718    symbols[next++] = word; 
    695719    if (word == DOCUMENTSTART) { 
    696720      numberOfDocuments++; 
    697721    } 
    698722  } 
    699    
    700   // Make sure we were able to read all the numbers 
    701   if (length >= symbol_array_length) { 
    702     cerr << "Error: the symbol array is too a short to hold " << filename 
    703      << endl << "It is currently set to " <<  symbol_array_length 
    704      << " and can be adjusted at the command line." << endl; 
    705     exit(1); 
    706   } 
     723  inFile.close(); 
    707724   
    708725  // Make sure the numbers file is intact 
    709   assert(numbers[0] == COLLECTIONSTART); 
    710   assert(numbers[length-1] == COLLECTIONEND); 
    711  
    712   // Record the length of the Input file 
    713   inputLength = length; 
    714  
    715   return length; 
     726  assert(symbols[0] == COLLECTIONSTART); 
     727  assert(symbols[next-1] == COLLECTIONEND); 
     728 
     729  return inputLength; 
    716730} 
    717731