Ignore:
Timestamp:
2001-01-31T14:08:50+13:00 (23 years ago)
Author:
paynter
Message:

The length of the main symbols array is now calculated from the
clauses.numbers file, not passed in as a command-line parameter.
This is a little slower, as we have to make an extra pass over
the text, but oh-so-much-more convenient. Requires changes to
command-line arguments (which now support verbosity).

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/src/phind/generate/suffix.cpp

    r1873 r1882  
    4545
    4646// Global variables declared in suffix.h
    47 cellcount symbol_array_length;
    4847cellcount inputLength;
    4948
     
    6968int pointerCompare(const void *, const void *);
    7069
    71 int readNumbers(symbol *numbers);
     70int readNumbers();
    7271void readStatistics();
    7372
     
    103102  // argv[3] is the mode, where 1 is stopword mode (optional)
    104103  if (argc < 2) {
    105     cerr << "Usage: " << argv[0] << " collection-directory [max-array-size [mode]]" << endl;
     104    cerr << "Usage: " << argv[0] << " phind-directory mode [verbosity]" << endl;
    106105    exit(1);
    107106  }
     
    110109  strcpy(collection, argv[1]);
    111110
    112   // Symbol length parameter
    113   if (argc >= 3) {
    114     symbol_array_length = atol(argv[2]);
    115     assert(symbol_array_length);
    116   } else {
    117     symbol_array_length = 1000;
    118   }
    119 
    120   // Stopword mode parameter
     111  // mode parameter
     112  phraseMode = atoi(argv[2]);
     113  assert((phraseMode == STOPWORDS) || (phraseMode == ANYPHRASE));
     114
     115  // optional verbosity parameter
    121116  if (argc == 4) {
    122     phraseMode = atoi(argv[3]);
    123     assert(phraseMode == STOPWORDS);
    124   }
     117    verbosity = atoi(argv[3]);
     118    assert (verbosity >= 0);
     119  }
     120
    125121  if (verbosity) {
     122    cout << "Suffix phrase extraction program" << endl;
     123  }
     124
     125  if (verbosity > 1) {
    126126    if (phraseMode == STOPWORDS) {
    127       cout << "STOPWORDS mode: no phrase may begin or end with a stopword" << endl;
     127      cout << "Stopwords mode: no phrase may begin or end with a stopword" << endl;
    128128    } else {
    129       cout << "ALLPHRASE mode: extract every phrase that occurs more than once" << endl;
     129      cout << "AllPhrase mode: extract every phrase that occurs more than once" << endl;
    130130    }
    131131  }
     
    135135
    136136  // Read the numbers file
    137   symbols = new (symbol)[symbol_array_length];
    138   readNumbers(symbols);
    139 
     137  readNumbers();
    140138
    141139  // Create the suffix & prefix arrays
    142   if (verbosity) {
    143     cout << "Create suffix & prefix arrays for " << inputLength << " symbols" << endl;
    144   }
    145140  suffixArray = new (symbol *)[inputLength];
    146141  prefixArray = new (symbol *)[inputLength];
    147142  suffixCheck = new (check)[inputLength];
    148143  prefixCheck = new (check)[inputLength];
     144  if (prefixCheck == NULL) {
     145    cerr << "Suffix error: not enough memory to hold " << inputLength
     146     << " symbols." << endl;
     147    exit(2);
     148  } 
     149
     150  // Initialise prefix and suffix arrays
    149151  for (cellcount j = 0; j < inputLength; j++) {
    150152    suffixArray[j] = &symbols[j];
     
    156158
    157159  // Create the document arrays
    158   if (verbosity) {
    159     cout << "Create document arrays for " << numberOfDocuments << " documents" << endl;
    160   }
    161160  if (numberOfDocuments == 0) {
    162161    cerr << "There are no documents in this collection!" << endl;
    163162    exit(1);
     163  }
     164  if (verbosity > 1) {
     165    cout << "Allocating document arrays for " << numberOfDocuments << " documents" << endl;
    164166  }
    165167
     
    226228
    227229  // PASS NUMBER 1
    228   if (verbosity) {
    229     cout << endl<< "suffix: starting pass " << phrasePass << endl;
     230  if (verbosity > 1) {
     231    cout << "Starting pass " << phrasePass << endl;
    230232  }
    231233
     
    348350    phrasePass++;
    349351    if (verbosity) {
    350       cout << endl << "Starting pass " << phrasePass << endl;
     352      cout << "Starting pass " << phrasePass << endl;
    351353    }
    352354
     
    664666
    665667
    666 // Read the numbers file into an array of symbols.
    667 //
    668 // Each number is a symbol number; it is essential that the first
    669 // symbol (and no others) be 0 and the last symbol (and no others)
    670 // be 1.
     668// Read the clauses.numbers file into the "symbols" array.
     669//
     670// Each number in the file is a symbol number; it is essential that
     671// the first symbol (and no others) be COLLECTIONSTART and the last
     672// symbol (and no others) be COLLECTIONEND.
    671673//
    672674// Return the number of numbers in the array.
    673675
    674 int readNumbers(symbol *numbers) {
     676int readNumbers() {
    675677
    676678  char filename[FILENAME_MAX];
    677679  sprintf(filename, "%s/clauses.numbers", collection);
    678680  if (verbosity) {
    679     cout << "Reading numbers from: " << filename << endl;
     681    cout << "Reading numbers file: " << filename << endl;
    680682  }
    681683
     
    687689  }
    688690
     691  // Count the number of symbols
     692  inputLength = 0;
     693  symbol word;
     694  while (inFile >> word) {
     695    inputLength++;
     696  }
     697  inFile.close();
     698
     699  // Allocate the symbols array
     700  if (verbosity > 1) {
     701    cout << "Allocating symbol arrays for " << inputLength << " symbols" << endl;
     702  }
     703  symbols = new (symbol)[inputLength];
     704  if (symbols == NULL) {
     705    cerr << "Suffix error: not enough memory to hold " << inputLength
     706     << " symbols." << endl;
     707    exit(2);
     708  }
     709
    689710  // Read the numbers file into the numbers array
    690   symbol word;
    691   cellcount length = 0;
     711  if (verbosity > 2) {
     712    cout << "Reading the numbers" << endl;
     713  }
     714  inFile.open(filename, ios::in);
     715  cellcount next = 0;
    692716  numberOfDocuments = 0;
    693   while ((inFile >> word) && (length < symbol_array_length)){
    694     numbers[length++] = word;
     717  while (inFile >> word) {
     718    symbols[next++] = word;
    695719    if (word == DOCUMENTSTART) {
    696720      numberOfDocuments++;
    697721    }
    698722  }
    699  
    700   // Make sure we were able to read all the numbers
    701   if (length >= symbol_array_length) {
    702     cerr << "Error: the symbol array is too a short to hold " << filename
    703      << endl << "It is currently set to " <<  symbol_array_length
    704      << " and can be adjusted at the command line." << endl;
    705     exit(1);
    706   }
     723  inFile.close();
    707724 
    708725  // Make sure the numbers file is intact
    709   assert(numbers[0] == COLLECTIONSTART);
    710   assert(numbers[length-1] == COLLECTIONEND);
    711 
    712   // Record the length of the Input file
    713   inputLength = length;
    714 
    715   return length;
     726  assert(symbols[0] == COLLECTIONSTART);
     727  assert(symbols[next-1] == COLLECTIONEND);
     728
     729  return inputLength;
    716730}
    717731
Note: See TracChangeset for help on using the changeset viewer.