Timestamp:
2001-06-01T14:51:29+12:00
Author:
sjboddie
Message:

Changes to get phind working under Windows

File: 1 edited

Legend: unmodified lines are shown with a leading space, removed lines with a leading '-', and added lines with a leading '+'.
  • trunk/gsdl/src/phind/generate/suffix.cpp

--- trunk/gsdl/src/phind/generate/suffix.cpp (r1882)
+++ trunk/gsdl/src/phind/generate/suffix.cpp (r2487)
@@ -28,6 +28,4 @@
 
 #include <assert.h>
-#include <fstream.h>
-#include <iostream.h>
 #include <math.h>
 #include <stdio.h>
     
@@ -35,12 +33,28 @@
 #include <string.h>
 
-#include <algo.h>
-#include <heap.h>
-#include <vector.h>
+#if defined(GSDL_USE_IOS_H)
+#  include <fstream.h>
+#  include <iostream.h>
+#else
+#  include <fstream>
+#  include <iostream>
+#endif
+
+#if defined(GSDL_USE_STL_H)
+#  if defined(GSDL_USE_ALGO_H)
+#    include <algo.h>
+#  else
+#    include <algorithm.h>
+#  endif
+#  include <vector.h>
+#else
+#  include <algorithm>
+#  include <vector>
+#endif
+#include <stl_heap.h>
+
 
 #include "suffix.h"
 #include "phrase.h"
-
-
 
 // Global variables declared in suffix.h
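The include changes above are the heart of the Windows port: the pre-standard headers fstream.h, iostream.h, algo.h, heap.h and vector.h are replaced by a compile-time choice between the old-style and the ISO C++ headers, driven by the GSDL_USE_IOS_H, GSDL_USE_STL_H and GSDL_USE_ALGO_H macros. Those macros are assumed to come from Greenstone's build configuration rather than from this file. A minimal, stand-alone sketch of the same pattern:

    // Sketch only: GSDL_USE_IOS_H is assumed to be set (or left unset) by the build system.
    #if defined(GSDL_USE_IOS_H)
    #  include <iostream.h>    // pre-standard header, names live in the global namespace
    #else
    #  include <iostream>      // ISO C++ header, names live in namespace std
       using namespace std;    // keep unqualified cout/cerr/endl compiling
    #endif

    int main() {
      cout << "iostream works under either header style" << endl;
      return 0;
    }

A build whose compiler ships only the standard headers simply leaves the macro undefined; a build on an older compiler that has only the .h forms defines it.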
     
@@ -138,8 +152,8 @@
 
   // Create the suffix & prefix arrays
-  suffixArray = new (symbol *)[inputLength];
-  prefixArray = new (symbol *)[inputLength];
-  suffixCheck = new (check)[inputLength];
-  prefixCheck = new (check)[inputLength];
+  suffixArray = new symbol *[inputLength];
+  prefixArray = new symbol *[inputLength];
+  suffixCheck = new check[inputLength];
+  prefixCheck = new check[inputLength];
   if (prefixCheck == NULL) {
     cerr << "Suffix error: not enough memory to hold " << inputLength
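The allocation rewrite above is a syntax fix rather than a behaviour change: new (symbol *)[inputLength] puts the type in parentheses, a form that standard C++ does not allow to be followed by an array dimension, and Visual C++ rejects it; new symbol *[inputLength] is the ordinary array-new. A small stand-alone illustration (the symbol typedef here is a placeholder, not the real type from suffix.h):

    #include <cstddef>

    typedef unsigned int symbol;           // placeholder for the type defined in suffix.h

    int main() {
      const std::size_t inputLength = 100;

      // symbol **bad = new (symbol *)[inputLength];    // non-standard form; fails under Visual C++
      symbol **suffixArray = new symbol *[inputLength];  // array of inputLength pointers to symbol

      delete [] suffixArray;                             // array new pairs with array delete
      return 0;
    }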
     
@@ -169,10 +183,10 @@
   // each phrase occurs in each document.  The number of documents in
   // which a phrase occurs is stored in df.
-  frequency documentFrequency[numberOfDocuments];
+  frequency *documentFrequency = new frequency[numberOfDocuments];
   frequency df;
 
   // documentArray will be searched in order to discover which document
   // each phrase occurs in.
-  documentArray = new (symbol *)[numberOfDocuments];
+  documentArray = new symbol *[numberOfDocuments];
 
   // Discover all the DOCUMENTSTART symbols and store as a phrase
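The documentFrequency change fixes another portability problem: frequency documentFrequency[numberOfDocuments] is an array whose size is only known at run time, which GCC accepts as an extension but Visual C++ does not, so the array is now allocated with new[] and released with the matching delete [] added further down in this changeset. A hedged stand-alone sketch of the same fix, with the frequency typedef assumed:

    typedef unsigned int frequency;        // placeholder for the typedef in suffix.h

    void tally(int numberOfDocuments) {
      // frequency documentFrequency[numberOfDocuments];   // run-time size: rejected by Visual C++

      frequency *documentFrequency = new frequency[numberOfDocuments];
      for (int d = 0; d < numberOfDocuments; d++) {
        documentFrequency[d] = 0;          // new[] does not zero the array
      }
      // ... count occurrences per document ...
      delete [] documentFrequency;         // matches the delete [] added in this changeset
    }

    int main() { tally(10); return 0; }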
     
@@ -250,14 +264,14 @@
   // Iterate over the different symbols by working through the suffix array
   vector<Phrase> result;
-  cellindex i = 0;
+  cellindex ij = 0;
   char *tmpString;
 
-  while (i < inputLength) {
+  while (ij < inputLength) {
 
     // make a new phrase of length 1
-    p = Phrase(suffixArray[i], 1, SUFFIX);
-    p.findFirstAndLastSuffix(i, inputLength-1);
-
-    // cout << "cell " << i << " - " << p.toString() << endl;
+    p = Phrase(suffixArray[ij], 1, SUFFIX);
+    p.findFirstAndLastSuffix(ij, inputLength-1);
+
+    // cout << "cell " << ij << " - " << p.toString() << endl;
 
     // We ignore this symbol if it occurs only once, if it is a delimiter,
     
@@ -271,7 +285,7 @@
     // it explodes the size of the indexes.  So: would it be useful?
     if (!((p.suffixFrequency <= 1) ||
-      // (*suffixArray[i] != 23054) ||
-      (*suffixArray[i] <= LASTDELIMITER) ||
-      ((phraseMode == STOPWORDS) && (*suffixArray[i] <= lastStopSymbol)))) {
+      // (*suffixArray[ij] != 23054) ||
+      (*suffixArray[ij] <= LASTDELIMITER) ||
+      ((phraseMode == STOPWORDS) && (*suffixArray[ij] <= lastStopSymbol)))) {
 
       // Get minimal expansions of the phrase
     
@@ -281,19 +295,19 @@
 
     // Remember that we have expanded this phrase
-    rememberThisPhrase(i, 1);
+    rememberThisPhrase(ij, 1);
 
     // write the phrase text
     tmpString = p.toString();
-    phraseData << i << "-1:" << tmpString << ":" << p.suffixFrequency << ":"
+    phraseData << ij << "-1:" << tmpString << ":" << p.suffixFrequency << ":"
            << result.size() << ":";
     delete [] tmpString;
 
     // write the results
-    for (cellcount i = 0; i < result.size(); i++) {
-      if (i) {
+    for (cellcount k = 0; k < result.size(); k++) {
+      if (k) {
         phraseData << ",";
       }
-      phraseData << result[i].firstSuffixIndex << "-" << result[i].length;
-      outPhrase << result[i].firstSuffixIndex << " " << result[i].length << endl;
+      phraseData << result[k].firstSuffixIndex << "-" << result[k].length;
+      outPhrase << result[k].firstSuffixIndex << " " << result[k].length << endl;
       outPhraseCounter++;
     }
     
@@ -305,6 +319,6 @@
 
     // write the documents
-    for (cellcount i = 0, first = 1; i < numberOfDocuments; i++) {
-      if (documentFrequency[i]) {
+    for (cellcount m = 0, first = 1; m < numberOfDocuments; m++) {
+      if (documentFrequency[m]) {
         if (first) {
           first = 0;
     
@@ -315,9 +329,9 @@
         // N documents from 0 to N-1, but later they'll be 1-N.  Thus we
         // add 1 to the document id when we output it.
-        phraseData << "d" << (i+1);
+        phraseData << "d" << (m+1);
         // Next, output the frequency with which the document occurs, but
         // only if it is > 1.
-        if (documentFrequency[i] > 1) {
-          phraseData << "," << documentFrequency[i];
+        if (documentFrequency[m] > 1) {
+          phraseData << "," << documentFrequency[m];
         }
       }
     
@@ -338,5 +352,5 @@
       }
     }
-   i = p.lastSuffixIndex + 1;
+   ij = p.lastSuffixIndex + 1;
   }
  outPhrase.close();
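The loop-counter renames in this and the surrounding hunks (i becoming ij, k, m, ik, ijk and j) are most likely needed because Visual C++ 6 still uses the pre-standard scoping rule in which a variable declared in a for-initialiser remains visible after the loop. With several for (cellcount i = ...) loops in one block, plus an enclosing counter that is also called i, that old rule either produces redefinition errors or makes later uses of i refer to the leaked loop variable, so each counter gets a distinct name. A small illustration of the conflict, with invented names:

    // Under VC++ 6's old rule the counter of the first loop is still in scope
    // afterwards, so re-declaring it in the second loop is a redefinition error.
    int sumTwice(int n) {
      int total = 0;

      for (int i = 0; i < n; i++) {
        total += i;
      }

      // for (int i = 0; i < n; i++) { total += i; }   // fails to compile under VC++ 6

      for (int k = 0; k < n; k++) {                     // distinct name builds everywhere
        total += k;
      }
      return total;
    }

    int main() { return sumTwice(5) == 20 ? 0 : 1; }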
     
@@ -472,4 +486,5 @@
   deletePhraseMemory();
 
+  delete [] documentFrequency;
   delete [] symbols;
   delete [] suffixArray;
     
@@ -548,6 +563,6 @@
       suffixCheck[i] = c.length;
     }
-    for (cellcount i = c.firstPrefixIndex; i <= c.lastPrefixIndex; i++) {
-      prefixCheck[i] = c.length;
+    for (cellcount ik = c.firstPrefixIndex; ik <= c.lastPrefixIndex; ik++) {
+      prefixCheck[ik] = c.length;
     }
       }
     
@@ -578,6 +593,6 @@
       suffixCheck[i] = c.length;
     }
-    for (cellcount i = c.firstPrefixIndex; i <= c.lastPrefixIndex; i++) {
-      prefixCheck[i] = c.length;
+    for (cellcount ijk = c.firstPrefixIndex; ijk <= c.lastPrefixIndex; ijk++) {
+      prefixCheck[ijk] = c.length;
     }
 
     
@@ -701,5 +716,5 @@
     cout << "Allocating symbol arrays for " << inputLength << " symbols" << endl;
   }
-  symbols = new (symbol)[inputLength];
+  symbols = new symbol[inputLength];
   if (symbols == NULL) {
     cerr << "Suffix error: not enough memory to hold " << inputLength
     
@@ -757,9 +772,9 @@
 
   // search for the document in which each occurence of the phrase is found
-  for (cellcount i = p.firstSuffixIndex; i <= p.lastSuffixIndex; i++) {
+  for (cellcount j = p.firstSuffixIndex; j <= p.lastSuffixIndex; j++) {
 
-    // cout << "looking for phrase at suffixArray[" << i << "]\n";
+    // cout << "looking for phrase at suffixArray[" << j << "]\n";
 
-    target = suffixArray[i];
+    target = suffixArray[j];
     begin = 0;
     end = numberOfDocuments - 1;
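For context, the loop above visits every occurrence of the current phrase (suffixArray entries firstSuffixIndex through lastSuffixIndex) and, according to the surrounding comments, works out which document each occurrence falls in by searching documentArray, which holds a pointer to each DOCUMENTSTART symbol; begin and end set the bounds of that search. The body of the search is not shown in this changeset, so the following is only a hypothetical sketch of such a lookup (a binary search for the last document starting at or before the target); every name here is an assumption:

    typedef unsigned int symbol;   // placeholder for the type defined in suffix.h

    // Hypothetical sketch: documentArray[d] points at document d's DOCUMENTSTART
    // symbol, in increasing address order; return the document containing target.
    int findDocument(symbol **documentArray, int numberOfDocuments, symbol *target) {
      int begin = 0;
      int end = numberOfDocuments - 1;
      int found = 0;

      while (begin <= end) {
        int mid = begin + (end - begin) / 2;
        if (documentArray[mid] <= target) {  // document mid starts at or before target
          found = mid;                       // remember it, keep looking to the right
          begin = mid + 1;
        } else {
          end = mid - 1;
        }
      }
      return found;                          // last document starting at or before target
    }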
     
@@ -864,5 +879,5 @@
 void initialisePhraseMemory() {
 
-  phraseMemory = new (unsigned char)[inputLength];
+  phraseMemory = new unsigned char[inputLength];
 
   // to begin with, everything is empty