Changeset 2867 for trunk/gsdl/src


Ignore:
Timestamp:
2001-11-28T15:07:12+13:00 (22 years ago)
Author:
paynter
Message:

Moved all the suffixCheck functionality into the check.h header and
inlined it.

Location:
trunk/gsdl/src/phind/generate
Files:
1 added
3 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/src/phind/generate/Makefile.in

    r2696 r2867  
    4343
    4444
    45 HEADERS = suffix.h phrase.h
     45HEADERS = suffix.h phrase.h check.h
    4646SOURCES = suffix.cpp phrase.cpp
    4747OBJECTS = phrase.o suffix.o
     
    6565suffix : $(OBJECTS)
    6666    $(CXX) $(LDFLAGS) -o $@ $(OBJECTS)
     67
     68
  • trunk/gsdl/src/phind/generate/suffix.cpp

    r2807 r2867  
    22 *
    33 * Suffix.cpp -- Extract the repeated phrases in the input with suffix
    4  *                and prefix arrays (cgn & gwp's simpler algorithm,
    5  *                 and kjm's improvements).
     4 *               and prefix arrays (cgn & gwp's simpler algorithm,
     5 *               and kjm's improvements).
    66 *
    77 * Copyright 2000 Gordon W. Paynter
    88 * Copyright 2000 The New Zealand Digital Library Project
    99 *
    10  * A component of the Greenstone digital library software
    11  * from the New Zealand Digital Library Project at the
    12  * University of Waikato, New Zealand.
     10 * A component of the Greenstone digital library software from the
     11 * New Zealand Digital Library Project at the University of Waikato,
     12 * New Zealand.
    1313 *
    14  * This program is free software; you can redistribute it and/or modify
    15  * it under the terms of the GNU General Public License as published by
    16  * the Free Software Foundation; either version 2 of the License, or
    17  * (at your option) any later version.
     14 * This program is free software; you can redistribute it and/or
     15 * modify it under the terms of the GNU General Public License as
     16 * published by the Free Software Foundation; either version 2 of
     17 * the License, or (at your option) any later version.
    1818 *
    1919 * This program is distributed in the hope that it will be useful,
     
    5656#include "suffix.h"
    5757#include "phrase.h"
     58#include "check.h"
    5859
    5960// Global variables declared in suffix.h
     
    6364symbol  **suffixArray;
    6465symbol  **prefixArray;
    65 check    *suffixCheck;
    66 
    67 // the length of the check array
    68 cellcount checkLength;
    6966
    7067// How many documents are in this collection?
     
    9390int pointerCompare(const void *, const void *);
    9491
    95 // some bit manipulation functions for the check arrays, defined below
    96 int getSuffixCheck(cellindex suff);
    97 void setSuffixCheck(cellindex suff);
    9892                                           
    9993// Functions for implementing "phrase memory".  These let us "remember"
     
    142136  // Initialise the candidates, check array, and various variables.
    143137  sort(candidates.begin(), candidates.end(), isShorter);
    144   memset(suffixCheck, 0, sizeof(check)*checkLength);
    145138  unsigned minimum_length = candidates.begin()->length;
     139  clearSuffixCheck();
    146140 
    147141  // Try to add each candidate to the results set, ignoring the non-minimal
     
    153147    bool shorter_found = false;
    154148   
    155     // Check for shorter and shorter versions of the tenporary phrase
     149    // Check for shorter and shorter versions of the temporary phrase
    156150    while (temp_phrase.length >= minimum_length && !shorter_found) {
    157151      temp_phrase.ensureSuffixFound();
    158       //if (suffixCheck[temp_phrase.firstSuffixIndex] == 0)
    159152      if (getSuffixCheck(temp_phrase.firstSuffixIndex)==0)
    160153    temp_phrase.shortenByOneAtPrefix();
     
    166159    }
    167160     
     161    // If no shorter phrase is found, use this one
    168162    if (!shorter_found) {
    169163      results.push_back(*candidate);
    170164      candidate->ensureSuffixFound();
    171       for (cellcount k = candidate->firstSuffixIndex; k <= candidate->lastSuffixIndex; ++k)
    172     //suffixCheck[k] = candidate->length;
    173     setSuffixCheck(k);
     165      setSuffixCheck(candidate->firstSuffixIndex, candidate->lastSuffixIndex);
    174166    }
    175167  }
     
    254246}
    255247
    256 int getSuffixCheck(cellindex suff) {
    257   cellindex cell = suff >> 3;
    258   check remainder = suff & 0x07; // the last 3 bits
    259   if (suffixCheck[cell]& (1 << remainder)) {
    260     return 1;
    261   }
    262   return 0;
    263 }                                                                               
    264 void setSuffixCheck(cellindex suff) {
    265   cellindex cell = suff >> 3;
    266   check remainder = suff & 0x07; // the last 3 bits
    267   suffixCheck[cell] |= (1 << remainder);
    268 
    269 
    270248
    271249// Read the clauses.numbers file into the "symbols" array.
     
    735713}
    736714
     715
    737716int main (int argc, char * argv[]) {
    738717
     
    762741    cout << "suffix: the phrase extraction program" << endl;
    763742  }
    764 
    765743  if (verbosity > 1) {
    766744    if (phraseMode == STOPWORDS) {
     
    786764  else firstContent = firstStopSymbol;
    787765
     766  // Allocate memory for the suffix & prefix arrays
    788767  cellcount contentLength = 0;
    789768  contentLength = getContentCount(firstContent);
    790 
    791   // Create the suffix & prefix arrays
    792769  suffixArray = new symbol *[contentLength];
    793770  prefixArray = new symbol *[contentLength];
    794 
    795   cellcount here=0;
     771  if (prefixArray == NULL) {
     772    cerr << "Suffix: not enough memory to hold " << inputLength << " symbols." << endl;
     773    exit(2);
     774  } 
     775  allocateSuffixCheck(contentLength);
     776
    796777  // Initialise prefix and suffix arrays, only use the needed suffixes
    797   for (cellcount j = 0; j < inputLength; j++) {
     778  for (cellcount j = 0, here = 0; j < inputLength; j++) {
    798779    if (symbols[j]>=firstContent) {
    799780      suffixArray[here] = &symbols[j];
     
    805786  qsort(prefixArray, contentLength, sizeof(symbol *), prefixCompare);
    806787
    807   checkLength = contentLength/8 + 1;
    808   suffixCheck = new check[checkLength];
    809   if (suffixCheck == NULL) {
    810     cerr << "Suffix error: not enough memory to hold " << inputLength << " symbols." << endl;
    811     exit(2);
    812   } 
    813   memset(suffixCheck, 0, sizeof(check)*checkLength);
    814 
    815   cout <<"\ngenerating the phrase hierarchy\n\n";
    816  
    817788  // Create the document arrays
    818789  if (verbosity > 1) {
     
    829800  // each phrase occurs in.
    830801  documentArray = new symbol *[numberOfDocuments]; 
     802  if (documentArray == NULL) {
     803    cerr << "Suffix: out of memory allocating document arrays." << endl;
     804    exit(2);
     805  } 
    831806
    832807  // just scan through the input text to find the doc starts
     
    857832  // it and add its expansions to the set of output phrases.
    858833
     834  cout <<"\ngenerating the phrase hierarchy\n\n";
     835 
    859836  // Store the phrase data in the phrases file
    860837  char phraseDataName[FILENAME_MAX];
  • trunk/gsdl/src/phind/generate/suffix.h

    r2487 r2867  
    4646typedef unsigned int frequency;
    4747
    48 // The check type is used when we want to store low frequency values.
    49 // Required range: 0 - 8 (could be recoded to use booleans)
    50 typedef unsigned char check;
    51 
    52 
    5348// Global variables
    5449
     
    6257// Suffix and prefix arrays are used to extract phrases
    6358extern symbol  **suffixArray;
    64 extern check    *suffixCheck;
    6559extern symbol  **prefixArray;
    66 extern check    *prefixCheck;
    6760
    6861// Collection-specific information about the first stopword/content symbols
Note: See TracChangeset for help on using the changeset viewer.