Changeset 2867 for trunk/gsdl/src
- Timestamp:
- 2001-11-28T15:07:12+13:00 (22 years ago)
- Location:
- trunk/gsdl/src/phind/generate
- Files:
-
- 1 added
- 3 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/src/phind/generate/Makefile.in
r2696 r2867 43 43 44 44 45 HEADERS = suffix.h phrase.h 45 HEADERS = suffix.h phrase.h check.h 46 46 SOURCES = suffix.cpp phrase.cpp 47 47 OBJECTS = phrase.o suffix.o … … 65 65 suffix : $(OBJECTS) 66 66 $(CXX) $(LDFLAGS) -o $@ $(OBJECTS) 67 68 -
trunk/gsdl/src/phind/generate/suffix.cpp
r2807 r2867 2 2 * 3 3 * Suffix.cpp -- Extract the repeated phrases in the input with suffix 4 * 5 * 4 * and prefix arrays (cgn & gwp's simpler algorithm, 5 * and kjm's improvements). 6 6 * 7 7 * Copyright 2000 Gordon W. Paynter 8 8 * Copyright 2000 The New Zealand Digital Library Project 9 9 * 10 * A component of the Greenstone digital library software 11 * from the New Zealand Digital Library Project at the12 * University of Waikato,New Zealand.10 * A component of the Greenstone digital library software from the 11 * New Zealand Digital Library Project at the University of Waikato, 12 * New Zealand. 13 13 * 14 * This program is free software; you can redistribute it and/or modify15 * it under the terms of the GNU General Public License as published by16 * the Free Software Foundation; either version 2 of the License, or17 * (at your option) any later version.14 * This program is free software; you can redistribute it and/or 15 * modify it under the terms of the GNU General Public License as 16 * published by the Free Software Foundation; either version 2 of 17 * the License, or (at your option) any later version. 18 18 * 19 19 * This program is distributed in the hope that it will be useful, … … 56 56 #include "suffix.h" 57 57 #include "phrase.h" 58 #include "check.h" 58 59 59 60 // Global variables declared in suffix.h … … 63 64 symbol **suffixArray; 64 65 symbol **prefixArray; 65 check *suffixCheck;66 67 // the length of the check array68 cellcount checkLength;69 66 70 67 // How many documents are in this collection? … … 93 90 int pointerCompare(const void *, const void *); 94 91 95 // some bit manipulation functions for the check arrays, defined below96 int getSuffixCheck(cellindex suff);97 void setSuffixCheck(cellindex suff);98 92 99 93 // Functions for implementing "phrase memory". These let us "remember" … … 142 136 // Initialise the candidates, check array, and various variables. 143 137 sort(candidates.begin(), candidates.end(), isShorter); 144 memset(suffixCheck, 0, sizeof(check)*checkLength);145 138 unsigned minimum_length = candidates.begin()->length; 139 clearSuffixCheck(); 146 140 147 141 // Try to add each candidate to the results set, ignoring the non-minimal … … 153 147 bool shorter_found = false; 154 148 155 // Check for shorter and shorter versions of the te nporary phrase149 // Check for shorter and shorter versions of the temporary phrase 156 150 while (temp_phrase.length >= minimum_length && !shorter_found) { 157 151 temp_phrase.ensureSuffixFound(); 158 //if (suffixCheck[temp_phrase.firstSuffixIndex] == 0)159 152 if (getSuffixCheck(temp_phrase.firstSuffixIndex)==0) 160 153 temp_phrase.shortenByOneAtPrefix(); … … 166 159 } 167 160 161 // If no shorter phrase is found, use this one 168 162 if (!shorter_found) { 169 163 results.push_back(*candidate); 170 164 candidate->ensureSuffixFound(); 171 for (cellcount k = candidate->firstSuffixIndex; k <= candidate->lastSuffixIndex; ++k) 172 //suffixCheck[k] = candidate->length; 173 setSuffixCheck(k); 165 setSuffixCheck(candidate->firstSuffixIndex, candidate->lastSuffixIndex); 174 166 } 175 167 } … … 254 246 } 255 247 256 int getSuffixCheck(cellindex suff) {257 cellindex cell = suff >> 3;258 check remainder = suff & 0x07; // the last 3 bits259 if (suffixCheck[cell]& (1 << remainder)) {260 return 1;261 }262 return 0;263 }264 void setSuffixCheck(cellindex suff) {265 cellindex cell = suff >> 3;266 check remainder = suff & 0x07; // the last 3 bits267 suffixCheck[cell] |= (1 << remainder);268 269 }270 248 271 249 // Read the clauses.numbers file into the "symbols" array. … … 735 713 } 736 714 715 737 716 int main (int argc, char * argv[]) { 738 717 … … 762 741 cout << "suffix: the phrase extraction program" << endl; 763 742 } 764 765 743 if (verbosity > 1) { 766 744 if (phraseMode == STOPWORDS) { … … 786 764 else firstContent = firstStopSymbol; 787 765 766 // Allocate memory for the suffix & prefix arrays 788 767 cellcount contentLength = 0; 789 768 contentLength = getContentCount(firstContent); 790 791 // Create the suffix & prefix arrays792 769 suffixArray = new symbol *[contentLength]; 793 770 prefixArray = new symbol *[contentLength]; 794 795 cellcount here=0; 771 if (prefixArray == NULL) { 772 cerr << "Suffix: not enough memory to hold " << inputLength << " symbols." << endl; 773 exit(2); 774 } 775 allocateSuffixCheck(contentLength); 776 796 777 // Initialise prefix and suffix arrays, only use the needed suffixes 797 for (cellcount j = 0 ; j < inputLength; j++) {778 for (cellcount j = 0, here = 0; j < inputLength; j++) { 798 779 if (symbols[j]>=firstContent) { 799 780 suffixArray[here] = &symbols[j]; … … 805 786 qsort(prefixArray, contentLength, sizeof(symbol *), prefixCompare); 806 787 807 checkLength = contentLength/8 + 1;808 suffixCheck = new check[checkLength];809 if (suffixCheck == NULL) {810 cerr << "Suffix error: not enough memory to hold " << inputLength << " symbols." << endl;811 exit(2);812 }813 memset(suffixCheck, 0, sizeof(check)*checkLength);814 815 cout <<"\ngenerating the phrase hierarchy\n\n";816 817 788 // Create the document arrays 818 789 if (verbosity > 1) { … … 829 800 // each phrase occurs in. 830 801 documentArray = new symbol *[numberOfDocuments]; 802 if (documentArray == NULL) { 803 cerr << "Suffix: out of memory allocating document arrays." << endl; 804 exit(2); 805 } 831 806 832 807 // just scan through the input text to find the doc starts … … 857 832 // it and add its expansions to the set of output phrases. 858 833 834 cout <<"\ngenerating the phrase hierarchy\n\n"; 835 859 836 // Store the phrase data in the phrases file 860 837 char phraseDataName[FILENAME_MAX]; -
trunk/gsdl/src/phind/generate/suffix.h
r2487 r2867 46 46 typedef unsigned int frequency; 47 47 48 // The check type is used when we want to store low frequency values.49 // Required range: 0 - 8 (could be recoded to use booleans)50 typedef unsigned char check;51 52 53 48 // Global variables 54 49 … … 62 57 // Suffix and prefix arrays are used to extract phrases 63 58 extern symbol **suffixArray; 64 extern check *suffixCheck;65 59 extern symbol **prefixArray; 66 extern check *prefixCheck;67 60 68 61 // Collection-specific information about the first stopword/content symbols
Note:
See TracChangeset
for help on using the changeset viewer.