/********************************************************************** * * phrase.cpp -- implementation of the phrase object used by suffix.cpp * * Copyright 2000 Gordon W. Paynter * Copyright 2000 The New Zealand Digital Library Project * * A component of the Greenstone digital library software * from the New Zealand Digital Library Project at the * University of Waikato, New Zealand. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * *********************************************************************/ #include #include #if defined(GSDL_USE_STL_H) # include #else # include #endif #if defined(GSDL_USE_IOS_H) # include #else # include #endif #include "suffix.h" #include "phrase.h" // Phrase constructor functions Phrase::Phrase(symbol *words, cellcount size, int direction) { empty(); length = size; if (direction == SUFFIX) { forward = words; back = forward + size - 1; } else { back = words; forward = back - size + 1; } } Phrase::Phrase() { empty(); } Phrase::Phrase(const Phrase &p) { forward = p.forward; back = p.back; length = p.length; suffixFound = p.suffixFound; prefixFound = p.prefixFound; firstSuffix = p.firstSuffix; lastSuffix = p.lastSuffix; firstSuffixIndex = p.firstSuffixIndex; lastSuffixIndex = p.lastSuffixIndex; suffixFrequency = p.suffixFrequency; firstPrefix = p.firstPrefix; lastPrefix = p.lastPrefix; firstPrefixIndex = p.firstPrefixIndex; lastPrefixIndex = p.lastPrefixIndex; prefixFrequency = p.prefixFrequency; uniqueSuffixExtension = p.uniqueSuffixExtension; uniquePrefixExtension = p.uniquePrefixExtension; } // Empty the contents of a phrase int Phrase::empty() { forward = back = NULL; length = 0; suffixFound = prefixFound = 0; firstSuffix = firstPrefix = NULL; lastSuffix = lastPrefix = NULL; firstSuffixIndex = lastSuffixIndex = suffixFrequency = 0; firstPrefixIndex = lastPrefixIndex = prefixFrequency = 0; uniqueSuffixExtension = uniquePrefixExtension = -1; return 0; } // Remove the details of the location where a suffix/prefix // is found in the array. int Phrase::clearSuffix() { suffixFound = 0; firstSuffix = lastSuffix = NULL; firstSuffixIndex = lastSuffixIndex = suffixFrequency = 0; uniqueSuffixExtension = -1; return 0; } int Phrase::clearPrefix() { prefixFound = 0; firstPrefix = lastPrefix = NULL; firstPrefixIndex = lastPrefixIndex = prefixFrequency = 0; uniquePrefixExtension = -1; return 0; } // Increase the length of s phrase // // We increase the length of a string "in place". When we expand // the suffix, however, we invalidate the prefix data, and vice-versa. int Phrase::increaseSuffixLength(cellcount l) { length = l; clearPrefix(); back = forward + l - 1; return 0; } int Phrase::increasePrefixLength(cellcount l) { length = l; clearSuffix(); forward = back - l + 1; return 0; } // Shorten a phrase by one symbol int Phrase::shortenByOneAtSuffix() { --length; --back; if (phraseMode==STOPWORDS) { while (*back >=firstStopSymbol && *back <= lastStopSymbol) { --length; --back; } } clearSuffix(); clearPrefix(); return 0; } int Phrase::shortenByOneAtPrefix() { --length; ++forward; if (phraseMode==STOPWORDS) { while (*forward >=firstStopSymbol && *forward <= lastStopSymbol) { --length; ++forward; } } clearSuffix(); clearPrefix(); return 0; } // Output a phrase to a stream ostream &operator<<(ostream &stream, const Phrase &phrase) { assert(phrase.forward); symbol *s = phrase.forward; stream << "s" << *s++; for (cellcount i = 1; i < phrase.length; ++i) stream << " s" << *s++; return stream; } // Convert the phrase to a string // Note thgat you have to delete the memory yourself. char *Phrase::toString() { assert(forward); char *str; str = new char[length*20]; symbol *s = forward; sprintf(str, "s%d", *s++); for (cellcount i = 1; i < length; ++i) { sprintf(str, "%s s%d", str, *s++); } return str; } // Compare a phrase to an another array of symbols // // Given an array of words with a specified length, // compare those words to this phrase. // // Return 0 if the two phrases are the same over length length // Return 1 if the phrase is "greater" than tyhe words, or // Return -1 if the phrase is less. int Phrase::compareSuffix(symbol *words, cellcount length) { assert(forward); symbol *p = forward; for (cellcount i = 0; i < length; ++i) { if (*p > *words) { return 1; } else if (*p < *words) { return -1; } else { ++*p; ++*words; } } return 0; } int Phrase::comparePrefix(symbol *words, cellcount length) { assert(back); symbol *p = back; for (cellcount i = 0; i < length; ++i) { if (*p > *words) { return -1; } else if (*p < *words) { return 1; } else { --*p; --*words; } } return 0; } // Does the phrase have a unique suffix/prefix extension? // // For suffix: // If uniqueSuffixExtensions is 1, then phrase has some unique expansion. // If it is 0 then it does not. If it is -1 then we don't know, and have // to calculate it from the suffix array. Same goes for prefixes. int Phrase::hasUniqueSuffixExtension() { // if we don't know, then work it out if (uniqueSuffixExtension == -1) { ensureSuffixFound(); // pointers to the phrase's first and last occurances in te suffixArray symbol *fst = firstSuffix + length; symbol *lst = lastSuffix + length; // in ANYPHRASE mode, simply check the next symbol if (phraseMode == ANYPHRASE) { if ((*fst == *lst) && (*fst > LASTDELIMITER)) { uniqueSuffixExtension = 1; } else { uniqueSuffixExtension = 0; } } // in STOPWORDS mode, make sure there is a unique next content symbol else { uniqueSuffixExtension = 0; while ((*fst == *lst) && (*fst > LASTDELIMITER)) { if (*fst >= firstContentSymbol) { uniqueSuffixExtension = 1; break; } ++fst; ++lst; } } } return uniqueSuffixExtension; } int Phrase::hasUniquePrefixExtension() { // if we don't know, then work it out if (uniquePrefixExtension == -1) { ensurePrefixFound(); // pointers to the phrase's first and last occurances in the prefixArray symbol *fst = firstPrefix - length; symbol *lst = lastPrefix - length; // in ANYPHRASE mode, simply check the next symbol if (phraseMode == ANYPHRASE) { if ((*fst == *lst) && (*fst > LASTDELIMITER)) { uniquePrefixExtension = 1; } else { uniquePrefixExtension = 0; } } // in STOPWORDS mode, make sure there is a unique next content symbol else { uniquePrefixExtension = 0; while ((*fst == *lst) && (*fst > LASTDELIMITER)) { if (*fst >= firstContentSymbol) { uniquePrefixExtension = 1; break; } --fst; --lst; } } } return uniquePrefixExtension; } // Expand a phrase with a unique suffix/prefix by 1 symbol // // Note that in STOPWORDS mode a "unique extension" means a unique // extension that ends on a content symbol. Although we may extend // such a phrase by 2 or more symbols, we will only extend it by a // single content symbol. int Phrase::expandUniqueSuffixExtensionByOne() { assert(suffixFound); assert(uniqueSuffixExtension == 1); // in ANYPHRASE mode, expand the phrase by one symbol if (phraseMode == ANYPHRASE) { increaseSuffixLength(length + 1); } // in STOPWORDS mode, expand the phrase up to the next content symbol else { symbol *next = firstSuffix + length; cellcount newlength = length + 1; while (*next < firstContentSymbol) { ++next; ++newlength; } increaseSuffixLength(newlength); } uniqueSuffixExtension = -1; return 0; } int Phrase::expandUniquePrefixExtensionByOne() { assert(prefixFound); assert(uniquePrefixExtension == 1); // in ANYPHRASE mode, expand the phrase by one symbol if (phraseMode == ANYPHRASE) { increasePrefixLength(length + 1); } // in STOPWORDS mode, expand the phrase to the next content symbol else { symbol *next = firstPrefix - length; cellcount newlength = length + 1; while (*next < firstContentSymbol) { --next; ++newlength; } increasePrefixLength(newlength); } uniquePrefixExtension = -1; return 0; } // Find the phrase in a suffixArray // // Given a suffix array, find the first and last occurances of the phrase. // The begin and end parameters identify which cells to search (inclusive). int Phrase::findFirstAndLastSuffix() { return findFirstAndLastSuffix(0, inputLength-1); } int Phrase::findFirstAndLastSuffix(cellindex begin, cellindex end) { // First find *any* occurance of the phrase assert(begin <= end); // if we're only searching one cell, it is very easy if (begin == end) { assert(compareSuffix(suffixArray[begin], length) == 0); firstSuffix = lastSuffix = suffixArray[begin]; firstSuffixIndex = lastSuffixIndex = begin; suffixFrequency = 1; suffixFound = 1; return 0; } cellindex c; int cmp; do { c = (end + begin) / 2; cmp = compareSuffix(suffixArray[c], length); if (cmp == 1) { // target phrase is lower than phrase at suffixArray[c] begin = c + 1; } else if (cmp == -1) { // target phrase is higher than phrase at suffixArray[c] end = c - 1; } } while (cmp != 0); cellindex lastbegin = c; cellindex lastend = end; // Next find the first occurance of the phrase. // We know that the first occurance must be between the // current value of begin and the current value of c. end = c; do { if (begin == end) { c = begin; cmp = 0; } else { c = (begin + end) / 2; cmp = compareSuffix(suffixArray[c], length); if (cmp == 1) { // target phrase is lower than phrase at suffixArray[c] begin = c + 1; } else { assert(cmp == 0); // target phrase is the same as the phrase at suffixArray[c]. However, // to find the first occurance, suffixArray[c] must be the same as the // phrase, but suffixArray[c-1] must be different. if (c>0) { cmp = compareSuffix(suffixArray[c-1], length); if (cmp == 0) { end = c - 1; assert(end >= begin); cmp = 1; } else { cmp = 0; } } } } } while (cmp != 0); // we have found the first location firstSuffixIndex = c; firstSuffix = suffixArray[c]; // Next find the last occurance of the phrase. // We previously stored some bounds for its loccation. begin = lastbegin; end = lastend; do { if (begin == end) { c = begin; cmp = 0; } else { c = (begin + end) / 2; cmp = compareSuffix(suffixArray[c], length); if (cmp == -1) { // target phrase is greater than phrase at suffixArray[c] end = c - 1; } else { assert(cmp == 0); // target phrase is the same as the phrase at suffixArray[c]. However, // to find the last occurance, one of two additional condiditons must be met: // either c must be the very last cell in the array, or // suffixArray[c+1] must be different from phrase if (c < inputLength-1) { cmp = compareSuffix(suffixArray[c+1], length); if (cmp == 0) { begin = c + 1; cmp = -1; } else { cmp = 0; } } } } } while (cmp != 0); lastSuffixIndex = c; lastSuffix = suffixArray[c]; suffixFrequency = lastSuffixIndex - firstSuffixIndex + 1; suffixFound = 1; return 0; } // Find the phrase in a prefix array // // Given a prefix array, find the first and last occurances of the phrase. // The begin and end parameters identify which cells to search (inclusive). int Phrase::findFirstAndLastPrefix() { return findFirstAndLastPrefix(0, inputLength-1); } int Phrase::findFirstAndLastPrefix(cellindex begin, cellindex end) { // First find *any* occurance of the phrase assert(begin <= end); // if we're only searching one cell, it is very easy if (begin == end) { assert(comparePrefix(prefixArray[begin], length) == 0); firstPrefix = lastPrefix = prefixArray[begin]; firstPrefixIndex = lastPrefixIndex = begin; prefixFrequency = 1; prefixFound = 1; return 0; } cellindex c; int cmp; do { c = (end + begin) / 2; cmp = comparePrefix(prefixArray[c], length); if (cmp == 1) { begin = c + 1; } else if (cmp == -1) { end = c - 1; } } while (cmp != 0); cellindex lastbegin = c; cellindex lastend = end; // Next find the first occurance of the phrase. // We know that the first occurance must be between the // current value of begin and the cureent value of c. end = c; do { if (begin == end) { c = begin; cmp = 0; } else { c = (begin + end) / 2; cmp = comparePrefix(prefixArray[c], length); if (cmp == 1) { // target phrase is lower than phrase at prefixArray[c] begin = c + 1; } else { assert(cmp == 0); // target phrase is the same as the phrase at prefixArray[c]. However, to // find the first occurance, one of two additional condiditons must be met: // either c == the first cell in prefix array // or prefixArray[c-1] != phrase. if (c > 0) { cmp = comparePrefix(prefixArray[c-1], length); if (cmp == 0) { end = c - 1; assert(end >= begin); cmp = 1; } else { cmp = 0; } } } } } while (cmp != 0); // we have found the first location firstPrefixIndex = c; firstPrefix = prefixArray[c]; // Next find the last occurance of the phrase. // We previously stored some bounds for its loccation. begin = lastbegin; end = lastend; do { if (begin == end) { c = begin; cmp = 0; } else { c = (begin + end) / 2; cmp = comparePrefix(prefixArray[c], length); if (cmp == -1) { // target phrase is greater than phrase at prefixArray[c] end = c - 1; } else { assert(cmp == 0); // target phrase is the same as the phrase at prefixArray[c]. However, // to find the last occurance, prefixArray[c] must be the same as the // phrase, but prefixArray[c+1] must be different. cmp = comparePrefix(prefixArray[c+1], length); if (cmp == 0) { begin = c + 1; cmp = -1; } else { cmp = 0; } } } } while (cmp != 0); lastPrefixIndex = c; lastPrefix = prefixArray[c]; prefixFrequency = lastPrefixIndex - firstPrefixIndex + 1; prefixFound = 1; return 0; } // Calculate a set of initial suffix/prefix candidates // // Expand the phrase to find the initial // set of candidates that may be extensions of that phrase, // and add them to the end of the results vector void Phrase::initialSuffixCandidates(vector &results) { ensureSuffixFound(); Phrase next; cellindex i = firstSuffixIndex; // Find all the expansions of Phrase p while (i <= lastSuffixIndex) { // create new phrase exactly one longer than the current one next = newPhraseShortestSuffixExpansion(i); // If the expansion occurs more than once and is not delimited, expand it if ((*(next.back) > LASTDELIMITER) && (next.suffixFrequency >= minOccurs)) { next.expandWhileUniqueSuffixExtension(); results.push_back(next); } // Move onto the next expansion i = next.lastSuffixIndex + 1; } } void Phrase::initialPrefixCandidates(vector &results) { ensurePrefixFound(); Phrase next; cellindex i = firstPrefixIndex; // Find all the expansions of Phrase p while (i <= lastPrefixIndex) { // create new phrase exactly one longer than the current one next = newPhraseShortestPrefixExpansion(i); // If the expansion occurs more than once and is not delimited, expand it if ((*(next.forward) > LASTDELIMITER) && (next.prefixFrequency >= minOccurs)) { next.expandWhileUniquePrefixExtension(); results.push_back(next); } // Move onto the next expansion i = next.lastPrefixIndex + 1; } } // Create a new phrase that is longer then the current one, but as short // as possible. In ANYPHRASE mode this means the new phrase is exactly // one symbol longer than the current one, but in STOPWORD mode the expansion // must introduce a new content symbol // // Note that we never expand across delimiters; if we encounter a delimiter // then this becomes the "end" symbol of the phrase. The calling function // should check for this condition. Phrase Phrase::newPhraseShortestSuffixExpansion(cellindex i) { // create a new phrase exactly one symbol longer than the current one Phrase p(suffixArray[i], length + 1, SUFFIX); if (phraseMode == STOPWORDS) { // in STOPWORDS mode we must introduce a new content symbol while ((*(p.back) >= firstStopSymbol) && (*(p.back) <= lastStopSymbol)) { p.increaseSuffixLength(p.length + 1); } } p.findFirstAndLastSuffix(i, lastSuffixIndex); return p; } Phrase Phrase::newPhraseShortestPrefixExpansion(cellindex i) { // create a new phrase exactly one symbol longer than the current one Phrase p(prefixArray[i], length + 1, PREFIX); if (phraseMode == STOPWORDS) { // in STOPWORDS mode we must introduce a new content symbol while ((*(p.forward) >= firstStopSymbol) && (*(p.forward) <= lastStopSymbol)) { p.increasePrefixLength(p.length + 1); } } p.findFirstAndLastPrefix(i, lastPrefixIndex); return p; } // Expand a phrase until it no longer has a unique suffix extension // // If the phrase only occurs once in the suffix array, do nothing. // // If we are in stopwords mode then we expand if there is a unique // suffix expansion and this expansion is a content word. We do this // with the content_len variable, which holds the length of the longest // phrase ending in a content word. int Phrase::expandWhileUniqueSuffixExtension() { ensureSuffixFound(); // if the phrase occurs only once, do nothing if (suffixFrequency < minOccurs) return 0; // if we already know there's no unique suffix extension, do nothing if (uniqueSuffixExtension == 0) return 0; // count the length over which the cells are the same // and no delimiter is crossed. symbol *fst = firstSuffix; symbol *lst = lastSuffix; cellindex len = 0; cellindex content_len = 0; // Calculate the length over which the phrases match while((*fst == *lst) && (*fst > LASTDELIMITER)) { ++len; if (*fst > lastStopSymbol) content_len = len; ++fst; ++lst; } // in ANYPHRASE mode, expand the phrase for as long as possible if (phraseMode == ANYPHRASE) { if (len > length) { increaseSuffixLength(len); } } // in STOPWORDS mode, expand only as far as the last contentword else { if (content_len > length) { increaseSuffixLength(content_len); } } uniqueSuffixExtension = 0; return 0; } int Phrase::expandWhileUniquePrefixExtension() { ensurePrefixFound(); // if the phrase occurs only once, do nothing if (prefixFrequency < minOccurs) return 0; // if we already know there's no unique extension, do nothing if (uniquePrefixExtension == 0) return 0; // count the length over which the cells are the same // and no delimiter is crossed. symbol *fst = firstPrefix; symbol *lst = lastPrefix; cellindex len = 0; cellindex content_len = 0; // Calculate the length over which the phrases match while((*fst == *lst) && (*fst > LASTDELIMITER)) { ++len; if (*fst > lastStopSymbol) content_len = len; --fst; --lst; } // in ANYPHRASE mode, expand the phrase for as long as possible if (phraseMode == ANYPHRASE) { if (len > length) { increasePrefixLength(len); } } // in STOPWORDS mode, expand only as far as the last contentword else { if (content_len > length) { increasePrefixLength(content_len); } } uniquePrefixExtension = 0; return 0; } // Compare the length of two phrases // // Given two phrases, return true if the first is shorter/longer, // otherwise return false. For use in various sort functions. bool isShorter(Phrase p1, Phrase p2) { if (p1.length < p2.length) { return true; } return false; } bool isLonger(Phrase p1, Phrase p2) { if (p1.length > p2.length) { return true; } return false; }