source: trunk/gsdl/src/phind/generate/phrase.cpp@ 2487

Last change on this file since 2487 was 2487, checked in by sjboddie, 23 years ago

Changes to get phind working under windows

  • Property svn:keywords set to Author Date Id Revision
File size: 19.7 KB
Line 
1/**********************************************************************
2 *
3 * phrase.cpp -- implementation of the phrase object used by suffix.cpp
4 *
5 * Copyright 2000 Gordon W. Paynter
6 * Copyright 2000 The New Zealand Digital Library Project
7 *
8 * A component of the Greenstone digital library software
9 * from the New Zealand Digital Library Project at the
10 * University of Waikato, New Zealand.
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 *
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, write to the Free Software
24 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 *
26 *********************************************************************/
27
28#include <assert.h>
29#include <stdio.h>
30
31#if defined(GSDL_USE_STL_H)
32# include <vector.h>
33#else
34# include <vector>
35#endif
36
37#if defined(GSDL_USE_IOS_H)
38# include <iostream.h>
39#else
40# include <iostream>
41#endif
42
43#include "suffix.h"
44#include "phrase.h"
45
46// Phrase constructor functions
47
48Phrase::Phrase(symbol *words, cellcount size, int direction) {
49 empty();
50 length = size;
51
52 if (direction == SUFFIX) {
53 forward = words;
54 back = forward + size - 1;
55 } else {
56 back = words;
57 forward = back - size + 1;
58 }
59}
60
61Phrase::Phrase() {
62 empty();
63}
64
65
66// Empty the contents of a phrase
67
68int Phrase::empty() {
69
70 forward = back = NULL;
71 length = 0;
72
73 suffixFound = prefixFound = 0;
74 firstSuffix = firstPrefix = NULL;
75 lastSuffix = lastPrefix = NULL;
76 suffixFrequency = prefixFrequency = 0;
77 uniqueSuffixExtension = uniquePrefixExtension = -1;
78
79 return 0;
80}
81
82// Remove the details of the location where a suffix/prefix
83// is found in the array.
84
85int Phrase::clearSuffix() {
86 suffixFound = 0;
87 firstSuffix = NULL;
88 lastSuffix = NULL;
89 suffixFrequency = 0;
90 uniqueSuffixExtension = -1;
91 return 0;
92}
93
94int Phrase::clearPrefix() {
95 prefixFound = 0;
96 firstPrefix = NULL;
97 lastPrefix = NULL;
98 prefixFrequency = 0;
99 uniquePrefixExtension = -1;
100 return 0;
101}
102
103// Increase the length of s phrase
104//
105// We increase the length of a string "in place". When we expand
106// the suffix, however, we invalidate the prefix data, and vice-versa.
107
108int Phrase::increaseSuffixLength(cellcount l) {
109 length = l;
110 clearPrefix();
111 back = forward + l - 1;
112 return 0;
113}
114
115int Phrase::increasePrefixLength(cellcount l) {
116 length = l;
117 clearSuffix();
118 forward = back - l + 1;
119 return 0;
120}
121
122
123// Convert the phrase to a string
124char *Phrase::toString() {
125
126 assert(forward);
127
128 char *str;
129 str = new char[length*20];
130 symbol *s = forward;
131 sprintf(str, "s%d", *s++);
132
133 for (cellcount i = 1; i < length; i++) {
134 sprintf(str, "%s s%d", str, *s++);
135 }
136
137 return str;
138}
139
140
141// Compare a phrase to an another array of symbols
142//
143// Given an array of words with a specified length,
144// compare those words to this phrase.
145//
146// Return 0 if the two phrases are the same over length length
147// Return 1 if the phrase is "greater" than tyhe words, or
148// Return -1 if the phrase is less.
149
150int Phrase::compareSuffix(symbol *words, cellcount length) {
151 assert(forward);
152 symbol *p = forward;
153
154 for (cellcount i = 0; i < length; i++) {
155 if (*p > *words) {
156 return 1;
157 } else if (*p < *words) {
158 return -1;
159 } else {
160 *p++;
161 *words++;
162 }
163 }
164
165 return 0;
166
167}
168
169int Phrase::comparePrefix(symbol *words, cellcount length) {
170
171 assert(back);
172 symbol *p = back;
173
174 for (cellcount i = 0; i < length; i++) {
175 if (*p > *words) {
176 return -1;
177 } else if (*p < *words) {
178 return 1;
179 } else {
180 *p--;
181 *words--;
182 }
183 }
184
185 return 0;
186
187}
188
189
190// Does the phrase have a unique suffix/prefix extension?
191//
192// For suffix:
193// If uniqueSuffixExtensions is 1, then phrase has some unique expansion.
194// If it is 0 then it does not. If it is -1 then we don't know, and have
195// to calculate it from the suffix array. Same goes for prefixes.
196
197int Phrase::hasUniqueSuffixExtension() {
198
199 // if we don't know, then work it out
200 if (uniqueSuffixExtension == -1) {
201
202 ensureSuffixFound();
203
204 // pointers to the phrase's first and last occurances in te suffixArray
205 symbol *fst = firstSuffix + length;
206 symbol *lst = lastSuffix + length;
207
208 // in ANYPHRASE mode, simply check the next symbol
209 if (phraseMode == ANYPHRASE) {
210 if ((*fst == *lst) && (*fst > LASTDELIMITER)) {
211 uniqueSuffixExtension = 1;
212 } else {
213 uniqueSuffixExtension = 0;
214 }
215 }
216 // in STOPWORDS mode, make sure there is a unique next content symbol
217 else {
218 uniqueSuffixExtension = 0;
219 while ((*fst == *lst) && (*fst > LASTDELIMITER)) {
220 if (*fst >= firstContentSymbol) {
221 uniqueSuffixExtension = 1;
222 break;
223 }
224 fst++;
225 lst++;
226 }
227 }
228 }
229
230 return uniqueSuffixExtension;
231}
232
233int Phrase::hasUniquePrefixExtension() {
234
235 // if we don't know, then work it out
236 if (uniquePrefixExtension == -1) {
237
238 ensurePrefixFound();
239
240 // pointers to the phrase's first and last occurances in the prefixArray
241 symbol *fst = firstPrefix - length;
242 symbol *lst = lastPrefix - length;
243
244 // in ANYPHRASE mode, simply check the next symbol
245 if (phraseMode == ANYPHRASE) {
246 if ((*fst == *lst) && (*fst > LASTDELIMITER)) {
247 uniquePrefixExtension = 1;
248 } else {
249 uniquePrefixExtension = 0;
250 }
251 }
252 // in STOPWORDS mode, make sure there is a unique next content symbol
253 else {
254 uniquePrefixExtension = 0;
255 while ((*fst == *lst) && (*fst > LASTDELIMITER)) {
256 if (*fst >= firstContentSymbol) {
257 uniquePrefixExtension = 1;
258 break;
259 }
260 fst--;
261 lst--;
262 }
263 }
264 }
265
266 return uniquePrefixExtension;
267}
268
269
270// Expand a phrase with a unique suffix/prefix by 1 symbol
271//
272// Note that in STOPWORDS mode a "unique extension" means a unique
273// extension that ends on a content symbol. Although we may extend
274// such a phrase by 2 or more symbols, we will only extend it by a
275// single content symbol.
276
277int Phrase::expandUniqueSuffixExtensionByOne() {
278 assert(suffixFound);
279 assert(uniqueSuffixExtension == 1);
280
281 // in ANYPHRASE mode, expand the phrase by one symbol
282 if (phraseMode == ANYPHRASE) {
283 increaseSuffixLength(length + 1);
284 }
285 // in STOPWORDS mode, expand the phrase up to the next content symbol
286 else {
287 symbol *next = firstSuffix + length;
288 cellcount newlength = length + 1;
289
290 while (*next < firstContentSymbol) {
291 next++;
292 newlength++;
293 }
294 increaseSuffixLength(newlength);
295 }
296
297 uniqueSuffixExtension = -1;
298 return 0;
299}
300
301int Phrase::expandUniquePrefixExtensionByOne() {
302 assert(prefixFound);
303 assert(uniquePrefixExtension == 1);
304
305 // in ANYPHRASE mode, expand the phrase by one symbol
306 if (phraseMode == ANYPHRASE) {
307 increasePrefixLength(length + 1);
308 }
309 // in STOPWORDS mode, expand the phrase to the next content symbol
310 else {
311 symbol *next = firstPrefix - length;
312 cellcount newlength = length + 1;
313 while (*next < firstContentSymbol) {
314 next--;
315 newlength++;
316 }
317 increasePrefixLength(newlength);
318
319 }
320
321 uniquePrefixExtension = -1;
322 return 0;
323}
324
325
326// Find the phrase in a suffixArray
327//
328// Given a suffix array, find the first and last occurances of the phrase.
329// The begin and end parameters identify which cells to search (inclusive).
330
331int Phrase::findFirstAndLastSuffix() {
332 return findFirstAndLastSuffix(0, inputLength-1);
333}
334
335int Phrase::findFirstAndLastSuffix(cellindex begin, cellindex end) {
336
337 // First find *any* occurance of the phrase
338 assert(begin <= end);
339
340 // cout << "findFirstAndLastSuffix (" << begin << " - " << end << ") : " << toString() << endl;
341
342 // if we're only searching one cell, it is very easy
343 if (begin == end) {
344 assert(compareSuffix(suffixArray[begin], length) == 0);
345 firstSuffix = lastSuffix = suffixArray[begin];
346 firstSuffixIndex = lastSuffixIndex = begin;
347 suffixFrequency = 1;
348 suffixFound = 1;
349 return 0;
350 }
351
352 cellindex c;
353 int cmp;
354
355 do {
356 // cout << "Find anywhere in " << begin << " - " << end << endl;
357 c = (end + begin) / 2;
358 cmp = compareSuffix(suffixArray[c], length);
359 if (cmp == 1) {
360 // target phrase is lower than phrase at suffixArray[c]
361 begin = c + 1;
362 } else if (cmp == -1) {
363 // target phrase is higher than phrase at suffixArray[c]
364 end = c - 1;
365 }
366 } while (cmp != 0);
367
368 cellindex lastbegin = c;
369 cellindex lastend = end;
370
371 // Next find the first occurance of the phrase.
372 // We know that the first occurance must be between the
373 // current value of begin and the current value of c.
374 end = c;
375
376 do {
377 // cout << "first-searching with " << begin << " - " << end << endl;
378 if (begin == end) {
379 c = begin;
380 cmp = 0;
381 } else {
382 c = (begin + end) / 2;
383 cmp = compareSuffix(suffixArray[c], length);
384 if (cmp == 1) {
385 // target phrase is lower than phrase at suffixArray[c]
386 begin = c + 1;
387 } else {
388 assert(cmp == 0);
389 // target phrase is the same as the phrase at suffixArray[c]. However,
390 // to find the first occurance, suffixArray[c] must be the same as the
391 // phrase, but suffixArray[c-1] must be different.
392 cmp = compareSuffix(suffixArray[c-1], length);
393 if (cmp == 0) {
394 end = c - 1;
395 assert(end >= begin);
396 cmp = 1;
397 } else {
398 cmp = 0;
399 }
400 }
401 }
402 } while (cmp != 0);
403
404 // we have found the first location
405 firstSuffixIndex = c;
406 firstSuffix = suffixArray[c];
407
408 // Next find the last occurance of the phrase.
409 // We previously stored some bounds for its loccation.
410 begin = lastbegin;
411 end = lastend;
412
413 do {
414 // cout << "last-searching with " << begin << " - " << end << endl;
415 if (begin == end) {
416 c = begin;
417 cmp = 0;
418 } else {
419 c = (begin + end) / 2;
420 cmp = compareSuffix(suffixArray[c], length);
421 if (cmp == -1) {
422 // target phrase is greater than phrase at suffixArray[c]
423 end = c - 1;
424 } else {
425 assert(cmp == 0);
426 // target phrase is the same as the phrase at suffixArray[c]. However,
427 // to find the last occurance, one of two additional condiditons must be met:
428 // either c must be the very last cell in the array, or
429 // suffixArray[c+1] must be different from phrase
430 if (c < inputLength-1) {
431 cmp = compareSuffix(suffixArray[c+1], length);
432 if (cmp == 0) {
433 begin = c + 1;
434 cmp = -1;
435 } else {
436 cmp = 0;
437 }
438 }
439 }
440 }
441 } while (cmp != 0);
442
443 lastSuffixIndex = c;
444 lastSuffix = suffixArray[c];
445 suffixFrequency = lastSuffixIndex - firstSuffixIndex + 1;
446 suffixFound = 1;
447
448 return 0;
449}
450
451
452// Find the phrase in a prefix array
453//
454// Given a prefix array, find the first and last occurances of the phrase.
455// The begin and end parameters identify which cells to search (inclusive).
456
457int Phrase::findFirstAndLastPrefix() {
458 return findFirstAndLastPrefix(0, inputLength-1);
459}
460
461int Phrase::findFirstAndLastPrefix(cellindex begin, cellindex end) {
462
463 // First find *any* occurance of the phrase
464 assert(begin <= end);
465
466 // cout << "findFirstAndLastPrefix (" << begin << " - " << end << ") : " << toString() << endl;
467
468 // if we're only searching one cell, it is very easy
469 if (begin == end) {
470 assert(comparePrefix(prefixArray[begin], length) == 0);
471 firstPrefix = lastPrefix = prefixArray[begin];
472 firstPrefixIndex = lastPrefixIndex = begin;
473 prefixFrequency = 1;
474 prefixFound = 1;
475 return 0;
476 }
477
478 cellindex c;
479 int cmp;
480
481 do {
482 // cout << "Find anywhere in prefixarray " << begin << " - " << end << endl;
483 c = (end + begin) / 2;
484 cmp = comparePrefix(prefixArray[c], length);
485 if (cmp == 1) {
486 begin = c + 1;
487 } else if (cmp == -1) {
488 end = c - 1;
489 }
490 } while (cmp != 0);
491
492 cellindex lastbegin = c;
493 cellindex lastend = end;
494
495 // Next find the first occurance of the phrase.
496 // We know that the first occurance must be between the
497 // current value of begin and the cureent value of c.
498 end = c;
499
500 do {
501 // cout << "first-prefix-searching with " << begin << " - " << end << endl;
502 if (begin == end) {
503 c = begin;
504 cmp = 0;
505 } else {
506 c = (begin + end) / 2;
507 cmp = comparePrefix(prefixArray[c], length);
508 // cout << "cmp: " << cmp << ", c: " << c << endl;
509 if (cmp == 1) {
510 // target phrase is lower than phrase at prefixArray[c]
511 begin = c + 1;
512 } else {
513 assert(cmp == 0);
514 // target phrase is the same as the phrase at prefixArray[c]. However, to
515 // find the first occurance, one of two additional condiditons must be met:
516 // either c == the first cell in prefix array
517 // or prefixArray[c-1] != phrase.
518 if (c > 0) {
519 cmp = comparePrefix(prefixArray[c-1], length);
520 if (cmp == 0) {
521 end = c - 1;
522 assert(end >= begin);
523 cmp = 1;
524 } else {
525 cmp = 0;
526 }
527 }
528 }
529 }
530 } while (cmp != 0);
531
532 // we have found the first location
533 firstPrefixIndex = c;
534 firstPrefix = prefixArray[c];
535
536 // Next find the last occurance of the phrase.
537 // We previously stored some bounds for its loccation.
538 begin = lastbegin;
539 end = lastend;
540
541 do {
542 // cout << "last-prefix-searching with " << begin << " - " << end << endl;
543 if (begin == end) {
544 c = begin;
545 cmp = 0;
546 } else {
547 c = (begin + end) / 2;
548 cmp = comparePrefix(prefixArray[c], length);
549 if (cmp == -1) {
550 // target phrase is greater than phrase at prefixArray[c]
551 end = c - 1;
552 } else {
553 assert(cmp == 0);
554 // target phrase is the same as the phrase at prefixArray[c]. However,
555 // to find the last occurance, prefixArray[c] must be the same as the
556 // phrase, but prefixArray[c+1] must be different.
557 cmp = comparePrefix(prefixArray[c+1], length);
558 if (cmp == 0) {
559 begin = c + 1;
560 cmp = -1;
561 } else {
562 cmp = 0;
563 }
564 }
565 }
566 } while (cmp != 0);
567
568 lastPrefixIndex = c;
569 lastPrefix = prefixArray[c];
570 prefixFrequency = lastPrefixIndex - firstPrefixIndex + 1;
571 prefixFound = 1;
572
573 return 0;
574}
575
576
577// Ensure that the phrase has been found in the suffix & prefix arrays
578
579int Phrase::ensureSuffixFound() {
580 if (!suffixFound) {
581 findFirstAndLastSuffix();
582 }
583 return 0;
584}
585
586int Phrase::ensurePrefixFound() {
587 if (!prefixFound) {
588 findFirstAndLastPrefix();
589 }
590 return 0;
591}
592
593
594// Calculate a set of initial suffix/prefix candidates
595//
596// Expand the phrase to find the initial
597// set of candidates that may be extensions of that phrase,
598// and add them to the end of the results vector
599
600int Phrase::initialSuffixCandidates(vector<Phrase> &results) {
601
602 ensureSuffixFound();
603 Phrase next;
604 cellindex i = firstSuffixIndex;
605
606 // Find all the expansions of Phrase p
607 while (i <= lastSuffixIndex) {
608
609 // create new phrase exactly one longer than the current one
610 next = newPhraseShortestSuffixExpansion(i);
611
612 // If the expansion occurs more than once and is not delimited, expand it
613 if ((*(next.back) > LASTDELIMITER) && (next.suffixFrequency >= 2)) {
614 next.expandWhileUniqueSuffixExtension();
615 results.push_back(next);
616 }
617
618 // Move onto the next expansion
619 i = next.lastSuffixIndex + 1;
620
621 }
622 return 0;
623}
624
625
626int Phrase::initialPrefixCandidates(vector<Phrase> &results) {
627
628 ensurePrefixFound();
629 Phrase next;
630 cellindex i = firstPrefixIndex;
631
632 // Find all the expansions of Phrase p
633 while (i <= lastPrefixIndex) {
634
635 // create new phrase exactly one longer than the current one
636 next = newPhraseShortestPrefixExpansion(i);
637
638 // If the expansion occurs more than once and is not delimited, expand it
639 if ((*(next.forward) > LASTDELIMITER) && (next.prefixFrequency >= 2)) {
640 next.expandWhileUniquePrefixExtension();
641 results.push_back(next);
642 }
643
644 // Move onto the next expansion
645 i = next.lastPrefixIndex + 1;
646
647 }
648 return 0;
649}
650
651
652// Create a new phrase that is longer then the current one, but as short
653// as possible. In ANYPHRASE mode this means the new phrase is exactly
654// one symbol longer than the current one, but in STOPWORD mode the expansion
655// must introduce a new content symbol
656//
657// Note that we never expand across delimiters; if we encounter a delimiter
658// then this becomes the "end" symbol of the phrase. The calling function
659// should check for this condition.
660
661Phrase Phrase::newPhraseShortestSuffixExpansion(cellindex i) {
662
663 // create a new phrase exactly one symbol longer than the current one
664 Phrase p(suffixArray[i], length + 1, SUFFIX);
665
666 if (phraseMode == STOPWORDS) {
667 // in STOPWORDS mode we must introduce a new content symbol
668 while ((*(p.back) >= firstStopSymbol) &&
669 (*(p.back) <= lastStopSymbol)) {
670 p.increaseSuffixLength(p.length + 1);
671 }
672 }
673
674 p.findFirstAndLastSuffix(i, lastSuffixIndex);
675 return p;
676}
677
678Phrase Phrase::newPhraseShortestPrefixExpansion(cellindex i) {
679
680 // create a new phrase exactly one symbol longer than the current one
681 Phrase p(prefixArray[i], length + 1, PREFIX);
682
683 if (phraseMode == STOPWORDS) {
684 // in STOPWORDS mode we must introduce a new content symbol
685 while ((*(p.forward) >= firstStopSymbol) &&
686 (*(p.forward) <= lastStopSymbol)) {
687 p.increasePrefixLength(p.length + 1);
688 }
689 }
690
691 p.findFirstAndLastPrefix(i, lastPrefixIndex);
692 return p;
693}
694
695
696// Expand a phrase until it no longer has a unique suffix extension
697//
698// If the phrase only occurs once in the suffix array, do nothing.
699//
700// If we are in stopwords mode then we expand if there is a unique
701// suffix expansion and this expansion is a content word. We do this
702// with the content_len variable, which holds the length of the longest
703// phrase ending in a content word.
704
705int Phrase::expandWhileUniqueSuffixExtension() {
706 assert(forward);
707 assert(suffixFound);
708
709 // if the phrase occurs only once, do nothing
710 if (suffixFrequency < 2) {
711 return 0;
712 }
713
714 // count the length over which the cells are the same
715 // and no delimiter is crossed.
716 symbol *fst = firstSuffix;
717 symbol *lst = lastSuffix;
718 cellindex len = 0;
719 cellindex content_len = 0;
720
721 // Calculate the length over which the phrases match
722 while((*fst == *lst) && (*fst > LASTDELIMITER)) {
723 len++;
724 if (*fst > lastStopSymbol) content_len = len;
725 fst++;
726 lst++;
727 }
728
729 // in ANYPHRASE mode, expand the phrase for as long as possible
730 if (phraseMode == ANYPHRASE) {
731 if (len > length) {
732 increaseSuffixLength(len);
733 }
734 }
735 // in STOPWORDS mode, expand only as far as the last contentword
736 else {
737 if (content_len > length) {
738 increaseSuffixLength(content_len);
739 }
740 }
741
742 uniqueSuffixExtension = 0;
743 return 0;
744}
745
746int Phrase::expandWhileUniquePrefixExtension() {
747 assert(prefixFound);
748
749 // if the phrase occurs only once, do nothing
750 if (prefixFrequency < 2) {
751 return 0;
752 }
753
754 // count the length over which the cells are the same
755 // and no delimiter is crossed.
756 symbol *fst = firstPrefix;
757 symbol *lst = lastPrefix;
758 cellindex len = 0;
759 cellindex content_len = 0;
760
761 // Calculate the length over which the phrases match
762 while((*fst == *lst) && (*fst > LASTDELIMITER)) {
763 len++;
764 if (*fst > lastStopSymbol) content_len = len;
765 fst--;
766 lst--;
767 }
768
769 // in ANYPHRASE mode, expand the phrase for as long as possible
770 if (phraseMode == ANYPHRASE) {
771 if (len > length) {
772 increasePrefixLength(len);
773 }
774 }
775 // in STOPWORDS mode, expand only as far as the last contentword
776 else {
777 if (content_len > length) {
778 increasePrefixLength(content_len);
779 }
780 }
781
782 uniquePrefixExtension = 0;
783 return 0;
784}
785
786
787
788
789
790
791
792
793
794
795
796
797// Compare the length of two phrases
798//
799// Given two phrases, return true if the first is shorter/longer,
800// otherwise return false. For use in various sort functions.
801
802bool isShorter(Phrase p1, Phrase p2) {
803 if (p1.length < p2.length) {
804 return true;
805 }
806 return false;
807}
808
809bool isLonger(Phrase p1, Phrase p2) {
810 if (p1.length > p2.length) {
811 return true;
812 }
813 return false;
814}
815
816
Note: See TracBrowser for help on using the repository browser.