source: trunk/gsdl/src/phind/generate/phrase.cpp@ 1873

Last change on this file since 1873 was 1873, checked in by paynter, 23 years ago

Fixed a bug iin the Phrase extraction alogrithm that had the
"candidates" in the GetMinimalExpansions function sorted backwards.

  • Property svn:keywords set to Author Date Id Revision
File size: 19.6 KB
Line 
1/**********************************************************************
2 *
3 * phrase.cpp -- implementation of the phrase object used by suffix.cpp
4 *
5 * Copyright 2000 Gordon W. Paynter
6 * Copyright 2000 The New Zealand Digital Library Project
7 *
8 * A component of the Greenstone digital library software
9 * from the New Zealand Digital Library Project at the
10 * University of Waikato, New Zealand.
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 *
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, write to the Free Software
24 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 *
26 *********************************************************************/
27
28#include <iostream.h>
29#include <assert.h>
30#include <vector.h>
31#include <stdio.h>
32
33#include "suffix.h"
34#include "phrase.h"
35
36// Phrase constructor functions
37
38Phrase::Phrase(symbol *words, cellcount size, int direction) {
39 empty();
40 length = size;
41
42 if (direction == SUFFIX) {
43 forward = words;
44 back = forward + size - 1;
45 } else {
46 back = words;
47 forward = back - size + 1;
48 }
49}
50
51Phrase::Phrase() {
52 empty();
53}
54
55
56// Empty the contents of a phrase
57
58int Phrase::empty() {
59
60 forward = back = NULL;
61 length = 0;
62
63 suffixFound = prefixFound = 0;
64 firstSuffix = firstPrefix = NULL;
65 lastSuffix = lastPrefix = NULL;
66 suffixFrequency = prefixFrequency = 0;
67 uniqueSuffixExtension = uniquePrefixExtension = -1;
68
69 return 0;
70}
71
72// Remove the details of the location where a suffix/prefix
73// is found in the array.
74
75int Phrase::clearSuffix() {
76 suffixFound = 0;
77 firstSuffix = NULL;
78 lastSuffix = NULL;
79 suffixFrequency = 0;
80 uniqueSuffixExtension = -1;
81 return 0;
82}
83
84int Phrase::clearPrefix() {
85 prefixFound = 0;
86 firstPrefix = NULL;
87 lastPrefix = NULL;
88 prefixFrequency = 0;
89 uniquePrefixExtension = -1;
90 return 0;
91}
92
93// Increase the length of s phrase
94//
95// We increase the length of a string "in place". When we expand
96// the suffix, however, we invalidate the prefix data, and vice-versa.
97
98int Phrase::increaseSuffixLength(cellcount l) {
99 length = l;
100 clearPrefix();
101 back = forward + l - 1;
102 return 0;
103}
104
105int Phrase::increasePrefixLength(cellcount l) {
106 length = l;
107 clearSuffix();
108 forward = back - l + 1;
109 return 0;
110}
111
112
113// Convert the phrase to a string
114char *Phrase::toString() {
115
116 assert(forward);
117
118 char *str;
119 str = new char[length*20];
120 symbol *s = forward;
121 sprintf(str, "s%d", *s++);
122
123 for (cellcount i = 1; i < length; i++) {
124 sprintf(str, "%s s%d", str, *s++);
125 }
126
127 return str;
128}
129
130
131// Compare a phrase to an another array of symbols
132//
133// Given an array of words with a specified length,
134// compare those words to this phrase.
135//
136// Return 0 if the two phrases are the same over length length
137// Return 1 if the phrase is "greater" than tyhe words, or
138// Return -1 if the phrase is less.
139
140int Phrase::compareSuffix(symbol *words, cellcount length) {
141 assert(forward);
142 symbol *p = forward;
143
144 for (cellcount i = 0; i < length; i++) {
145 if (*p > *words) {
146 return 1;
147 } else if (*p < *words) {
148 return -1;
149 } else {
150 *p++;
151 *words++;
152 }
153 }
154
155 return 0;
156
157}
158
159int Phrase::comparePrefix(symbol *words, cellcount length) {
160
161 assert(back);
162 symbol *p = back;
163
164 for (cellcount i = 0; i < length; i++) {
165 if (*p > *words) {
166 return -1;
167 } else if (*p < *words) {
168 return 1;
169 } else {
170 *p--;
171 *words--;
172 }
173 }
174
175 return 0;
176
177}
178
179
180// Does the phrase have a unique suffix/prefix extension?
181//
182// For suffix:
183// If uniqueSuffixExtensions is 1, then phrase has some unique expansion.
184// If it is 0 then it does not. If it is -1 then we don't know, and have
185// to calculate it from the suffix array. Same goes for prefixes.
186
187int Phrase::hasUniqueSuffixExtension() {
188
189 // if we don't know, then work it out
190 if (uniqueSuffixExtension == -1) {
191
192 ensureSuffixFound();
193
194 // pointers to the phrase's first and last occurances in te suffixArray
195 symbol *fst = firstSuffix + length;
196 symbol *lst = lastSuffix + length;
197
198 // in ANYPHRASE mode, simply check the next symbol
199 if (phraseMode == ANYPHRASE) {
200 if ((*fst == *lst) && (*fst > LASTDELIMITER)) {
201 uniqueSuffixExtension = 1;
202 } else {
203 uniqueSuffixExtension = 0;
204 }
205 }
206 // in STOPWORDS mode, make sure there is a unique next content symbol
207 else {
208 uniqueSuffixExtension = 0;
209 while ((*fst == *lst) && (*fst > LASTDELIMITER)) {
210 if (*fst >= firstContentSymbol) {
211 uniqueSuffixExtension = 1;
212 break;
213 }
214 fst++;
215 lst++;
216 }
217 }
218 }
219
220 return uniqueSuffixExtension;
221}
222
223int Phrase::hasUniquePrefixExtension() {
224
225 // if we don't know, then work it out
226 if (uniquePrefixExtension == -1) {
227
228 ensurePrefixFound();
229
230 // pointers to the phrase's first and last occurances in the prefixArray
231 symbol *fst = firstPrefix - length;
232 symbol *lst = lastPrefix - length;
233
234 // in ANYPHRASE mode, simply check the next symbol
235 if (phraseMode == ANYPHRASE) {
236 if ((*fst == *lst) && (*fst > LASTDELIMITER)) {
237 uniquePrefixExtension = 1;
238 } else {
239 uniquePrefixExtension = 0;
240 }
241 }
242 // in STOPWORDS mode, make sure there is a unique next content symbol
243 else {
244 uniquePrefixExtension = 0;
245 while ((*fst == *lst) && (*fst > LASTDELIMITER)) {
246 if (*fst >= firstContentSymbol) {
247 uniquePrefixExtension = 1;
248 break;
249 }
250 fst--;
251 lst--;
252 }
253 }
254 }
255
256 return uniquePrefixExtension;
257}
258
259
260// Expand a phrase with a unique suffix/prefix by 1 symbol
261//
262// Note that in STOPWORDS mode a "unique extension" means a unique
263// extension that ends on a content symbol. Although we may extend
264// such a phrase by 2 or more symbols, we will only extend it by a
265// single content symbol.
266
267int Phrase::expandUniqueSuffixExtensionByOne() {
268 assert(suffixFound);
269 assert(uniqueSuffixExtension == 1);
270
271 // in ANYPHRASE mode, expand the phrase by one symbol
272 if (phraseMode == ANYPHRASE) {
273 increaseSuffixLength(length + 1);
274 }
275 // in STOPWORDS mode, expand the phrase up to the next content symbol
276 else {
277 symbol *next = firstSuffix + length;
278 cellcount newlength = length + 1;
279
280 while (*next < firstContentSymbol) {
281 next++;
282 newlength++;
283 }
284 increaseSuffixLength(newlength);
285 }
286
287 uniqueSuffixExtension = -1;
288 return 0;
289}
290
291int Phrase::expandUniquePrefixExtensionByOne() {
292 assert(prefixFound);
293 assert(uniquePrefixExtension == 1);
294
295 // in ANYPHRASE mode, expand the phrase by one symbol
296 if (phraseMode == ANYPHRASE) {
297 increasePrefixLength(length + 1);
298 }
299 // in STOPWORDS mode, expand the phrase to the next content symbol
300 else {
301 symbol *next = firstPrefix - length;
302 cellcount newlength = length + 1;
303 while (*next < firstContentSymbol) {
304 next--;
305 newlength++;
306 }
307 increasePrefixLength(newlength);
308
309 }
310
311 uniquePrefixExtension = -1;
312 return 0;
313}
314
315
316// Find the phrase in a suffixArray
317//
318// Given a suffix array, find the first and last occurances of the phrase.
319// The begin and end parameters identify which cells to search (inclusive).
320
321int Phrase::findFirstAndLastSuffix() {
322 return findFirstAndLastSuffix(0, inputLength-1);
323}
324
325int Phrase::findFirstAndLastSuffix(cellindex begin, cellindex end) {
326
327 // First find *any* occurance of the phrase
328 assert(begin <= end);
329
330 // cout << "findFirstAndLastSuffix (" << begin << " - " << end << ") : " << toString() << endl;
331
332 // if we're only searching one cell, it is very easy
333 if (begin == end) {
334 assert(compareSuffix(suffixArray[begin], length) == 0);
335 firstSuffix = lastSuffix = suffixArray[begin];
336 firstSuffixIndex = lastSuffixIndex = begin;
337 suffixFrequency = 1;
338 suffixFound = 1;
339 return 0;
340 }
341
342 cellindex c;
343 int cmp;
344
345 do {
346 // cout << "Find anywhere in " << begin << " - " << end << endl;
347 c = (end + begin) / 2;
348 cmp = compareSuffix(suffixArray[c], length);
349 if (cmp == 1) {
350 // target phrase is lower than phrase at suffixArray[c]
351 begin = c + 1;
352 } else if (cmp == -1) {
353 // target phrase is higher than phrase at suffixArray[c]
354 end = c - 1;
355 }
356 } while (cmp != 0);
357
358 cellindex lastbegin = c;
359 cellindex lastend = end;
360
361 // Next find the first occurance of the phrase.
362 // We know that the first occurance must be between the
363 // current value of begin and the current value of c.
364 end = c;
365
366 do {
367 // cout << "first-searching with " << begin << " - " << end << endl;
368 if (begin == end) {
369 c = begin;
370 cmp = 0;
371 } else {
372 c = (begin + end) / 2;
373 cmp = compareSuffix(suffixArray[c], length);
374 if (cmp == 1) {
375 // target phrase is lower than phrase at suffixArray[c]
376 begin = c + 1;
377 } else {
378 assert(cmp == 0);
379 // target phrase is the same as the phrase at suffixArray[c]. However,
380 // to find the first occurance, suffixArray[c] must be the same as the
381 // phrase, but suffixArray[c-1] must be different.
382 cmp = compareSuffix(suffixArray[c-1], length);
383 if (cmp == 0) {
384 end = c - 1;
385 assert(end >= begin);
386 cmp = 1;
387 } else {
388 cmp = 0;
389 }
390 }
391 }
392 } while (cmp != 0);
393
394 // we have found the first location
395 firstSuffixIndex = c;
396 firstSuffix = suffixArray[c];
397
398 // Next find the last occurance of the phrase.
399 // We previously stored some bounds for its loccation.
400 begin = lastbegin;
401 end = lastend;
402
403 do {
404 // cout << "last-searching with " << begin << " - " << end << endl;
405 if (begin == end) {
406 c = begin;
407 cmp = 0;
408 } else {
409 c = (begin + end) / 2;
410 cmp = compareSuffix(suffixArray[c], length);
411 if (cmp == -1) {
412 // target phrase is greater than phrase at suffixArray[c]
413 end = c - 1;
414 } else {
415 assert(cmp == 0);
416 // target phrase is the same as the phrase at suffixArray[c]. However,
417 // to find the last occurance, one of two additional condiditons must be met:
418 // either c must be the very last cell in the array, or
419 // suffixArray[c+1] must be different from phrase
420 if (c < inputLength-1) {
421 cmp = compareSuffix(suffixArray[c+1], length);
422 if (cmp == 0) {
423 begin = c + 1;
424 cmp = -1;
425 } else {
426 cmp = 0;
427 }
428 }
429 }
430 }
431 } while (cmp != 0);
432
433 lastSuffixIndex = c;
434 lastSuffix = suffixArray[c];
435 suffixFrequency = lastSuffixIndex - firstSuffixIndex + 1;
436 suffixFound = 1;
437
438 return 0;
439}
440
441
442// Find the phrase in a prefix array
443//
444// Given a prefix array, find the first and last occurances of the phrase.
445// The begin and end parameters identify which cells to search (inclusive).
446
447int Phrase::findFirstAndLastPrefix() {
448 return findFirstAndLastPrefix(0, inputLength-1);
449}
450
451int Phrase::findFirstAndLastPrefix(cellindex begin, cellindex end) {
452
453 // First find *any* occurance of the phrase
454 assert(begin <= end);
455
456 // cout << "findFirstAndLastPrefix (" << begin << " - " << end << ") : " << toString() << endl;
457
458 // if we're only searching one cell, it is very easy
459 if (begin == end) {
460 assert(comparePrefix(prefixArray[begin], length) == 0);
461 firstPrefix = lastPrefix = prefixArray[begin];
462 firstPrefixIndex = lastPrefixIndex = begin;
463 prefixFrequency = 1;
464 prefixFound = 1;
465 return 0;
466 }
467
468 cellindex c;
469 int cmp;
470
471 do {
472 // cout << "Find anywhere in prefixarray " << begin << " - " << end << endl;
473 c = (end + begin) / 2;
474 cmp = comparePrefix(prefixArray[c], length);
475 if (cmp == 1) {
476 begin = c + 1;
477 } else if (cmp == -1) {
478 end = c - 1;
479 }
480 } while (cmp != 0);
481
482 cellindex lastbegin = c;
483 cellindex lastend = end;
484
485 // Next find the first occurance of the phrase.
486 // We know that the first occurance must be between the
487 // current value of begin and the cureent value of c.
488 end = c;
489
490 do {
491 // cout << "first-prefix-searching with " << begin << " - " << end << endl;
492 if (begin == end) {
493 c = begin;
494 cmp = 0;
495 } else {
496 c = (begin + end) / 2;
497 cmp = comparePrefix(prefixArray[c], length);
498 // cout << "cmp: " << cmp << ", c: " << c << endl;
499 if (cmp == 1) {
500 // target phrase is lower than phrase at prefixArray[c]
501 begin = c + 1;
502 } else {
503 assert(cmp == 0);
504 // target phrase is the same as the phrase at prefixArray[c]. However, to
505 // find the first occurance, one of two additional condiditons must be met:
506 // either c == the first cell in prefix array
507 // or prefixArray[c-1] != phrase.
508 if (c > 0) {
509 cmp = comparePrefix(prefixArray[c-1], length);
510 if (cmp == 0) {
511 end = c - 1;
512 assert(end >= begin);
513 cmp = 1;
514 } else {
515 cmp = 0;
516 }
517 }
518 }
519 }
520 } while (cmp != 0);
521
522 // we have found the first location
523 firstPrefixIndex = c;
524 firstPrefix = prefixArray[c];
525
526 // Next find the last occurance of the phrase.
527 // We previously stored some bounds for its loccation.
528 begin = lastbegin;
529 end = lastend;
530
531 do {
532 // cout << "last-prefix-searching with " << begin << " - " << end << endl;
533 if (begin == end) {
534 c = begin;
535 cmp = 0;
536 } else {
537 c = (begin + end) / 2;
538 cmp = comparePrefix(prefixArray[c], length);
539 if (cmp == -1) {
540 // target phrase is greater than phrase at prefixArray[c]
541 end = c - 1;
542 } else {
543 assert(cmp == 0);
544 // target phrase is the same as the phrase at prefixArray[c]. However,
545 // to find the last occurance, prefixArray[c] must be the same as the
546 // phrase, but prefixArray[c+1] must be different.
547 cmp = comparePrefix(prefixArray[c+1], length);
548 if (cmp == 0) {
549 begin = c + 1;
550 cmp = -1;
551 } else {
552 cmp = 0;
553 }
554 }
555 }
556 } while (cmp != 0);
557
558 lastPrefixIndex = c;
559 lastPrefix = prefixArray[c];
560 prefixFrequency = lastPrefixIndex - firstPrefixIndex + 1;
561 prefixFound = 1;
562
563 return 0;
564}
565
566
567// Ensure that the phrase has been found in the suffix & prefix arrays
568
569int Phrase::ensureSuffixFound() {
570 if (!suffixFound) {
571 findFirstAndLastSuffix();
572 }
573 return 0;
574}
575
576int Phrase::ensurePrefixFound() {
577 if (!prefixFound) {
578 findFirstAndLastPrefix();
579 }
580 return 0;
581}
582
583
584// Calculate a set of initial suffix/prefix candidates
585//
586// Expand the phrase to find the initial
587// set of candidates that may be extensions of that phrase,
588// and add them to the end of the results vector
589
590int Phrase::initialSuffixCandidates(vector<Phrase> &results) {
591
592 ensureSuffixFound();
593 Phrase next;
594 cellindex i = firstSuffixIndex;
595
596 // Find all the expansions of Phrase p
597 while (i <= lastSuffixIndex) {
598
599 // create new phrase exactly one longer than the current one
600 next = newPhraseShortestSuffixExpansion(i);
601
602 // If the expansion occurs more than once and is not delimited, expand it
603 if ((*(next.back) > LASTDELIMITER) && (next.suffixFrequency >= 2)) {
604 next.expandWhileUniqueSuffixExtension();
605 results.push_back(next);
606 }
607
608 // Move onto the next expansion
609 i = next.lastSuffixIndex + 1;
610
611 }
612 return 0;
613}
614
615
616int Phrase::initialPrefixCandidates(vector<Phrase> &results) {
617
618 ensurePrefixFound();
619 Phrase next;
620 cellindex i = firstPrefixIndex;
621
622 // Find all the expansions of Phrase p
623 while (i <= lastPrefixIndex) {
624
625 // create new phrase exactly one longer than the current one
626 next = newPhraseShortestPrefixExpansion(i);
627
628 // If the expansion occurs more than once and is not delimited, expand it
629 if ((*(next.forward) > LASTDELIMITER) && (next.prefixFrequency >= 2)) {
630 next.expandWhileUniquePrefixExtension();
631 results.push_back(next);
632 }
633
634 // Move onto the next expansion
635 i = next.lastPrefixIndex + 1;
636
637 }
638 return 0;
639}
640
641
642// Create a new phrase that is longer then the current one, but as short
643// as possible. In ANYPHRASE mode this means the new phrase is exactly
644// one symbol longer than the current one, but in STOPWORD mode the expansion
645// must introduce a new content symbol
646//
647// Note that we never expand across delimiters; if we encounter a delimiter
648// then this becomes the "end" symbol of the phrase. The calling function
649// should check for this condition.
650
651Phrase Phrase::newPhraseShortestSuffixExpansion(cellindex i) {
652
653 // create a new phrase exactly one symbol longer than the current one
654 Phrase p(suffixArray[i], length + 1, SUFFIX);
655
656 if (phraseMode == STOPWORDS) {
657 // in STOPWORDS mode we must introduce a new content symbol
658 while ((*(p.back) >= firstStopSymbol) &&
659 (*(p.back) <= lastStopSymbol)) {
660 p.increaseSuffixLength(p.length + 1);
661 }
662 }
663
664 p.findFirstAndLastSuffix(i, lastSuffixIndex);
665 return p;
666}
667
668Phrase Phrase::newPhraseShortestPrefixExpansion(cellindex i) {
669
670 // create a new phrase exactly one symbol longer than the current one
671 Phrase p(prefixArray[i], length + 1, PREFIX);
672
673 if (phraseMode == STOPWORDS) {
674 // in STOPWORDS mode we must introduce a new content symbol
675 while ((*(p.forward) >= firstStopSymbol) &&
676 (*(p.forward) <= lastStopSymbol)) {
677 p.increasePrefixLength(p.length + 1);
678 }
679 }
680
681 p.findFirstAndLastPrefix(i, lastPrefixIndex);
682 return p;
683}
684
685
686// Expand a phrase until it no longer has a unique suffix extension
687//
688// If the phrase only occurs once in the suffix array, do nothing.
689//
690// If we are in stopwords mode then we expand if there is a unique
691// suffix expansion and this expansion is a content word. We do this
692// with the content_len variable, which holds the length of the longest
693// phrase ending in a content word.
694
695int Phrase::expandWhileUniqueSuffixExtension() {
696 assert(forward);
697 assert(suffixFound);
698
699 // if the phrase occurs only once, do nothing
700 if (suffixFrequency < 2) {
701 return 0;
702 }
703
704 // count the length over which the cells are the same
705 // and no delimiter is crossed.
706 symbol *fst = firstSuffix;
707 symbol *lst = lastSuffix;
708 cellindex len = 0;
709 cellindex content_len = 0;
710
711 // Calculate the length over which the phrases match
712 while((*fst == *lst) && (*fst > LASTDELIMITER)) {
713 len++;
714 if (*fst > lastStopSymbol) content_len = len;
715 fst++;
716 lst++;
717 }
718
719 // in ANYPHRASE mode, expand the phrase for as long as possible
720 if (phraseMode == ANYPHRASE) {
721 if (len > length) {
722 increaseSuffixLength(len);
723 }
724 }
725 // in STOPWORDS mode, expand only as far as the last contentword
726 else {
727 if (content_len > length) {
728 increaseSuffixLength(content_len);
729 }
730 }
731
732 uniqueSuffixExtension = 0;
733 return 0;
734}
735
736int Phrase::expandWhileUniquePrefixExtension() {
737 assert(prefixFound);
738
739 // if the phrase occurs only once, do nothing
740 if (prefixFrequency < 2) {
741 return 0;
742 }
743
744 // count the length over which the cells are the same
745 // and no delimiter is crossed.
746 symbol *fst = firstPrefix;
747 symbol *lst = lastPrefix;
748 cellindex len = 0;
749 cellindex content_len = 0;
750
751 // Calculate the length over which the phrases match
752 while((*fst == *lst) && (*fst > LASTDELIMITER)) {
753 len++;
754 if (*fst > lastStopSymbol) content_len = len;
755 fst--;
756 lst--;
757 }
758
759 // in ANYPHRASE mode, expand the phrase for as long as possible
760 if (phraseMode == ANYPHRASE) {
761 if (len > length) {
762 increasePrefixLength(len);
763 }
764 }
765 // in STOPWORDS mode, expand only as far as the last contentword
766 else {
767 if (content_len > length) {
768 increasePrefixLength(content_len);
769 }
770 }
771
772 uniquePrefixExtension = 0;
773 return 0;
774}
775
776
777
778
779
780
781
782
783
784
785
786
787// Compare the length of two phrases
788//
789// Given two phrases, return true if the first is shorter/longer,
790// otherwise return false. For use in various sort functions.
791
792bool isShorter(Phrase p1, Phrase p2) {
793 if (p1.length < p2.length) {
794 return true;
795 }
796 return false;
797}
798
799bool isLonger(Phrase p1, Phrase p2) {
800 if (p1.length > p2.length) {
801 return true;
802 }
803 return false;
804}
805
806
Note: See TracBrowser for help on using the repository browser.