Context Navigation

source: trunk/gsdl/src/phind/generate/phrase.cpp@ 1873

Last change on this file since 1873 was 1873, checked in by paynter, 23 years ago
Fixed a bug iin the Phrase extraction alogrithm that had the "candidates" in the GetMinimalExpansions function sorted backwards.
Property svn:keywords set to `Author Date Id Revision`
File size: 19.6 KB

Line
1	/**********************************************************************
2	*
3	* phrase.cpp -- implementation of the phrase object used by suffix.cpp
4	*
5	* Copyright 2000 Gordon W. Paynter
6	* Copyright 2000 The New Zealand Digital Library Project
7	*
8	* A component of the Greenstone digital library software
9	* from the New Zealand Digital Library Project at the
10	* University of Waikato, New Zealand.
11	*
12	* This program is free software; you can redistribute it and/or modify
13	* it under the terms of the GNU General Public License as published by
14	* the Free Software Foundation; either version 2 of the License, or
15	* (at your option) any later version.
16	*
17	* This program is distributed in the hope that it will be useful,
18	* but WITHOUT ANY WARRANTY; without even the implied warranty of
19	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20	* GNU General Public License for more details.
21	*
22	* You should have received a copy of the GNU General Public License
23	* along with this program; if not, write to the Free Software
24	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25	*
26	*********************************************************************/
27
28	#include <iostream.h>
29	#include <assert.h>
30	#include <vector.h>
31	#include <stdio.h>
32
33	#include "suffix.h"
34	#include "phrase.h"
35
36	// Phrase constructor functions
37
38	Phrase::Phrase(symbol *words, cellcount size, int direction) {
39	empty();
40	length = size;
41
42	if (direction == SUFFIX) {
43	forward = words;
44	back = forward + size - 1;
45	} else {
46	back = words;
47	forward = back - size + 1;
48	}
49	}
50
51	Phrase::Phrase() {
52	empty();
53	}
54
55
56	// Empty the contents of a phrase
57
58	int Phrase::empty() {
59
60	forward = back = NULL;
61	length = 0;
62
63	suffixFound = prefixFound = 0;
64	firstSuffix = firstPrefix = NULL;
65	lastSuffix = lastPrefix = NULL;
66	suffixFrequency = prefixFrequency = 0;
67	uniqueSuffixExtension = uniquePrefixExtension = -1;
68
69	return 0;
70	}
71
72	// Remove the details of the location where a suffix/prefix
73	// is found in the array.
74
75	int Phrase::clearSuffix() {
76	suffixFound = 0;
77	firstSuffix = NULL;
78	lastSuffix = NULL;
79	suffixFrequency = 0;
80	uniqueSuffixExtension = -1;
81	return 0;
82	}
83
84	int Phrase::clearPrefix() {
85	prefixFound = 0;
86	firstPrefix = NULL;
87	lastPrefix = NULL;
88	prefixFrequency = 0;
89	uniquePrefixExtension = -1;
90	return 0;
91	}
92
93	// Increase the length of s phrase
94	//
95	// We increase the length of a string "in place". When we expand
96	// the suffix, however, we invalidate the prefix data, and vice-versa.
97
98	int Phrase::increaseSuffixLength(cellcount l) {
99	length = l;
100	clearPrefix();
101	back = forward + l - 1;
102	return 0;
103	}
104
105	int Phrase::increasePrefixLength(cellcount l) {
106	length = l;
107	clearSuffix();
108	forward = back - l + 1;
109	return 0;
110	}
111
112
113	// Convert the phrase to a string
114	char *Phrase::toString() {
115
116	assert(forward);
117
118	char *str;
119	str = new char[length*20];
120	symbol *s = forward;
121	sprintf(str, "s%d", *s++);
122
123	for (cellcount i = 1; i < length; i++) {
124	sprintf(str, "%s s%d", str, *s++);
125	}
126
127	return str;
128	}
129
130
131	// Compare a phrase to an another array of symbols
132	//
133	// Given an array of words with a specified length,
134	// compare those words to this phrase.
135	//
136	// Return 0 if the two phrases are the same over length length
137	// Return 1 if the phrase is "greater" than tyhe words, or
138	// Return -1 if the phrase is less.
139
140	int Phrase::compareSuffix(symbol *words, cellcount length) {
141	assert(forward);
142	symbol *p = forward;
143
144	for (cellcount i = 0; i < length; i++) {
145	if (p > words) {
146	return 1;
147	} else if (p < words) {
148	return -1;
149	} else {
150	*p++;
151	*words++;
152	}
153	}
154
155	return 0;
156
157	}
158
159	int Phrase::comparePrefix(symbol *words, cellcount length) {
160
161	assert(back);
162	symbol *p = back;
163
164	for (cellcount i = 0; i < length; i++) {
165	if (p > words) {
166	return -1;
167	} else if (p < words) {
168	return 1;
169	} else {
170	*p--;
171	*words--;
172	}
173	}
174
175	return 0;
176
177	}
178
179
180	// Does the phrase have a unique suffix/prefix extension?
181	//
182	// For suffix:
183	// If uniqueSuffixExtensions is 1, then phrase has some unique expansion.
184	// If it is 0 then it does not. If it is -1 then we don't know, and have
185	// to calculate it from the suffix array. Same goes for prefixes.
186
187	int Phrase::hasUniqueSuffixExtension() {
188
189	// if we don't know, then work it out
190	if (uniqueSuffixExtension == -1) {
191
192	ensureSuffixFound();
193
194	// pointers to the phrase's first and last occurances in te suffixArray
195	symbol *fst = firstSuffix + length;
196	symbol *lst = lastSuffix + length;
197
198	// in ANYPHRASE mode, simply check the next symbol
199	if (phraseMode == ANYPHRASE) {
200	if ((fst == lst) && (*fst > LASTDELIMITER)) {
201	uniqueSuffixExtension = 1;
202	} else {
203	uniqueSuffixExtension = 0;
204	}
205	}
206	// in STOPWORDS mode, make sure there is a unique next content symbol
207	else {
208	uniqueSuffixExtension = 0;
209	while ((fst == lst) && (*fst > LASTDELIMITER)) {
210	if (*fst >= firstContentSymbol) {
211	uniqueSuffixExtension = 1;
212	break;
213	}
214	fst++;
215	lst++;
216	}
217	}
218	}
219
220	return uniqueSuffixExtension;
221	}
222
223	int Phrase::hasUniquePrefixExtension() {
224
225	// if we don't know, then work it out
226	if (uniquePrefixExtension == -1) {
227
228	ensurePrefixFound();
229
230	// pointers to the phrase's first and last occurances in the prefixArray
231	symbol *fst = firstPrefix - length;
232	symbol *lst = lastPrefix - length;
233
234	// in ANYPHRASE mode, simply check the next symbol
235	if (phraseMode == ANYPHRASE) {
236	if ((fst == lst) && (*fst > LASTDELIMITER)) {
237	uniquePrefixExtension = 1;
238	} else {
239	uniquePrefixExtension = 0;
240	}
241	}
242	// in STOPWORDS mode, make sure there is a unique next content symbol
243	else {
244	uniquePrefixExtension = 0;
245	while ((fst == lst) && (*fst > LASTDELIMITER)) {
246	if (*fst >= firstContentSymbol) {
247	uniquePrefixExtension = 1;
248	break;
249	}
250	fst--;
251	lst--;
252	}
253	}
254	}
255
256	return uniquePrefixExtension;
257	}
258
259
260	// Expand a phrase with a unique suffix/prefix by 1 symbol
261	//
262	// Note that in STOPWORDS mode a "unique extension" means a unique
263	// extension that ends on a content symbol. Although we may extend
264	// such a phrase by 2 or more symbols, we will only extend it by a
265	// single content symbol.
266
267	int Phrase::expandUniqueSuffixExtensionByOne() {
268	assert(suffixFound);
269	assert(uniqueSuffixExtension == 1);
270
271	// in ANYPHRASE mode, expand the phrase by one symbol
272	if (phraseMode == ANYPHRASE) {
273	increaseSuffixLength(length + 1);
274	}
275	// in STOPWORDS mode, expand the phrase up to the next content symbol
276	else {
277	symbol *next = firstSuffix + length;
278	cellcount newlength = length + 1;
279
280	while (*next < firstContentSymbol) {
281	next++;
282	newlength++;
283	}
284	increaseSuffixLength(newlength);
285	}
286
287	uniqueSuffixExtension = -1;
288	return 0;
289	}
290
291	int Phrase::expandUniquePrefixExtensionByOne() {
292	assert(prefixFound);
293	assert(uniquePrefixExtension == 1);
294
295	// in ANYPHRASE mode, expand the phrase by one symbol
296	if (phraseMode == ANYPHRASE) {
297	increasePrefixLength(length + 1);
298	}
299	// in STOPWORDS mode, expand the phrase to the next content symbol
300	else {
301	symbol *next = firstPrefix - length;
302	cellcount newlength = length + 1;
303	while (*next < firstContentSymbol) {
304	next--;
305	newlength++;
306	}
307	increasePrefixLength(newlength);
308
309	}
310
311	uniquePrefixExtension = -1;
312	return 0;
313	}
314
315
316	// Find the phrase in a suffixArray
317	//
318	// Given a suffix array, find the first and last occurances of the phrase.
319	// The begin and end parameters identify which cells to search (inclusive).
320
321	int Phrase::findFirstAndLastSuffix() {
322	return findFirstAndLastSuffix(0, inputLength-1);
323	}
324
325	int Phrase::findFirstAndLastSuffix(cellindex begin, cellindex end) {
326
327	// First find any occurance of the phrase
328	assert(begin <= end);
329
330	// cout << "findFirstAndLastSuffix (" << begin << " - " << end << ") : " << toString() << endl;
331
332	// if we're only searching one cell, it is very easy
333	if (begin == end) {
334	assert(compareSuffix(suffixArray[begin], length) == 0);
335	firstSuffix = lastSuffix = suffixArray[begin];
336	firstSuffixIndex = lastSuffixIndex = begin;
337	suffixFrequency = 1;
338	suffixFound = 1;
339	return 0;
340	}
341
342	cellindex c;
343	int cmp;
344
345	do {
346	// cout << "Find anywhere in " << begin << " - " << end << endl;
347	c = (end + begin) / 2;
348	cmp = compareSuffix(suffixArray[c], length);
349	if (cmp == 1) {
350	// target phrase is lower than phrase at suffixArray[c]
351	begin = c + 1;
352	} else if (cmp == -1) {
353	// target phrase is higher than phrase at suffixArray[c]
354	end = c - 1;
355	}
356	} while (cmp != 0);
357
358	cellindex lastbegin = c;
359	cellindex lastend = end;
360
361	// Next find the first occurance of the phrase.
362	// We know that the first occurance must be between the
363	// current value of begin and the current value of c.
364	end = c;
365
366	do {
367	// cout << "first-searching with " << begin << " - " << end << endl;
368	if (begin == end) {
369	c = begin;
370	cmp = 0;
371	} else {
372	c = (begin + end) / 2;
373	cmp = compareSuffix(suffixArray[c], length);
374	if (cmp == 1) {
375	// target phrase is lower than phrase at suffixArray[c]
376	begin = c + 1;
377	} else {
378	assert(cmp == 0);
379	// target phrase is the same as the phrase at suffixArray[c]. However,
380	// to find the first occurance, suffixArray[c] must be the same as the
381	// phrase, but suffixArray[c-1] must be different.
382	cmp = compareSuffix(suffixArray[c-1], length);
383	if (cmp == 0) {
384	end = c - 1;
385	assert(end >= begin);
386	cmp = 1;
387	} else {
388	cmp = 0;
389	}
390	}
391	}
392	} while (cmp != 0);
393
394	// we have found the first location
395	firstSuffixIndex = c;
396	firstSuffix = suffixArray[c];
397
398	// Next find the last occurance of the phrase.
399	// We previously stored some bounds for its loccation.
400	begin = lastbegin;
401	end = lastend;
402
403	do {
404	// cout << "last-searching with " << begin << " - " << end << endl;
405	if (begin == end) {
406	c = begin;
407	cmp = 0;
408	} else {
409	c = (begin + end) / 2;
410	cmp = compareSuffix(suffixArray[c], length);
411	if (cmp == -1) {
412	// target phrase is greater than phrase at suffixArray[c]
413	end = c - 1;
414	} else {
415	assert(cmp == 0);
416	// target phrase is the same as the phrase at suffixArray[c]. However,
417	// to find the last occurance, one of two additional condiditons must be met:
418	// either c must be the very last cell in the array, or
419	// suffixArray[c+1] must be different from phrase
420	if (c < inputLength-1) {
421	cmp = compareSuffix(suffixArray[c+1], length);
422	if (cmp == 0) {
423	begin = c + 1;
424	cmp = -1;
425	} else {
426	cmp = 0;
427	}
428	}
429	}
430	}
431	} while (cmp != 0);
432
433	lastSuffixIndex = c;
434	lastSuffix = suffixArray[c];
435	suffixFrequency = lastSuffixIndex - firstSuffixIndex + 1;
436	suffixFound = 1;
437
438	return 0;
439	}
440
441
442	// Find the phrase in a prefix array
443	//
444	// Given a prefix array, find the first and last occurances of the phrase.
445	// The begin and end parameters identify which cells to search (inclusive).
446
447	int Phrase::findFirstAndLastPrefix() {
448	return findFirstAndLastPrefix(0, inputLength-1);
449	}
450
451	int Phrase::findFirstAndLastPrefix(cellindex begin, cellindex end) {
452
453	// First find any occurance of the phrase
454	assert(begin <= end);
455
456	// cout << "findFirstAndLastPrefix (" << begin << " - " << end << ") : " << toString() << endl;
457
458	// if we're only searching one cell, it is very easy
459	if (begin == end) {
460	assert(comparePrefix(prefixArray[begin], length) == 0);
461	firstPrefix = lastPrefix = prefixArray[begin];
462	firstPrefixIndex = lastPrefixIndex = begin;
463	prefixFrequency = 1;
464	prefixFound = 1;
465	return 0;
466	}
467
468	cellindex c;
469	int cmp;
470
471	do {
472	// cout << "Find anywhere in prefixarray " << begin << " - " << end << endl;
473	c = (end + begin) / 2;
474	cmp = comparePrefix(prefixArray[c], length);
475	if (cmp == 1) {
476	begin = c + 1;
477	} else if (cmp == -1) {
478	end = c - 1;
479	}
480	} while (cmp != 0);
481
482	cellindex lastbegin = c;
483	cellindex lastend = end;
484
485	// Next find the first occurance of the phrase.
486	// We know that the first occurance must be between the
487	// current value of begin and the cureent value of c.
488	end = c;
489
490	do {
491	// cout << "first-prefix-searching with " << begin << " - " << end << endl;
492	if (begin == end) {
493	c = begin;
494	cmp = 0;
495	} else {
496	c = (begin + end) / 2;
497	cmp = comparePrefix(prefixArray[c], length);
498	// cout << "cmp: " << cmp << ", c: " << c << endl;
499	if (cmp == 1) {
500	// target phrase is lower than phrase at prefixArray[c]
501	begin = c + 1;
502	} else {
503	assert(cmp == 0);
504	// target phrase is the same as the phrase at prefixArray[c]. However, to
505	// find the first occurance, one of two additional condiditons must be met:
506	// either c == the first cell in prefix array
507	// or prefixArray[c-1] != phrase.
508	if (c > 0) {
509	cmp = comparePrefix(prefixArray[c-1], length);
510	if (cmp == 0) {
511	end = c - 1;
512	assert(end >= begin);
513	cmp = 1;
514	} else {
515	cmp = 0;
516	}
517	}
518	}
519	}
520	} while (cmp != 0);
521
522	// we have found the first location
523	firstPrefixIndex = c;
524	firstPrefix = prefixArray[c];
525
526	// Next find the last occurance of the phrase.
527	// We previously stored some bounds for its loccation.
528	begin = lastbegin;
529	end = lastend;
530
531	do {
532	// cout << "last-prefix-searching with " << begin << " - " << end << endl;
533	if (begin == end) {
534	c = begin;
535	cmp = 0;
536	} else {
537	c = (begin + end) / 2;
538	cmp = comparePrefix(prefixArray[c], length);
539	if (cmp == -1) {
540	// target phrase is greater than phrase at prefixArray[c]
541	end = c - 1;
542	} else {
543	assert(cmp == 0);
544	// target phrase is the same as the phrase at prefixArray[c]. However,
545	// to find the last occurance, prefixArray[c] must be the same as the
546	// phrase, but prefixArray[c+1] must be different.
547	cmp = comparePrefix(prefixArray[c+1], length);
548	if (cmp == 0) {
549	begin = c + 1;
550	cmp = -1;
551	} else {
552	cmp = 0;
553	}
554	}
555	}
556	} while (cmp != 0);
557
558	lastPrefixIndex = c;
559	lastPrefix = prefixArray[c];
560	prefixFrequency = lastPrefixIndex - firstPrefixIndex + 1;
561	prefixFound = 1;
562
563	return 0;
564	}
565
566
567	// Ensure that the phrase has been found in the suffix & prefix arrays
568
569	int Phrase::ensureSuffixFound() {
570	if (!suffixFound) {
571	findFirstAndLastSuffix();
572	}
573	return 0;
574	}
575
576	int Phrase::ensurePrefixFound() {
577	if (!prefixFound) {
578	findFirstAndLastPrefix();
579	}
580	return 0;
581	}
582
583
584	// Calculate a set of initial suffix/prefix candidates
585	//
586	// Expand the phrase to find the initial
587	// set of candidates that may be extensions of that phrase,
588	// and add them to the end of the results vector
589
590	int Phrase::initialSuffixCandidates(vector<Phrase> &results) {
591
592	ensureSuffixFound();
593	Phrase next;
594	cellindex i = firstSuffixIndex;
595
596	// Find all the expansions of Phrase p
597	while (i <= lastSuffixIndex) {
598
599	// create new phrase exactly one longer than the current one
600	next = newPhraseShortestSuffixExpansion(i);
601
602	// If the expansion occurs more than once and is not delimited, expand it
603	if ((*(next.back) > LASTDELIMITER) && (next.suffixFrequency >= 2)) {
604	next.expandWhileUniqueSuffixExtension();
605	results.push_back(next);
606	}
607
608	// Move onto the next expansion
609	i = next.lastSuffixIndex + 1;
610
611	}
612	return 0;
613	}
614
615
616	int Phrase::initialPrefixCandidates(vector<Phrase> &results) {
617
618	ensurePrefixFound();
619	Phrase next;
620	cellindex i = firstPrefixIndex;
621
622	// Find all the expansions of Phrase p
623	while (i <= lastPrefixIndex) {
624
625	// create new phrase exactly one longer than the current one
626	next = newPhraseShortestPrefixExpansion(i);
627
628	// If the expansion occurs more than once and is not delimited, expand it
629	if ((*(next.forward) > LASTDELIMITER) && (next.prefixFrequency >= 2)) {
630	next.expandWhileUniquePrefixExtension();
631	results.push_back(next);
632	}
633
634	// Move onto the next expansion
635	i = next.lastPrefixIndex + 1;
636
637	}
638	return 0;
639	}
640
641
642	// Create a new phrase that is longer then the current one, but as short
643	// as possible. In ANYPHRASE mode this means the new phrase is exactly
644	// one symbol longer than the current one, but in STOPWORD mode the expansion
645	// must introduce a new content symbol
646	//
647	// Note that we never expand across delimiters; if we encounter a delimiter
648	// then this becomes the "end" symbol of the phrase. The calling function
649	// should check for this condition.
650
651	Phrase Phrase::newPhraseShortestSuffixExpansion(cellindex i) {
652
653	// create a new phrase exactly one symbol longer than the current one
654	Phrase p(suffixArray[i], length + 1, SUFFIX);
655
656	if (phraseMode == STOPWORDS) {
657	// in STOPWORDS mode we must introduce a new content symbol
658	while ((*(p.back) >= firstStopSymbol) &&
659	(*(p.back) <= lastStopSymbol)) {
660	p.increaseSuffixLength(p.length + 1);
661	}
662	}
663
664	p.findFirstAndLastSuffix(i, lastSuffixIndex);
665	return p;
666	}
667
668	Phrase Phrase::newPhraseShortestPrefixExpansion(cellindex i) {
669
670	// create a new phrase exactly one symbol longer than the current one
671	Phrase p(prefixArray[i], length + 1, PREFIX);
672
673	if (phraseMode == STOPWORDS) {
674	// in STOPWORDS mode we must introduce a new content symbol
675	while ((*(p.forward) >= firstStopSymbol) &&
676	(*(p.forward) <= lastStopSymbol)) {
677	p.increasePrefixLength(p.length + 1);
678	}
679	}
680
681	p.findFirstAndLastPrefix(i, lastPrefixIndex);
682	return p;
683	}
684
685
686	// Expand a phrase until it no longer has a unique suffix extension
687	//
688	// If the phrase only occurs once in the suffix array, do nothing.
689	//
690	// If we are in stopwords mode then we expand if there is a unique
691	// suffix expansion and this expansion is a content word. We do this
692	// with the content_len variable, which holds the length of the longest
693	// phrase ending in a content word.
694
695	int Phrase::expandWhileUniqueSuffixExtension() {
696	assert(forward);
697	assert(suffixFound);
698
699	// if the phrase occurs only once, do nothing
700	if (suffixFrequency < 2) {
701	return 0;
702	}
703
704	// count the length over which the cells are the same
705	// and no delimiter is crossed.
706	symbol *fst = firstSuffix;
707	symbol *lst = lastSuffix;
708	cellindex len = 0;
709	cellindex content_len = 0;
710
711	// Calculate the length over which the phrases match
712	while((fst == lst) && (*fst > LASTDELIMITER)) {
713	len++;
714	if (*fst > lastStopSymbol) content_len = len;
715	fst++;
716	lst++;
717	}
718
719	// in ANYPHRASE mode, expand the phrase for as long as possible
720	if (phraseMode == ANYPHRASE) {
721	if (len > length) {
722	increaseSuffixLength(len);
723	}
724	}
725	// in STOPWORDS mode, expand only as far as the last contentword
726	else {
727	if (content_len > length) {
728	increaseSuffixLength(content_len);
729	}
730	}
731
732	uniqueSuffixExtension = 0;
733	return 0;
734	}
735
736	int Phrase::expandWhileUniquePrefixExtension() {
737	assert(prefixFound);
738
739	// if the phrase occurs only once, do nothing
740	if (prefixFrequency < 2) {
741	return 0;
742	}
743
744	// count the length over which the cells are the same
745	// and no delimiter is crossed.
746	symbol *fst = firstPrefix;
747	symbol *lst = lastPrefix;
748	cellindex len = 0;
749	cellindex content_len = 0;
750
751	// Calculate the length over which the phrases match
752	while((fst == lst) && (*fst > LASTDELIMITER)) {
753	len++;
754	if (*fst > lastStopSymbol) content_len = len;
755	fst--;
756	lst--;
757	}
758
759	// in ANYPHRASE mode, expand the phrase for as long as possible
760	if (phraseMode == ANYPHRASE) {
761	if (len > length) {
762	increasePrefixLength(len);
763	}
764	}
765	// in STOPWORDS mode, expand only as far as the last contentword
766	else {
767	if (content_len > length) {
768	increasePrefixLength(content_len);
769	}
770	}
771
772	uniquePrefixExtension = 0;
773	return 0;
774	}
775
776
777
778
779
780
781
782
783
784
785
786
787	// Compare the length of two phrases
788	//
789	// Given two phrases, return true if the first is shorter/longer,
790	// otherwise return false. For use in various sort functions.
791
792	bool isShorter(Phrase p1, Phrase p2) {
793	if (p1.length < p2.length) {
794	return true;
795	}
796	return false;
797	}
798
799	bool isLonger(Phrase p1, Phrase p2) {
800	if (p1.length > p2.length) {
801	return true;
802	}
803	return false;
804	}
805
806

Note: See TracBrowser for help on using the repository browser.

Download in other formats: