Context Navigation

source: trunk/gsdl/src/phind/generate/phrase.cpp@ 2487

Last change on this file since 2487 was 2487, checked in by sjboddie, 23 years ago
Changes to get phind working under windows
Property svn:keywords set to `Author Date Id Revision`
File size: 19.7 KB

Line
1	/**********************************************************************
2	*
3	* phrase.cpp -- implementation of the phrase object used by suffix.cpp
4	*
5	* Copyright 2000 Gordon W. Paynter
6	* Copyright 2000 The New Zealand Digital Library Project
7	*
8	* A component of the Greenstone digital library software
9	* from the New Zealand Digital Library Project at the
10	* University of Waikato, New Zealand.
11	*
12	* This program is free software; you can redistribute it and/or modify
13	* it under the terms of the GNU General Public License as published by
14	* the Free Software Foundation; either version 2 of the License, or
15	* (at your option) any later version.
16	*
17	* This program is distributed in the hope that it will be useful,
18	* but WITHOUT ANY WARRANTY; without even the implied warranty of
19	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20	* GNU General Public License for more details.
21	*
22	* You should have received a copy of the GNU General Public License
23	* along with this program; if not, write to the Free Software
24	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25	*
26	*********************************************************************/
27
28	#include <assert.h>
29	#include <stdio.h>
30
31	#if defined(GSDL_USE_STL_H)
32	# include <vector.h>
33	#else
34	# include <vector>
35	#endif
36
37	#if defined(GSDL_USE_IOS_H)
38	# include <iostream.h>
39	#else
40	# include <iostream>
41	#endif
42
43	#include "suffix.h"
44	#include "phrase.h"
45
46	// Phrase constructor functions
47
48	Phrase::Phrase(symbol *words, cellcount size, int direction) {
49	empty();
50	length = size;
51
52	if (direction == SUFFIX) {
53	forward = words;
54	back = forward + size - 1;
55	} else {
56	back = words;
57	forward = back - size + 1;
58	}
59	}
60
61	Phrase::Phrase() {
62	empty();
63	}
64
65
66	// Empty the contents of a phrase
67
68	int Phrase::empty() {
69
70	forward = back = NULL;
71	length = 0;
72
73	suffixFound = prefixFound = 0;
74	firstSuffix = firstPrefix = NULL;
75	lastSuffix = lastPrefix = NULL;
76	suffixFrequency = prefixFrequency = 0;
77	uniqueSuffixExtension = uniquePrefixExtension = -1;
78
79	return 0;
80	}
81
82	// Remove the details of the location where a suffix/prefix
83	// is found in the array.
84
85	int Phrase::clearSuffix() {
86	suffixFound = 0;
87	firstSuffix = NULL;
88	lastSuffix = NULL;
89	suffixFrequency = 0;
90	uniqueSuffixExtension = -1;
91	return 0;
92	}
93
94	int Phrase::clearPrefix() {
95	prefixFound = 0;
96	firstPrefix = NULL;
97	lastPrefix = NULL;
98	prefixFrequency = 0;
99	uniquePrefixExtension = -1;
100	return 0;
101	}
102
103	// Increase the length of s phrase
104	//
105	// We increase the length of a string "in place". When we expand
106	// the suffix, however, we invalidate the prefix data, and vice-versa.
107
108	int Phrase::increaseSuffixLength(cellcount l) {
109	length = l;
110	clearPrefix();
111	back = forward + l - 1;
112	return 0;
113	}
114
115	int Phrase::increasePrefixLength(cellcount l) {
116	length = l;
117	clearSuffix();
118	forward = back - l + 1;
119	return 0;
120	}
121
122
123	// Convert the phrase to a string
124	char *Phrase::toString() {
125
126	assert(forward);
127
128	char *str;
129	str = new char[length*20];
130	symbol *s = forward;
131	sprintf(str, "s%d", *s++);
132
133	for (cellcount i = 1; i < length; i++) {
134	sprintf(str, "%s s%d", str, *s++);
135	}
136
137	return str;
138	}
139
140
141	// Compare a phrase to an another array of symbols
142	//
143	// Given an array of words with a specified length,
144	// compare those words to this phrase.
145	//
146	// Return 0 if the two phrases are the same over length length
147	// Return 1 if the phrase is "greater" than tyhe words, or
148	// Return -1 if the phrase is less.
149
150	int Phrase::compareSuffix(symbol *words, cellcount length) {
151	assert(forward);
152	symbol *p = forward;
153
154	for (cellcount i = 0; i < length; i++) {
155	if (p > words) {
156	return 1;
157	} else if (p < words) {
158	return -1;
159	} else {
160	*p++;
161	*words++;
162	}
163	}
164
165	return 0;
166
167	}
168
169	int Phrase::comparePrefix(symbol *words, cellcount length) {
170
171	assert(back);
172	symbol *p = back;
173
174	for (cellcount i = 0; i < length; i++) {
175	if (p > words) {
176	return -1;
177	} else if (p < words) {
178	return 1;
179	} else {
180	*p--;
181	*words--;
182	}
183	}
184
185	return 0;
186
187	}
188
189
190	// Does the phrase have a unique suffix/prefix extension?
191	//
192	// For suffix:
193	// If uniqueSuffixExtensions is 1, then phrase has some unique expansion.
194	// If it is 0 then it does not. If it is -1 then we don't know, and have
195	// to calculate it from the suffix array. Same goes for prefixes.
196
197	int Phrase::hasUniqueSuffixExtension() {
198
199	// if we don't know, then work it out
200	if (uniqueSuffixExtension == -1) {
201
202	ensureSuffixFound();
203
204	// pointers to the phrase's first and last occurances in te suffixArray
205	symbol *fst = firstSuffix + length;
206	symbol *lst = lastSuffix + length;
207
208	// in ANYPHRASE mode, simply check the next symbol
209	if (phraseMode == ANYPHRASE) {
210	if ((fst == lst) && (*fst > LASTDELIMITER)) {
211	uniqueSuffixExtension = 1;
212	} else {
213	uniqueSuffixExtension = 0;
214	}
215	}
216	// in STOPWORDS mode, make sure there is a unique next content symbol
217	else {
218	uniqueSuffixExtension = 0;
219	while ((fst == lst) && (*fst > LASTDELIMITER)) {
220	if (*fst >= firstContentSymbol) {
221	uniqueSuffixExtension = 1;
222	break;
223	}
224	fst++;
225	lst++;
226	}
227	}
228	}
229
230	return uniqueSuffixExtension;
231	}
232
233	int Phrase::hasUniquePrefixExtension() {
234
235	// if we don't know, then work it out
236	if (uniquePrefixExtension == -1) {
237
238	ensurePrefixFound();
239
240	// pointers to the phrase's first and last occurances in the prefixArray
241	symbol *fst = firstPrefix - length;
242	symbol *lst = lastPrefix - length;
243
244	// in ANYPHRASE mode, simply check the next symbol
245	if (phraseMode == ANYPHRASE) {
246	if ((fst == lst) && (*fst > LASTDELIMITER)) {
247	uniquePrefixExtension = 1;
248	} else {
249	uniquePrefixExtension = 0;
250	}
251	}
252	// in STOPWORDS mode, make sure there is a unique next content symbol
253	else {
254	uniquePrefixExtension = 0;
255	while ((fst == lst) && (*fst > LASTDELIMITER)) {
256	if (*fst >= firstContentSymbol) {
257	uniquePrefixExtension = 1;
258	break;
259	}
260	fst--;
261	lst--;
262	}
263	}
264	}
265
266	return uniquePrefixExtension;
267	}
268
269
270	// Expand a phrase with a unique suffix/prefix by 1 symbol
271	//
272	// Note that in STOPWORDS mode a "unique extension" means a unique
273	// extension that ends on a content symbol. Although we may extend
274	// such a phrase by 2 or more symbols, we will only extend it by a
275	// single content symbol.
276
277	int Phrase::expandUniqueSuffixExtensionByOne() {
278	assert(suffixFound);
279	assert(uniqueSuffixExtension == 1);
280
281	// in ANYPHRASE mode, expand the phrase by one symbol
282	if (phraseMode == ANYPHRASE) {
283	increaseSuffixLength(length + 1);
284	}
285	// in STOPWORDS mode, expand the phrase up to the next content symbol
286	else {
287	symbol *next = firstSuffix + length;
288	cellcount newlength = length + 1;
289
290	while (*next < firstContentSymbol) {
291	next++;
292	newlength++;
293	}
294	increaseSuffixLength(newlength);
295	}
296
297	uniqueSuffixExtension = -1;
298	return 0;
299	}
300
301	int Phrase::expandUniquePrefixExtensionByOne() {
302	assert(prefixFound);
303	assert(uniquePrefixExtension == 1);
304
305	// in ANYPHRASE mode, expand the phrase by one symbol
306	if (phraseMode == ANYPHRASE) {
307	increasePrefixLength(length + 1);
308	}
309	// in STOPWORDS mode, expand the phrase to the next content symbol
310	else {
311	symbol *next = firstPrefix - length;
312	cellcount newlength = length + 1;
313	while (*next < firstContentSymbol) {
314	next--;
315	newlength++;
316	}
317	increasePrefixLength(newlength);
318
319	}
320
321	uniquePrefixExtension = -1;
322	return 0;
323	}
324
325
326	// Find the phrase in a suffixArray
327	//
328	// Given a suffix array, find the first and last occurances of the phrase.
329	// The begin and end parameters identify which cells to search (inclusive).
330
331	int Phrase::findFirstAndLastSuffix() {
332	return findFirstAndLastSuffix(0, inputLength-1);
333	}
334
335	int Phrase::findFirstAndLastSuffix(cellindex begin, cellindex end) {
336
337	// First find any occurance of the phrase
338	assert(begin <= end);
339
340	// cout << "findFirstAndLastSuffix (" << begin << " - " << end << ") : " << toString() << endl;
341
342	// if we're only searching one cell, it is very easy
343	if (begin == end) {
344	assert(compareSuffix(suffixArray[begin], length) == 0);
345	firstSuffix = lastSuffix = suffixArray[begin];
346	firstSuffixIndex = lastSuffixIndex = begin;
347	suffixFrequency = 1;
348	suffixFound = 1;
349	return 0;
350	}
351
352	cellindex c;
353	int cmp;
354
355	do {
356	// cout << "Find anywhere in " << begin << " - " << end << endl;
357	c = (end + begin) / 2;
358	cmp = compareSuffix(suffixArray[c], length);
359	if (cmp == 1) {
360	// target phrase is lower than phrase at suffixArray[c]
361	begin = c + 1;
362	} else if (cmp == -1) {
363	// target phrase is higher than phrase at suffixArray[c]
364	end = c - 1;
365	}
366	} while (cmp != 0);
367
368	cellindex lastbegin = c;
369	cellindex lastend = end;
370
371	// Next find the first occurance of the phrase.
372	// We know that the first occurance must be between the
373	// current value of begin and the current value of c.
374	end = c;
375
376	do {
377	// cout << "first-searching with " << begin << " - " << end << endl;
378	if (begin == end) {
379	c = begin;
380	cmp = 0;
381	} else {
382	c = (begin + end) / 2;
383	cmp = compareSuffix(suffixArray[c], length);
384	if (cmp == 1) {
385	// target phrase is lower than phrase at suffixArray[c]
386	begin = c + 1;
387	} else {
388	assert(cmp == 0);
389	// target phrase is the same as the phrase at suffixArray[c]. However,
390	// to find the first occurance, suffixArray[c] must be the same as the
391	// phrase, but suffixArray[c-1] must be different.
392	cmp = compareSuffix(suffixArray[c-1], length);
393	if (cmp == 0) {
394	end = c - 1;
395	assert(end >= begin);
396	cmp = 1;
397	} else {
398	cmp = 0;
399	}
400	}
401	}
402	} while (cmp != 0);
403
404	// we have found the first location
405	firstSuffixIndex = c;
406	firstSuffix = suffixArray[c];
407
408	// Next find the last occurance of the phrase.
409	// We previously stored some bounds for its loccation.
410	begin = lastbegin;
411	end = lastend;
412
413	do {
414	// cout << "last-searching with " << begin << " - " << end << endl;
415	if (begin == end) {
416	c = begin;
417	cmp = 0;
418	} else {
419	c = (begin + end) / 2;
420	cmp = compareSuffix(suffixArray[c], length);
421	if (cmp == -1) {
422	// target phrase is greater than phrase at suffixArray[c]
423	end = c - 1;
424	} else {
425	assert(cmp == 0);
426	// target phrase is the same as the phrase at suffixArray[c]. However,
427	// to find the last occurance, one of two additional condiditons must be met:
428	// either c must be the very last cell in the array, or
429	// suffixArray[c+1] must be different from phrase
430	if (c < inputLength-1) {
431	cmp = compareSuffix(suffixArray[c+1], length);
432	if (cmp == 0) {
433	begin = c + 1;
434	cmp = -1;
435	} else {
436	cmp = 0;
437	}
438	}
439	}
440	}
441	} while (cmp != 0);
442
443	lastSuffixIndex = c;
444	lastSuffix = suffixArray[c];
445	suffixFrequency = lastSuffixIndex - firstSuffixIndex + 1;
446	suffixFound = 1;
447
448	return 0;
449	}
450
451
452	// Find the phrase in a prefix array
453	//
454	// Given a prefix array, find the first and last occurances of the phrase.
455	// The begin and end parameters identify which cells to search (inclusive).
456
457	int Phrase::findFirstAndLastPrefix() {
458	return findFirstAndLastPrefix(0, inputLength-1);
459	}
460
461	int Phrase::findFirstAndLastPrefix(cellindex begin, cellindex end) {
462
463	// First find any occurance of the phrase
464	assert(begin <= end);
465
466	// cout << "findFirstAndLastPrefix (" << begin << " - " << end << ") : " << toString() << endl;
467
468	// if we're only searching one cell, it is very easy
469	if (begin == end) {
470	assert(comparePrefix(prefixArray[begin], length) == 0);
471	firstPrefix = lastPrefix = prefixArray[begin];
472	firstPrefixIndex = lastPrefixIndex = begin;
473	prefixFrequency = 1;
474	prefixFound = 1;
475	return 0;
476	}
477
478	cellindex c;
479	int cmp;
480
481	do {
482	// cout << "Find anywhere in prefixarray " << begin << " - " << end << endl;
483	c = (end + begin) / 2;
484	cmp = comparePrefix(prefixArray[c], length);
485	if (cmp == 1) {
486	begin = c + 1;
487	} else if (cmp == -1) {
488	end = c - 1;
489	}
490	} while (cmp != 0);
491
492	cellindex lastbegin = c;
493	cellindex lastend = end;
494
495	// Next find the first occurance of the phrase.
496	// We know that the first occurance must be between the
497	// current value of begin and the cureent value of c.
498	end = c;
499
500	do {
501	// cout << "first-prefix-searching with " << begin << " - " << end << endl;
502	if (begin == end) {
503	c = begin;
504	cmp = 0;
505	} else {
506	c = (begin + end) / 2;
507	cmp = comparePrefix(prefixArray[c], length);
508	// cout << "cmp: " << cmp << ", c: " << c << endl;
509	if (cmp == 1) {
510	// target phrase is lower than phrase at prefixArray[c]
511	begin = c + 1;
512	} else {
513	assert(cmp == 0);
514	// target phrase is the same as the phrase at prefixArray[c]. However, to
515	// find the first occurance, one of two additional condiditons must be met:
516	// either c == the first cell in prefix array
517	// or prefixArray[c-1] != phrase.
518	if (c > 0) {
519	cmp = comparePrefix(prefixArray[c-1], length);
520	if (cmp == 0) {
521	end = c - 1;
522	assert(end >= begin);
523	cmp = 1;
524	} else {
525	cmp = 0;
526	}
527	}
528	}
529	}
530	} while (cmp != 0);
531
532	// we have found the first location
533	firstPrefixIndex = c;
534	firstPrefix = prefixArray[c];
535
536	// Next find the last occurance of the phrase.
537	// We previously stored some bounds for its loccation.
538	begin = lastbegin;
539	end = lastend;
540
541	do {
542	// cout << "last-prefix-searching with " << begin << " - " << end << endl;
543	if (begin == end) {
544	c = begin;
545	cmp = 0;
546	} else {
547	c = (begin + end) / 2;
548	cmp = comparePrefix(prefixArray[c], length);
549	if (cmp == -1) {
550	// target phrase is greater than phrase at prefixArray[c]
551	end = c - 1;
552	} else {
553	assert(cmp == 0);
554	// target phrase is the same as the phrase at prefixArray[c]. However,
555	// to find the last occurance, prefixArray[c] must be the same as the
556	// phrase, but prefixArray[c+1] must be different.
557	cmp = comparePrefix(prefixArray[c+1], length);
558	if (cmp == 0) {
559	begin = c + 1;
560	cmp = -1;
561	} else {
562	cmp = 0;
563	}
564	}
565	}
566	} while (cmp != 0);
567
568	lastPrefixIndex = c;
569	lastPrefix = prefixArray[c];
570	prefixFrequency = lastPrefixIndex - firstPrefixIndex + 1;
571	prefixFound = 1;
572
573	return 0;
574	}
575
576
577	// Ensure that the phrase has been found in the suffix & prefix arrays
578
579	int Phrase::ensureSuffixFound() {
580	if (!suffixFound) {
581	findFirstAndLastSuffix();
582	}
583	return 0;
584	}
585
586	int Phrase::ensurePrefixFound() {
587	if (!prefixFound) {
588	findFirstAndLastPrefix();
589	}
590	return 0;
591	}
592
593
594	// Calculate a set of initial suffix/prefix candidates
595	//
596	// Expand the phrase to find the initial
597	// set of candidates that may be extensions of that phrase,
598	// and add them to the end of the results vector
599
600	int Phrase::initialSuffixCandidates(vector<Phrase> &results) {
601
602	ensureSuffixFound();
603	Phrase next;
604	cellindex i = firstSuffixIndex;
605
606	// Find all the expansions of Phrase p
607	while (i <= lastSuffixIndex) {
608
609	// create new phrase exactly one longer than the current one
610	next = newPhraseShortestSuffixExpansion(i);
611
612	// If the expansion occurs more than once and is not delimited, expand it
613	if ((*(next.back) > LASTDELIMITER) && (next.suffixFrequency >= 2)) {
614	next.expandWhileUniqueSuffixExtension();
615	results.push_back(next);
616	}
617
618	// Move onto the next expansion
619	i = next.lastSuffixIndex + 1;
620
621	}
622	return 0;
623	}
624
625
626	int Phrase::initialPrefixCandidates(vector<Phrase> &results) {
627
628	ensurePrefixFound();
629	Phrase next;
630	cellindex i = firstPrefixIndex;
631
632	// Find all the expansions of Phrase p
633	while (i <= lastPrefixIndex) {
634
635	// create new phrase exactly one longer than the current one
636	next = newPhraseShortestPrefixExpansion(i);
637
638	// If the expansion occurs more than once and is not delimited, expand it
639	if ((*(next.forward) > LASTDELIMITER) && (next.prefixFrequency >= 2)) {
640	next.expandWhileUniquePrefixExtension();
641	results.push_back(next);
642	}
643
644	// Move onto the next expansion
645	i = next.lastPrefixIndex + 1;
646
647	}
648	return 0;
649	}
650
651
652	// Create a new phrase that is longer then the current one, but as short
653	// as possible. In ANYPHRASE mode this means the new phrase is exactly
654	// one symbol longer than the current one, but in STOPWORD mode the expansion
655	// must introduce a new content symbol
656	//
657	// Note that we never expand across delimiters; if we encounter a delimiter
658	// then this becomes the "end" symbol of the phrase. The calling function
659	// should check for this condition.
660
661	Phrase Phrase::newPhraseShortestSuffixExpansion(cellindex i) {
662
663	// create a new phrase exactly one symbol longer than the current one
664	Phrase p(suffixArray[i], length + 1, SUFFIX);
665
666	if (phraseMode == STOPWORDS) {
667	// in STOPWORDS mode we must introduce a new content symbol
668	while ((*(p.back) >= firstStopSymbol) &&
669	(*(p.back) <= lastStopSymbol)) {
670	p.increaseSuffixLength(p.length + 1);
671	}
672	}
673
674	p.findFirstAndLastSuffix(i, lastSuffixIndex);
675	return p;
676	}
677
678	Phrase Phrase::newPhraseShortestPrefixExpansion(cellindex i) {
679
680	// create a new phrase exactly one symbol longer than the current one
681	Phrase p(prefixArray[i], length + 1, PREFIX);
682
683	if (phraseMode == STOPWORDS) {
684	// in STOPWORDS mode we must introduce a new content symbol
685	while ((*(p.forward) >= firstStopSymbol) &&
686	(*(p.forward) <= lastStopSymbol)) {
687	p.increasePrefixLength(p.length + 1);
688	}
689	}
690
691	p.findFirstAndLastPrefix(i, lastPrefixIndex);
692	return p;
693	}
694
695
696	// Expand a phrase until it no longer has a unique suffix extension
697	//
698	// If the phrase only occurs once in the suffix array, do nothing.
699	//
700	// If we are in stopwords mode then we expand if there is a unique
701	// suffix expansion and this expansion is a content word. We do this
702	// with the content_len variable, which holds the length of the longest
703	// phrase ending in a content word.
704
705	int Phrase::expandWhileUniqueSuffixExtension() {
706	assert(forward);
707	assert(suffixFound);
708
709	// if the phrase occurs only once, do nothing
710	if (suffixFrequency < 2) {
711	return 0;
712	}
713
714	// count the length over which the cells are the same
715	// and no delimiter is crossed.
716	symbol *fst = firstSuffix;
717	symbol *lst = lastSuffix;
718	cellindex len = 0;
719	cellindex content_len = 0;
720
721	// Calculate the length over which the phrases match
722	while((fst == lst) && (*fst > LASTDELIMITER)) {
723	len++;
724	if (*fst > lastStopSymbol) content_len = len;
725	fst++;
726	lst++;
727	}
728
729	// in ANYPHRASE mode, expand the phrase for as long as possible
730	if (phraseMode == ANYPHRASE) {
731	if (len > length) {
732	increaseSuffixLength(len);
733	}
734	}
735	// in STOPWORDS mode, expand only as far as the last contentword
736	else {
737	if (content_len > length) {
738	increaseSuffixLength(content_len);
739	}
740	}
741
742	uniqueSuffixExtension = 0;
743	return 0;
744	}
745
746	int Phrase::expandWhileUniquePrefixExtension() {
747	assert(prefixFound);
748
749	// if the phrase occurs only once, do nothing
750	if (prefixFrequency < 2) {
751	return 0;
752	}
753
754	// count the length over which the cells are the same
755	// and no delimiter is crossed.
756	symbol *fst = firstPrefix;
757	symbol *lst = lastPrefix;
758	cellindex len = 0;
759	cellindex content_len = 0;
760
761	// Calculate the length over which the phrases match
762	while((fst == lst) && (*fst > LASTDELIMITER)) {
763	len++;
764	if (*fst > lastStopSymbol) content_len = len;
765	fst--;
766	lst--;
767	}
768
769	// in ANYPHRASE mode, expand the phrase for as long as possible
770	if (phraseMode == ANYPHRASE) {
771	if (len > length) {
772	increasePrefixLength(len);
773	}
774	}
775	// in STOPWORDS mode, expand only as far as the last contentword
776	else {
777	if (content_len > length) {
778	increasePrefixLength(content_len);
779	}
780	}
781
782	uniquePrefixExtension = 0;
783	return 0;
784	}
785
786
787
788
789
790
791
792
793
794
795
796
797	// Compare the length of two phrases
798	//
799	// Given two phrases, return true if the first is shorter/longer,
800	// otherwise return false. For use in various sort functions.
801
802	bool isShorter(Phrase p1, Phrase p2) {
803	if (p1.length < p2.length) {
804	return true;
805	}
806	return false;
807	}
808
809	bool isLonger(Phrase p1, Phrase p2) {
810	if (p1.length > p2.length) {
811	return true;
812	}
813	return false;
814	}
815
816

Note: See TracBrowser for help on using the repository browser.

Download in other formats: