1 | /**********************************************************************
|
---|
2 | *
|
---|
3 | * phrase.cpp -- implementation of the phrase object used by suffix.cpp
|
---|
4 | *
|
---|
5 | * Copyright 2000 Gordon W. Paynter
|
---|
6 | * Copyright 2000 The New Zealand Digital Library Project
|
---|
7 | *
|
---|
8 | * A component of the Greenstone digital library software
|
---|
9 | * from the New Zealand Digital Library Project at the
|
---|
10 | * University of Waikato, New Zealand.
|
---|
11 | *
|
---|
12 | * This program is free software; you can redistribute it and/or modify
|
---|
13 | * it under the terms of the GNU General Public License as published by
|
---|
14 | * the Free Software Foundation; either version 2 of the License, or
|
---|
15 | * (at your option) any later version.
|
---|
16 | *
|
---|
17 | * This program is distributed in the hope that it will be useful,
|
---|
18 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
19 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
20 | * GNU General Public License for more details.
|
---|
21 | *
|
---|
22 | * You should have received a copy of the GNU General Public License
|
---|
23 | * along with this program; if not, write to the Free Software
|
---|
24 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
25 | *
|
---|
26 | *********************************************************************/
|
---|
27 |
|
---|
28 | #include <assert.h>
|
---|
29 | #include <stdio.h>
|
---|
30 |
|
---|
31 | #if defined(GSDL_USE_STL_H)
|
---|
32 | # include <vector.h>
|
---|
33 | #else
|
---|
34 | # include <vector>
|
---|
35 | #endif
|
---|
36 |
|
---|
37 | #if defined(GSDL_USE_IOS_H)
|
---|
38 | # include <iostream.h>
|
---|
39 | #else
|
---|
40 | # include <iostream>
|
---|
41 | #endif
|
---|
42 |
|
---|
43 | #include "suffix.h"
|
---|
44 | #include "phrase.h"
|
---|
45 |
|
---|
46 | // Phrase constructor functions
|
---|
47 |
|
---|
48 | Phrase::Phrase(symbol *words, cellcount size, int direction) {
|
---|
49 | empty();
|
---|
50 | length = size;
|
---|
51 |
|
---|
52 | if (direction == SUFFIX) {
|
---|
53 | forward = words;
|
---|
54 | back = forward + size - 1;
|
---|
55 | } else {
|
---|
56 | back = words;
|
---|
57 | forward = back - size + 1;
|
---|
58 | }
|
---|
59 | }
|
---|
60 |
|
---|
61 | Phrase::Phrase() {
|
---|
62 | empty();
|
---|
63 | }
|
---|
64 |
|
---|
65 |
|
---|
66 | // Empty the contents of a phrase
|
---|
67 |
|
---|
68 | int Phrase::empty() {
|
---|
69 |
|
---|
70 | forward = back = NULL;
|
---|
71 | length = 0;
|
---|
72 |
|
---|
73 | suffixFound = prefixFound = 0;
|
---|
74 | firstSuffix = firstPrefix = NULL;
|
---|
75 | lastSuffix = lastPrefix = NULL;
|
---|
76 | suffixFrequency = prefixFrequency = 0;
|
---|
77 | uniqueSuffixExtension = uniquePrefixExtension = -1;
|
---|
78 |
|
---|
79 | return 0;
|
---|
80 | }
|
---|
81 |
|
---|
82 | // Remove the details of the location where a suffix/prefix
|
---|
83 | // is found in the array.
|
---|
84 |
|
---|
85 | int Phrase::clearSuffix() {
|
---|
86 | suffixFound = 0;
|
---|
87 | firstSuffix = NULL;
|
---|
88 | lastSuffix = NULL;
|
---|
89 | suffixFrequency = 0;
|
---|
90 | uniqueSuffixExtension = -1;
|
---|
91 | return 0;
|
---|
92 | }
|
---|
93 |
|
---|
94 | int Phrase::clearPrefix() {
|
---|
95 | prefixFound = 0;
|
---|
96 | firstPrefix = NULL;
|
---|
97 | lastPrefix = NULL;
|
---|
98 | prefixFrequency = 0;
|
---|
99 | uniquePrefixExtension = -1;
|
---|
100 | return 0;
|
---|
101 | }
|
---|
102 |
|
---|
103 | // Increase the length of s phrase
|
---|
104 | //
|
---|
105 | // We increase the length of a string "in place". When we expand
|
---|
106 | // the suffix, however, we invalidate the prefix data, and vice-versa.
|
---|
107 |
|
---|
108 | int Phrase::increaseSuffixLength(cellcount l) {
|
---|
109 | length = l;
|
---|
110 | clearPrefix();
|
---|
111 | back = forward + l - 1;
|
---|
112 | return 0;
|
---|
113 | }
|
---|
114 |
|
---|
115 | int Phrase::increasePrefixLength(cellcount l) {
|
---|
116 | length = l;
|
---|
117 | clearSuffix();
|
---|
118 | forward = back - l + 1;
|
---|
119 | return 0;
|
---|
120 | }
|
---|
121 |
|
---|
122 |
|
---|
123 | // Convert the phrase to a string
|
---|
124 | char *Phrase::toString() {
|
---|
125 |
|
---|
126 | assert(forward);
|
---|
127 |
|
---|
128 | char *str;
|
---|
129 | str = new char[length*20];
|
---|
130 | symbol *s = forward;
|
---|
131 | sprintf(str, "s%d", *s++);
|
---|
132 |
|
---|
133 | for (cellcount i = 1; i < length; i++) {
|
---|
134 | sprintf(str, "%s s%d", str, *s++);
|
---|
135 | }
|
---|
136 |
|
---|
137 | return str;
|
---|
138 | }
|
---|
139 |
|
---|
140 |
|
---|
141 | // Compare a phrase to an another array of symbols
|
---|
142 | //
|
---|
143 | // Given an array of words with a specified length,
|
---|
144 | // compare those words to this phrase.
|
---|
145 | //
|
---|
146 | // Return 0 if the two phrases are the same over length length
|
---|
147 | // Return 1 if the phrase is "greater" than tyhe words, or
|
---|
148 | // Return -1 if the phrase is less.
|
---|
149 |
|
---|
150 | int Phrase::compareSuffix(symbol *words, cellcount length) {
|
---|
151 | assert(forward);
|
---|
152 | symbol *p = forward;
|
---|
153 |
|
---|
154 | for (cellcount i = 0; i < length; i++) {
|
---|
155 | if (*p > *words) {
|
---|
156 | return 1;
|
---|
157 | } else if (*p < *words) {
|
---|
158 | return -1;
|
---|
159 | } else {
|
---|
160 | *p++;
|
---|
161 | *words++;
|
---|
162 | }
|
---|
163 | }
|
---|
164 |
|
---|
165 | return 0;
|
---|
166 |
|
---|
167 | }
|
---|
168 |
|
---|
169 | int Phrase::comparePrefix(symbol *words, cellcount length) {
|
---|
170 |
|
---|
171 | assert(back);
|
---|
172 | symbol *p = back;
|
---|
173 |
|
---|
174 | for (cellcount i = 0; i < length; i++) {
|
---|
175 | if (*p > *words) {
|
---|
176 | return -1;
|
---|
177 | } else if (*p < *words) {
|
---|
178 | return 1;
|
---|
179 | } else {
|
---|
180 | *p--;
|
---|
181 | *words--;
|
---|
182 | }
|
---|
183 | }
|
---|
184 |
|
---|
185 | return 0;
|
---|
186 |
|
---|
187 | }
|
---|
188 |
|
---|
189 |
|
---|
190 | // Does the phrase have a unique suffix/prefix extension?
|
---|
191 | //
|
---|
192 | // For suffix:
|
---|
193 | // If uniqueSuffixExtensions is 1, then phrase has some unique expansion.
|
---|
194 | // If it is 0 then it does not. If it is -1 then we don't know, and have
|
---|
195 | // to calculate it from the suffix array. Same goes for prefixes.
|
---|
196 |
|
---|
197 | int Phrase::hasUniqueSuffixExtension() {
|
---|
198 |
|
---|
199 | // if we don't know, then work it out
|
---|
200 | if (uniqueSuffixExtension == -1) {
|
---|
201 |
|
---|
202 | ensureSuffixFound();
|
---|
203 |
|
---|
204 | // pointers to the phrase's first and last occurances in te suffixArray
|
---|
205 | symbol *fst = firstSuffix + length;
|
---|
206 | symbol *lst = lastSuffix + length;
|
---|
207 |
|
---|
208 | // in ANYPHRASE mode, simply check the next symbol
|
---|
209 | if (phraseMode == ANYPHRASE) {
|
---|
210 | if ((*fst == *lst) && (*fst > LASTDELIMITER)) {
|
---|
211 | uniqueSuffixExtension = 1;
|
---|
212 | } else {
|
---|
213 | uniqueSuffixExtension = 0;
|
---|
214 | }
|
---|
215 | }
|
---|
216 | // in STOPWORDS mode, make sure there is a unique next content symbol
|
---|
217 | else {
|
---|
218 | uniqueSuffixExtension = 0;
|
---|
219 | while ((*fst == *lst) && (*fst > LASTDELIMITER)) {
|
---|
220 | if (*fst >= firstContentSymbol) {
|
---|
221 | uniqueSuffixExtension = 1;
|
---|
222 | break;
|
---|
223 | }
|
---|
224 | fst++;
|
---|
225 | lst++;
|
---|
226 | }
|
---|
227 | }
|
---|
228 | }
|
---|
229 |
|
---|
230 | return uniqueSuffixExtension;
|
---|
231 | }
|
---|
232 |
|
---|
233 | int Phrase::hasUniquePrefixExtension() {
|
---|
234 |
|
---|
235 | // if we don't know, then work it out
|
---|
236 | if (uniquePrefixExtension == -1) {
|
---|
237 |
|
---|
238 | ensurePrefixFound();
|
---|
239 |
|
---|
240 | // pointers to the phrase's first and last occurances in the prefixArray
|
---|
241 | symbol *fst = firstPrefix - length;
|
---|
242 | symbol *lst = lastPrefix - length;
|
---|
243 |
|
---|
244 | // in ANYPHRASE mode, simply check the next symbol
|
---|
245 | if (phraseMode == ANYPHRASE) {
|
---|
246 | if ((*fst == *lst) && (*fst > LASTDELIMITER)) {
|
---|
247 | uniquePrefixExtension = 1;
|
---|
248 | } else {
|
---|
249 | uniquePrefixExtension = 0;
|
---|
250 | }
|
---|
251 | }
|
---|
252 | // in STOPWORDS mode, make sure there is a unique next content symbol
|
---|
253 | else {
|
---|
254 | uniquePrefixExtension = 0;
|
---|
255 | while ((*fst == *lst) && (*fst > LASTDELIMITER)) {
|
---|
256 | if (*fst >= firstContentSymbol) {
|
---|
257 | uniquePrefixExtension = 1;
|
---|
258 | break;
|
---|
259 | }
|
---|
260 | fst--;
|
---|
261 | lst--;
|
---|
262 | }
|
---|
263 | }
|
---|
264 | }
|
---|
265 |
|
---|
266 | return uniquePrefixExtension;
|
---|
267 | }
|
---|
268 |
|
---|
269 |
|
---|
270 | // Expand a phrase with a unique suffix/prefix by 1 symbol
|
---|
271 | //
|
---|
272 | // Note that in STOPWORDS mode a "unique extension" means a unique
|
---|
273 | // extension that ends on a content symbol. Although we may extend
|
---|
274 | // such a phrase by 2 or more symbols, we will only extend it by a
|
---|
275 | // single content symbol.
|
---|
276 |
|
---|
277 | int Phrase::expandUniqueSuffixExtensionByOne() {
|
---|
278 | assert(suffixFound);
|
---|
279 | assert(uniqueSuffixExtension == 1);
|
---|
280 |
|
---|
281 | // in ANYPHRASE mode, expand the phrase by one symbol
|
---|
282 | if (phraseMode == ANYPHRASE) {
|
---|
283 | increaseSuffixLength(length + 1);
|
---|
284 | }
|
---|
285 | // in STOPWORDS mode, expand the phrase up to the next content symbol
|
---|
286 | else {
|
---|
287 | symbol *next = firstSuffix + length;
|
---|
288 | cellcount newlength = length + 1;
|
---|
289 |
|
---|
290 | while (*next < firstContentSymbol) {
|
---|
291 | next++;
|
---|
292 | newlength++;
|
---|
293 | }
|
---|
294 | increaseSuffixLength(newlength);
|
---|
295 | }
|
---|
296 |
|
---|
297 | uniqueSuffixExtension = -1;
|
---|
298 | return 0;
|
---|
299 | }
|
---|
300 |
|
---|
301 | int Phrase::expandUniquePrefixExtensionByOne() {
|
---|
302 | assert(prefixFound);
|
---|
303 | assert(uniquePrefixExtension == 1);
|
---|
304 |
|
---|
305 | // in ANYPHRASE mode, expand the phrase by one symbol
|
---|
306 | if (phraseMode == ANYPHRASE) {
|
---|
307 | increasePrefixLength(length + 1);
|
---|
308 | }
|
---|
309 | // in STOPWORDS mode, expand the phrase to the next content symbol
|
---|
310 | else {
|
---|
311 | symbol *next = firstPrefix - length;
|
---|
312 | cellcount newlength = length + 1;
|
---|
313 | while (*next < firstContentSymbol) {
|
---|
314 | next--;
|
---|
315 | newlength++;
|
---|
316 | }
|
---|
317 | increasePrefixLength(newlength);
|
---|
318 |
|
---|
319 | }
|
---|
320 |
|
---|
321 | uniquePrefixExtension = -1;
|
---|
322 | return 0;
|
---|
323 | }
|
---|
324 |
|
---|
325 |
|
---|
326 | // Find the phrase in a suffixArray
|
---|
327 | //
|
---|
328 | // Given a suffix array, find the first and last occurances of the phrase.
|
---|
329 | // The begin and end parameters identify which cells to search (inclusive).
|
---|
330 |
|
---|
331 | int Phrase::findFirstAndLastSuffix() {
|
---|
332 | return findFirstAndLastSuffix(0, inputLength-1);
|
---|
333 | }
|
---|
334 |
|
---|
335 | int Phrase::findFirstAndLastSuffix(cellindex begin, cellindex end) {
|
---|
336 |
|
---|
337 | // First find *any* occurance of the phrase
|
---|
338 | assert(begin <= end);
|
---|
339 |
|
---|
340 | // cout << "findFirstAndLastSuffix (" << begin << " - " << end << ") : " << toString() << endl;
|
---|
341 |
|
---|
342 | // if we're only searching one cell, it is very easy
|
---|
343 | if (begin == end) {
|
---|
344 | assert(compareSuffix(suffixArray[begin], length) == 0);
|
---|
345 | firstSuffix = lastSuffix = suffixArray[begin];
|
---|
346 | firstSuffixIndex = lastSuffixIndex = begin;
|
---|
347 | suffixFrequency = 1;
|
---|
348 | suffixFound = 1;
|
---|
349 | return 0;
|
---|
350 | }
|
---|
351 |
|
---|
352 | cellindex c;
|
---|
353 | int cmp;
|
---|
354 |
|
---|
355 | do {
|
---|
356 | // cout << "Find anywhere in " << begin << " - " << end << endl;
|
---|
357 | c = (end + begin) / 2;
|
---|
358 | cmp = compareSuffix(suffixArray[c], length);
|
---|
359 | if (cmp == 1) {
|
---|
360 | // target phrase is lower than phrase at suffixArray[c]
|
---|
361 | begin = c + 1;
|
---|
362 | } else if (cmp == -1) {
|
---|
363 | // target phrase is higher than phrase at suffixArray[c]
|
---|
364 | end = c - 1;
|
---|
365 | }
|
---|
366 | } while (cmp != 0);
|
---|
367 |
|
---|
368 | cellindex lastbegin = c;
|
---|
369 | cellindex lastend = end;
|
---|
370 |
|
---|
371 | // Next find the first occurance of the phrase.
|
---|
372 | // We know that the first occurance must be between the
|
---|
373 | // current value of begin and the current value of c.
|
---|
374 | end = c;
|
---|
375 |
|
---|
376 | do {
|
---|
377 | // cout << "first-searching with " << begin << " - " << end << endl;
|
---|
378 | if (begin == end) {
|
---|
379 | c = begin;
|
---|
380 | cmp = 0;
|
---|
381 | } else {
|
---|
382 | c = (begin + end) / 2;
|
---|
383 | cmp = compareSuffix(suffixArray[c], length);
|
---|
384 | if (cmp == 1) {
|
---|
385 | // target phrase is lower than phrase at suffixArray[c]
|
---|
386 | begin = c + 1;
|
---|
387 | } else {
|
---|
388 | assert(cmp == 0);
|
---|
389 | // target phrase is the same as the phrase at suffixArray[c]. However,
|
---|
390 | // to find the first occurance, suffixArray[c] must be the same as the
|
---|
391 | // phrase, but suffixArray[c-1] must be different.
|
---|
392 | cmp = compareSuffix(suffixArray[c-1], length);
|
---|
393 | if (cmp == 0) {
|
---|
394 | end = c - 1;
|
---|
395 | assert(end >= begin);
|
---|
396 | cmp = 1;
|
---|
397 | } else {
|
---|
398 | cmp = 0;
|
---|
399 | }
|
---|
400 | }
|
---|
401 | }
|
---|
402 | } while (cmp != 0);
|
---|
403 |
|
---|
404 | // we have found the first location
|
---|
405 | firstSuffixIndex = c;
|
---|
406 | firstSuffix = suffixArray[c];
|
---|
407 |
|
---|
408 | // Next find the last occurance of the phrase.
|
---|
409 | // We previously stored some bounds for its loccation.
|
---|
410 | begin = lastbegin;
|
---|
411 | end = lastend;
|
---|
412 |
|
---|
413 | do {
|
---|
414 | // cout << "last-searching with " << begin << " - " << end << endl;
|
---|
415 | if (begin == end) {
|
---|
416 | c = begin;
|
---|
417 | cmp = 0;
|
---|
418 | } else {
|
---|
419 | c = (begin + end) / 2;
|
---|
420 | cmp = compareSuffix(suffixArray[c], length);
|
---|
421 | if (cmp == -1) {
|
---|
422 | // target phrase is greater than phrase at suffixArray[c]
|
---|
423 | end = c - 1;
|
---|
424 | } else {
|
---|
425 | assert(cmp == 0);
|
---|
426 | // target phrase is the same as the phrase at suffixArray[c]. However,
|
---|
427 | // to find the last occurance, one of two additional condiditons must be met:
|
---|
428 | // either c must be the very last cell in the array, or
|
---|
429 | // suffixArray[c+1] must be different from phrase
|
---|
430 | if (c < inputLength-1) {
|
---|
431 | cmp = compareSuffix(suffixArray[c+1], length);
|
---|
432 | if (cmp == 0) {
|
---|
433 | begin = c + 1;
|
---|
434 | cmp = -1;
|
---|
435 | } else {
|
---|
436 | cmp = 0;
|
---|
437 | }
|
---|
438 | }
|
---|
439 | }
|
---|
440 | }
|
---|
441 | } while (cmp != 0);
|
---|
442 |
|
---|
443 | lastSuffixIndex = c;
|
---|
444 | lastSuffix = suffixArray[c];
|
---|
445 | suffixFrequency = lastSuffixIndex - firstSuffixIndex + 1;
|
---|
446 | suffixFound = 1;
|
---|
447 |
|
---|
448 | return 0;
|
---|
449 | }
|
---|
450 |
|
---|
451 |
|
---|
452 | // Find the phrase in a prefix array
|
---|
453 | //
|
---|
454 | // Given a prefix array, find the first and last occurances of the phrase.
|
---|
455 | // The begin and end parameters identify which cells to search (inclusive).
|
---|
456 |
|
---|
457 | int Phrase::findFirstAndLastPrefix() {
|
---|
458 | return findFirstAndLastPrefix(0, inputLength-1);
|
---|
459 | }
|
---|
460 |
|
---|
461 | int Phrase::findFirstAndLastPrefix(cellindex begin, cellindex end) {
|
---|
462 |
|
---|
463 | // First find *any* occurance of the phrase
|
---|
464 | assert(begin <= end);
|
---|
465 |
|
---|
466 | // cout << "findFirstAndLastPrefix (" << begin << " - " << end << ") : " << toString() << endl;
|
---|
467 |
|
---|
468 | // if we're only searching one cell, it is very easy
|
---|
469 | if (begin == end) {
|
---|
470 | assert(comparePrefix(prefixArray[begin], length) == 0);
|
---|
471 | firstPrefix = lastPrefix = prefixArray[begin];
|
---|
472 | firstPrefixIndex = lastPrefixIndex = begin;
|
---|
473 | prefixFrequency = 1;
|
---|
474 | prefixFound = 1;
|
---|
475 | return 0;
|
---|
476 | }
|
---|
477 |
|
---|
478 | cellindex c;
|
---|
479 | int cmp;
|
---|
480 |
|
---|
481 | do {
|
---|
482 | // cout << "Find anywhere in prefixarray " << begin << " - " << end << endl;
|
---|
483 | c = (end + begin) / 2;
|
---|
484 | cmp = comparePrefix(prefixArray[c], length);
|
---|
485 | if (cmp == 1) {
|
---|
486 | begin = c + 1;
|
---|
487 | } else if (cmp == -1) {
|
---|
488 | end = c - 1;
|
---|
489 | }
|
---|
490 | } while (cmp != 0);
|
---|
491 |
|
---|
492 | cellindex lastbegin = c;
|
---|
493 | cellindex lastend = end;
|
---|
494 |
|
---|
495 | // Next find the first occurance of the phrase.
|
---|
496 | // We know that the first occurance must be between the
|
---|
497 | // current value of begin and the cureent value of c.
|
---|
498 | end = c;
|
---|
499 |
|
---|
500 | do {
|
---|
501 | // cout << "first-prefix-searching with " << begin << " - " << end << endl;
|
---|
502 | if (begin == end) {
|
---|
503 | c = begin;
|
---|
504 | cmp = 0;
|
---|
505 | } else {
|
---|
506 | c = (begin + end) / 2;
|
---|
507 | cmp = comparePrefix(prefixArray[c], length);
|
---|
508 | // cout << "cmp: " << cmp << ", c: " << c << endl;
|
---|
509 | if (cmp == 1) {
|
---|
510 | // target phrase is lower than phrase at prefixArray[c]
|
---|
511 | begin = c + 1;
|
---|
512 | } else {
|
---|
513 | assert(cmp == 0);
|
---|
514 | // target phrase is the same as the phrase at prefixArray[c]. However, to
|
---|
515 | // find the first occurance, one of two additional condiditons must be met:
|
---|
516 | // either c == the first cell in prefix array
|
---|
517 | // or prefixArray[c-1] != phrase.
|
---|
518 | if (c > 0) {
|
---|
519 | cmp = comparePrefix(prefixArray[c-1], length);
|
---|
520 | if (cmp == 0) {
|
---|
521 | end = c - 1;
|
---|
522 | assert(end >= begin);
|
---|
523 | cmp = 1;
|
---|
524 | } else {
|
---|
525 | cmp = 0;
|
---|
526 | }
|
---|
527 | }
|
---|
528 | }
|
---|
529 | }
|
---|
530 | } while (cmp != 0);
|
---|
531 |
|
---|
532 | // we have found the first location
|
---|
533 | firstPrefixIndex = c;
|
---|
534 | firstPrefix = prefixArray[c];
|
---|
535 |
|
---|
536 | // Next find the last occurance of the phrase.
|
---|
537 | // We previously stored some bounds for its loccation.
|
---|
538 | begin = lastbegin;
|
---|
539 | end = lastend;
|
---|
540 |
|
---|
541 | do {
|
---|
542 | // cout << "last-prefix-searching with " << begin << " - " << end << endl;
|
---|
543 | if (begin == end) {
|
---|
544 | c = begin;
|
---|
545 | cmp = 0;
|
---|
546 | } else {
|
---|
547 | c = (begin + end) / 2;
|
---|
548 | cmp = comparePrefix(prefixArray[c], length);
|
---|
549 | if (cmp == -1) {
|
---|
550 | // target phrase is greater than phrase at prefixArray[c]
|
---|
551 | end = c - 1;
|
---|
552 | } else {
|
---|
553 | assert(cmp == 0);
|
---|
554 | // target phrase is the same as the phrase at prefixArray[c]. However,
|
---|
555 | // to find the last occurance, prefixArray[c] must be the same as the
|
---|
556 | // phrase, but prefixArray[c+1] must be different.
|
---|
557 | cmp = comparePrefix(prefixArray[c+1], length);
|
---|
558 | if (cmp == 0) {
|
---|
559 | begin = c + 1;
|
---|
560 | cmp = -1;
|
---|
561 | } else {
|
---|
562 | cmp = 0;
|
---|
563 | }
|
---|
564 | }
|
---|
565 | }
|
---|
566 | } while (cmp != 0);
|
---|
567 |
|
---|
568 | lastPrefixIndex = c;
|
---|
569 | lastPrefix = prefixArray[c];
|
---|
570 | prefixFrequency = lastPrefixIndex - firstPrefixIndex + 1;
|
---|
571 | prefixFound = 1;
|
---|
572 |
|
---|
573 | return 0;
|
---|
574 | }
|
---|
575 |
|
---|
576 |
|
---|
577 | // Ensure that the phrase has been found in the suffix & prefix arrays
|
---|
578 |
|
---|
579 | int Phrase::ensureSuffixFound() {
|
---|
580 | if (!suffixFound) {
|
---|
581 | findFirstAndLastSuffix();
|
---|
582 | }
|
---|
583 | return 0;
|
---|
584 | }
|
---|
585 |
|
---|
586 | int Phrase::ensurePrefixFound() {
|
---|
587 | if (!prefixFound) {
|
---|
588 | findFirstAndLastPrefix();
|
---|
589 | }
|
---|
590 | return 0;
|
---|
591 | }
|
---|
592 |
|
---|
593 |
|
---|
594 | // Calculate a set of initial suffix/prefix candidates
|
---|
595 | //
|
---|
596 | // Expand the phrase to find the initial
|
---|
597 | // set of candidates that may be extensions of that phrase,
|
---|
598 | // and add them to the end of the results vector
|
---|
599 |
|
---|
600 | int Phrase::initialSuffixCandidates(vector<Phrase> &results) {
|
---|
601 |
|
---|
602 | ensureSuffixFound();
|
---|
603 | Phrase next;
|
---|
604 | cellindex i = firstSuffixIndex;
|
---|
605 |
|
---|
606 | // Find all the expansions of Phrase p
|
---|
607 | while (i <= lastSuffixIndex) {
|
---|
608 |
|
---|
609 | // create new phrase exactly one longer than the current one
|
---|
610 | next = newPhraseShortestSuffixExpansion(i);
|
---|
611 |
|
---|
612 | // If the expansion occurs more than once and is not delimited, expand it
|
---|
613 | if ((*(next.back) > LASTDELIMITER) && (next.suffixFrequency >= 2)) {
|
---|
614 | next.expandWhileUniqueSuffixExtension();
|
---|
615 | results.push_back(next);
|
---|
616 | }
|
---|
617 |
|
---|
618 | // Move onto the next expansion
|
---|
619 | i = next.lastSuffixIndex + 1;
|
---|
620 |
|
---|
621 | }
|
---|
622 | return 0;
|
---|
623 | }
|
---|
624 |
|
---|
625 |
|
---|
626 | int Phrase::initialPrefixCandidates(vector<Phrase> &results) {
|
---|
627 |
|
---|
628 | ensurePrefixFound();
|
---|
629 | Phrase next;
|
---|
630 | cellindex i = firstPrefixIndex;
|
---|
631 |
|
---|
632 | // Find all the expansions of Phrase p
|
---|
633 | while (i <= lastPrefixIndex) {
|
---|
634 |
|
---|
635 | // create new phrase exactly one longer than the current one
|
---|
636 | next = newPhraseShortestPrefixExpansion(i);
|
---|
637 |
|
---|
638 | // If the expansion occurs more than once and is not delimited, expand it
|
---|
639 | if ((*(next.forward) > LASTDELIMITER) && (next.prefixFrequency >= 2)) {
|
---|
640 | next.expandWhileUniquePrefixExtension();
|
---|
641 | results.push_back(next);
|
---|
642 | }
|
---|
643 |
|
---|
644 | // Move onto the next expansion
|
---|
645 | i = next.lastPrefixIndex + 1;
|
---|
646 |
|
---|
647 | }
|
---|
648 | return 0;
|
---|
649 | }
|
---|
650 |
|
---|
651 |
|
---|
652 | // Create a new phrase that is longer then the current one, but as short
|
---|
653 | // as possible. In ANYPHRASE mode this means the new phrase is exactly
|
---|
654 | // one symbol longer than the current one, but in STOPWORD mode the expansion
|
---|
655 | // must introduce a new content symbol
|
---|
656 | //
|
---|
657 | // Note that we never expand across delimiters; if we encounter a delimiter
|
---|
658 | // then this becomes the "end" symbol of the phrase. The calling function
|
---|
659 | // should check for this condition.
|
---|
660 |
|
---|
661 | Phrase Phrase::newPhraseShortestSuffixExpansion(cellindex i) {
|
---|
662 |
|
---|
663 | // create a new phrase exactly one symbol longer than the current one
|
---|
664 | Phrase p(suffixArray[i], length + 1, SUFFIX);
|
---|
665 |
|
---|
666 | if (phraseMode == STOPWORDS) {
|
---|
667 | // in STOPWORDS mode we must introduce a new content symbol
|
---|
668 | while ((*(p.back) >= firstStopSymbol) &&
|
---|
669 | (*(p.back) <= lastStopSymbol)) {
|
---|
670 | p.increaseSuffixLength(p.length + 1);
|
---|
671 | }
|
---|
672 | }
|
---|
673 |
|
---|
674 | p.findFirstAndLastSuffix(i, lastSuffixIndex);
|
---|
675 | return p;
|
---|
676 | }
|
---|
677 |
|
---|
678 | Phrase Phrase::newPhraseShortestPrefixExpansion(cellindex i) {
|
---|
679 |
|
---|
680 | // create a new phrase exactly one symbol longer than the current one
|
---|
681 | Phrase p(prefixArray[i], length + 1, PREFIX);
|
---|
682 |
|
---|
683 | if (phraseMode == STOPWORDS) {
|
---|
684 | // in STOPWORDS mode we must introduce a new content symbol
|
---|
685 | while ((*(p.forward) >= firstStopSymbol) &&
|
---|
686 | (*(p.forward) <= lastStopSymbol)) {
|
---|
687 | p.increasePrefixLength(p.length + 1);
|
---|
688 | }
|
---|
689 | }
|
---|
690 |
|
---|
691 | p.findFirstAndLastPrefix(i, lastPrefixIndex);
|
---|
692 | return p;
|
---|
693 | }
|
---|
694 |
|
---|
695 |
|
---|
696 | // Expand a phrase until it no longer has a unique suffix extension
|
---|
697 | //
|
---|
698 | // If the phrase only occurs once in the suffix array, do nothing.
|
---|
699 | //
|
---|
700 | // If we are in stopwords mode then we expand if there is a unique
|
---|
701 | // suffix expansion and this expansion is a content word. We do this
|
---|
702 | // with the content_len variable, which holds the length of the longest
|
---|
703 | // phrase ending in a content word.
|
---|
704 |
|
---|
705 | int Phrase::expandWhileUniqueSuffixExtension() {
|
---|
706 | assert(forward);
|
---|
707 | assert(suffixFound);
|
---|
708 |
|
---|
709 | // if the phrase occurs only once, do nothing
|
---|
710 | if (suffixFrequency < 2) {
|
---|
711 | return 0;
|
---|
712 | }
|
---|
713 |
|
---|
714 | // count the length over which the cells are the same
|
---|
715 | // and no delimiter is crossed.
|
---|
716 | symbol *fst = firstSuffix;
|
---|
717 | symbol *lst = lastSuffix;
|
---|
718 | cellindex len = 0;
|
---|
719 | cellindex content_len = 0;
|
---|
720 |
|
---|
721 | // Calculate the length over which the phrases match
|
---|
722 | while((*fst == *lst) && (*fst > LASTDELIMITER)) {
|
---|
723 | len++;
|
---|
724 | if (*fst > lastStopSymbol) content_len = len;
|
---|
725 | fst++;
|
---|
726 | lst++;
|
---|
727 | }
|
---|
728 |
|
---|
729 | // in ANYPHRASE mode, expand the phrase for as long as possible
|
---|
730 | if (phraseMode == ANYPHRASE) {
|
---|
731 | if (len > length) {
|
---|
732 | increaseSuffixLength(len);
|
---|
733 | }
|
---|
734 | }
|
---|
735 | // in STOPWORDS mode, expand only as far as the last contentword
|
---|
736 | else {
|
---|
737 | if (content_len > length) {
|
---|
738 | increaseSuffixLength(content_len);
|
---|
739 | }
|
---|
740 | }
|
---|
741 |
|
---|
742 | uniqueSuffixExtension = 0;
|
---|
743 | return 0;
|
---|
744 | }
|
---|
745 |
|
---|
746 | int Phrase::expandWhileUniquePrefixExtension() {
|
---|
747 | assert(prefixFound);
|
---|
748 |
|
---|
749 | // if the phrase occurs only once, do nothing
|
---|
750 | if (prefixFrequency < 2) {
|
---|
751 | return 0;
|
---|
752 | }
|
---|
753 |
|
---|
754 | // count the length over which the cells are the same
|
---|
755 | // and no delimiter is crossed.
|
---|
756 | symbol *fst = firstPrefix;
|
---|
757 | symbol *lst = lastPrefix;
|
---|
758 | cellindex len = 0;
|
---|
759 | cellindex content_len = 0;
|
---|
760 |
|
---|
761 | // Calculate the length over which the phrases match
|
---|
762 | while((*fst == *lst) && (*fst > LASTDELIMITER)) {
|
---|
763 | len++;
|
---|
764 | if (*fst > lastStopSymbol) content_len = len;
|
---|
765 | fst--;
|
---|
766 | lst--;
|
---|
767 | }
|
---|
768 |
|
---|
769 | // in ANYPHRASE mode, expand the phrase for as long as possible
|
---|
770 | if (phraseMode == ANYPHRASE) {
|
---|
771 | if (len > length) {
|
---|
772 | increasePrefixLength(len);
|
---|
773 | }
|
---|
774 | }
|
---|
775 | // in STOPWORDS mode, expand only as far as the last contentword
|
---|
776 | else {
|
---|
777 | if (content_len > length) {
|
---|
778 | increasePrefixLength(content_len);
|
---|
779 | }
|
---|
780 | }
|
---|
781 |
|
---|
782 | uniquePrefixExtension = 0;
|
---|
783 | return 0;
|
---|
784 | }
|
---|
785 |
|
---|
786 |
|
---|
787 |
|
---|
788 |
|
---|
789 |
|
---|
790 |
|
---|
791 |
|
---|
792 |
|
---|
793 |
|
---|
794 |
|
---|
795 |
|
---|
796 |
|
---|
797 | // Compare the length of two phrases
|
---|
798 | //
|
---|
799 | // Given two phrases, return true if the first is shorter/longer,
|
---|
800 | // otherwise return false. For use in various sort functions.
|
---|
801 |
|
---|
802 | bool isShorter(Phrase p1, Phrase p2) {
|
---|
803 | if (p1.length < p2.length) {
|
---|
804 | return true;
|
---|
805 | }
|
---|
806 | return false;
|
---|
807 | }
|
---|
808 |
|
---|
809 | bool isLonger(Phrase p1, Phrase p2) {
|
---|
810 | if (p1.length > p2.length) {
|
---|
811 | return true;
|
---|
812 | }
|
---|
813 | return false;
|
---|
814 | }
|
---|
815 |
|
---|
816 |
|
---|