1 | /**********************************************************************
|
---|
2 | *
|
---|
3 | * phrase.cpp -- implementation of the phrase object used by suffix.cpp
|
---|
4 | *
|
---|
5 | * Copyright 2000 Gordon W. Paynter
|
---|
6 | * Copyright 2000 The New Zealand Digital Library Project
|
---|
7 | *
|
---|
8 | * A component of the Greenstone digital library software
|
---|
9 | * from the New Zealand Digital Library Project at the
|
---|
10 | * University of Waikato, New Zealand.
|
---|
11 | *
|
---|
12 | * This program is free software; you can redistribute it and/or modify
|
---|
13 | * it under the terms of the GNU General Public License as published by
|
---|
14 | * the Free Software Foundation; either version 2 of the License, or
|
---|
15 | * (at your option) any later version.
|
---|
16 | *
|
---|
17 | * This program is distributed in the hope that it will be useful,
|
---|
18 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
19 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
20 | * GNU General Public License for more details.
|
---|
21 | *
|
---|
22 | * You should have received a copy of the GNU General Public License
|
---|
23 | * along with this program; if not, write to the Free Software
|
---|
24 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
25 | *
|
---|
26 | *********************************************************************/
|
---|
27 |
|
---|
28 | #include <assert.h>
|
---|
29 | #include <stdio.h>
|
---|
30 |
|
---|
31 | #if defined(GSDL_USE_STL_H)
|
---|
32 | # include <vector.h>
|
---|
33 | #else
|
---|
34 | # include <vector>
|
---|
35 | #endif
|
---|
36 |
|
---|
37 | #if defined(GSDL_USE_IOS_H)
|
---|
38 | # include <iostream.h>
|
---|
39 | #else
|
---|
40 | # include <iostream>
|
---|
41 | #endif
|
---|
42 |
|
---|
43 | #include "suffix.h"
|
---|
44 | #include "phrase.h"
|
---|
45 |
|
---|
46 |
|
---|
47 | // Phrase constructor functions
|
---|
48 |
|
---|
49 |
|
---|
50 | Phrase::Phrase(symbol *words, cellcount size, int direction) {
|
---|
51 | empty();
|
---|
52 | length = size;
|
---|
53 |
|
---|
54 | if (direction == SUFFIX) {
|
---|
55 | forward = words;
|
---|
56 | back = forward + size - 1;
|
---|
57 | } else {
|
---|
58 | back = words;
|
---|
59 | forward = back - size + 1;
|
---|
60 | }
|
---|
61 | }
|
---|
62 |
|
---|
63 |
|
---|
64 | Phrase::Phrase() {
|
---|
65 | empty();
|
---|
66 | }
|
---|
67 |
|
---|
68 |
|
---|
69 | Phrase::Phrase(const Phrase &p) {
|
---|
70 | forward = p.forward;
|
---|
71 | back = p.back;
|
---|
72 | length = p.length;
|
---|
73 |
|
---|
74 | suffixFound = p.suffixFound;
|
---|
75 | prefixFound = p.prefixFound;
|
---|
76 |
|
---|
77 | firstSuffix = p.firstSuffix;
|
---|
78 | lastSuffix = p.lastSuffix;
|
---|
79 | firstSuffixIndex = p.firstSuffixIndex;
|
---|
80 | lastSuffixIndex = p.lastSuffixIndex;
|
---|
81 | suffixFrequency = p.suffixFrequency;
|
---|
82 |
|
---|
83 | firstPrefix = p.firstPrefix;
|
---|
84 | lastPrefix = p.lastPrefix;
|
---|
85 | firstPrefixIndex = p.firstPrefixIndex;
|
---|
86 | lastPrefixIndex = p.lastPrefixIndex;
|
---|
87 | prefixFrequency = p.prefixFrequency;
|
---|
88 |
|
---|
89 | uniqueSuffixExtension = p.uniqueSuffixExtension;
|
---|
90 | uniquePrefixExtension = p.uniquePrefixExtension;
|
---|
91 | }
|
---|
92 |
|
---|
93 |
|
---|
94 | // Empty the contents of a phrase
|
---|
95 |
|
---|
96 | int Phrase::empty() {
|
---|
97 |
|
---|
98 | forward = back = NULL;
|
---|
99 | length = 0;
|
---|
100 |
|
---|
101 | suffixFound = prefixFound = 0;
|
---|
102 | firstSuffix = firstPrefix = NULL;
|
---|
103 | lastSuffix = lastPrefix = NULL;
|
---|
104 |
|
---|
105 | firstSuffixIndex = lastSuffixIndex = suffixFrequency = 0;
|
---|
106 | firstPrefixIndex = lastPrefixIndex = prefixFrequency = 0;
|
---|
107 |
|
---|
108 | uniqueSuffixExtension = uniquePrefixExtension = -1;
|
---|
109 |
|
---|
110 | return 0;
|
---|
111 | }
|
---|
112 |
|
---|
113 | // Remove the details of the location where a suffix/prefix
|
---|
114 | // is found in the array.
|
---|
115 |
|
---|
116 | int Phrase::clearSuffix() {
|
---|
117 | suffixFound = 0;
|
---|
118 | firstSuffix = lastSuffix = NULL;
|
---|
119 | firstSuffixIndex = lastSuffixIndex = suffixFrequency = 0;
|
---|
120 | uniqueSuffixExtension = -1;
|
---|
121 | return 0;
|
---|
122 | }
|
---|
123 |
|
---|
124 | int Phrase::clearPrefix() {
|
---|
125 | prefixFound = 0;
|
---|
126 | firstPrefix = lastPrefix = NULL;
|
---|
127 | firstPrefixIndex = lastPrefixIndex = prefixFrequency = 0;
|
---|
128 | uniquePrefixExtension = -1;
|
---|
129 | return 0;
|
---|
130 | }
|
---|
131 |
|
---|
132 | // Increase the length of s phrase
|
---|
133 | //
|
---|
134 | // We increase the length of a string "in place". When we expand
|
---|
135 | // the suffix, however, we invalidate the prefix data, and vice-versa.
|
---|
136 |
|
---|
137 | int Phrase::increaseSuffixLength(cellcount l) {
|
---|
138 | length = l;
|
---|
139 | clearPrefix();
|
---|
140 | back = forward + l - 1;
|
---|
141 | return 0;
|
---|
142 | }
|
---|
143 |
|
---|
144 | int Phrase::increasePrefixLength(cellcount l) {
|
---|
145 | length = l;
|
---|
146 | clearSuffix();
|
---|
147 | forward = back - l + 1;
|
---|
148 | return 0;
|
---|
149 | }
|
---|
150 |
|
---|
151 | // Shorten a phrase by one symbol
|
---|
152 | int Phrase::shortenByOneAtSuffix() {
|
---|
153 | --length;
|
---|
154 | --back;
|
---|
155 | if (phraseMode==STOPWORDS) {
|
---|
156 | while (*back >=firstStopSymbol && *back <= lastStopSymbol) {
|
---|
157 | --length;
|
---|
158 | --back;
|
---|
159 | }
|
---|
160 | }
|
---|
161 | clearSuffix();
|
---|
162 | clearPrefix();
|
---|
163 | return 0;
|
---|
164 | }
|
---|
165 |
|
---|
166 | int Phrase::shortenByOneAtPrefix() {
|
---|
167 | --length;
|
---|
168 | ++forward;
|
---|
169 | if (phraseMode==STOPWORDS) {
|
---|
170 | while (*forward >=firstStopSymbol && *forward <= lastStopSymbol) {
|
---|
171 | --length;
|
---|
172 | ++forward;
|
---|
173 | }
|
---|
174 | }
|
---|
175 | clearSuffix();
|
---|
176 | clearPrefix();
|
---|
177 | return 0;
|
---|
178 | }
|
---|
179 |
|
---|
180 |
|
---|
181 | // Output a phrase to a stream
|
---|
182 | ostream &operator<<(ostream &stream, const Phrase &phrase)
|
---|
183 | {
|
---|
184 | assert(phrase.forward);
|
---|
185 | symbol *s = phrase.forward;
|
---|
186 |
|
---|
187 | stream << "s" << *s++;
|
---|
188 | for (cellcount i = 1; i < phrase.length; i++)
|
---|
189 | stream << " s" << *s++;
|
---|
190 |
|
---|
191 | return stream;
|
---|
192 | }
|
---|
193 |
|
---|
194 |
|
---|
195 | // Convert the phrase to a string
|
---|
196 | // Note thgat you have to delete the memory yourself.
|
---|
197 | char *Phrase::toString()
|
---|
198 | {
|
---|
199 | assert(forward);
|
---|
200 |
|
---|
201 | char *str;
|
---|
202 | str = new char[length*20];
|
---|
203 | symbol *s = forward;
|
---|
204 | sprintf(str, "s%d", *s++);
|
---|
205 |
|
---|
206 | for (cellcount i = 1; i < length; i++) {
|
---|
207 | sprintf(str, "%s s%d", str, *s++);
|
---|
208 | }
|
---|
209 |
|
---|
210 | return str;
|
---|
211 | }
|
---|
212 |
|
---|
213 |
|
---|
214 | // Compare a phrase to an another array of symbols
|
---|
215 | //
|
---|
216 | // Given an array of words with a specified length,
|
---|
217 | // compare those words to this phrase.
|
---|
218 | //
|
---|
219 | // Return 0 if the two phrases are the same over length length
|
---|
220 | // Return 1 if the phrase is "greater" than tyhe words, or
|
---|
221 | // Return -1 if the phrase is less.
|
---|
222 |
|
---|
223 | int Phrase::compareSuffix(symbol *words, cellcount length) {
|
---|
224 | assert(forward);
|
---|
225 | symbol *p = forward;
|
---|
226 |
|
---|
227 | for (cellcount i = 0; i < length; i++) {
|
---|
228 | if (*p > *words) {
|
---|
229 | return 1;
|
---|
230 | } else if (*p < *words) {
|
---|
231 | return -1;
|
---|
232 | } else {
|
---|
233 | *p++;
|
---|
234 | *words++;
|
---|
235 | }
|
---|
236 | }
|
---|
237 |
|
---|
238 | return 0;
|
---|
239 |
|
---|
240 | }
|
---|
241 |
|
---|
242 | int Phrase::comparePrefix(symbol *words, cellcount length) {
|
---|
243 |
|
---|
244 | assert(back);
|
---|
245 | symbol *p = back;
|
---|
246 |
|
---|
247 | for (cellcount i = 0; i < length; i++) {
|
---|
248 | if (*p > *words) {
|
---|
249 | return -1;
|
---|
250 | } else if (*p < *words) {
|
---|
251 | return 1;
|
---|
252 | } else {
|
---|
253 | *p--;
|
---|
254 | *words--;
|
---|
255 | }
|
---|
256 | }
|
---|
257 |
|
---|
258 | return 0;
|
---|
259 |
|
---|
260 | }
|
---|
261 |
|
---|
262 |
|
---|
263 | // Does the phrase have a unique suffix/prefix extension?
|
---|
264 | //
|
---|
265 | // For suffix:
|
---|
266 | // If uniqueSuffixExtensions is 1, then phrase has some unique expansion.
|
---|
267 | // If it is 0 then it does not. If it is -1 then we don't know, and have
|
---|
268 | // to calculate it from the suffix array. Same goes for prefixes.
|
---|
269 |
|
---|
270 | int Phrase::hasUniqueSuffixExtension() {
|
---|
271 |
|
---|
272 | // if we don't know, then work it out
|
---|
273 | if (uniqueSuffixExtension == -1) {
|
---|
274 |
|
---|
275 | ensureSuffixFound();
|
---|
276 |
|
---|
277 | // pointers to the phrase's first and last occurances in te suffixArray
|
---|
278 | symbol *fst = firstSuffix + length;
|
---|
279 | symbol *lst = lastSuffix + length;
|
---|
280 |
|
---|
281 | // in ANYPHRASE mode, simply check the next symbol
|
---|
282 | if (phraseMode == ANYPHRASE) {
|
---|
283 | if ((*fst == *lst) && (*fst > LASTDELIMITER)) {
|
---|
284 | uniqueSuffixExtension = 1;
|
---|
285 | } else {
|
---|
286 | uniqueSuffixExtension = 0;
|
---|
287 | }
|
---|
288 | }
|
---|
289 | // in STOPWORDS mode, make sure there is a unique next content symbol
|
---|
290 | else {
|
---|
291 | uniqueSuffixExtension = 0;
|
---|
292 | while ((*fst == *lst) && (*fst > LASTDELIMITER)) {
|
---|
293 | if (*fst >= firstContentSymbol) {
|
---|
294 | uniqueSuffixExtension = 1;
|
---|
295 | break;
|
---|
296 | }
|
---|
297 | fst++;
|
---|
298 | lst++;
|
---|
299 | }
|
---|
300 | }
|
---|
301 | }
|
---|
302 |
|
---|
303 | return uniqueSuffixExtension;
|
---|
304 | }
|
---|
305 |
|
---|
306 | int Phrase::hasUniquePrefixExtension() {
|
---|
307 |
|
---|
308 | // if we don't know, then work it out
|
---|
309 | if (uniquePrefixExtension == -1) {
|
---|
310 |
|
---|
311 | ensurePrefixFound();
|
---|
312 |
|
---|
313 | // pointers to the phrase's first and last occurances in the prefixArray
|
---|
314 | symbol *fst = firstPrefix - length;
|
---|
315 | symbol *lst = lastPrefix - length;
|
---|
316 |
|
---|
317 | // in ANYPHRASE mode, simply check the next symbol
|
---|
318 | if (phraseMode == ANYPHRASE) {
|
---|
319 | if ((*fst == *lst) && (*fst > LASTDELIMITER)) {
|
---|
320 | uniquePrefixExtension = 1;
|
---|
321 | } else {
|
---|
322 | uniquePrefixExtension = 0;
|
---|
323 | }
|
---|
324 | }
|
---|
325 | // in STOPWORDS mode, make sure there is a unique next content symbol
|
---|
326 | else {
|
---|
327 | uniquePrefixExtension = 0;
|
---|
328 | while ((*fst == *lst) && (*fst > LASTDELIMITER)) {
|
---|
329 | if (*fst >= firstContentSymbol) {
|
---|
330 | uniquePrefixExtension = 1;
|
---|
331 | break;
|
---|
332 | }
|
---|
333 | fst--;
|
---|
334 | lst--;
|
---|
335 | }
|
---|
336 | }
|
---|
337 | }
|
---|
338 |
|
---|
339 | return uniquePrefixExtension;
|
---|
340 | }
|
---|
341 |
|
---|
342 |
|
---|
343 | // Expand a phrase with a unique suffix/prefix by 1 symbol
|
---|
344 | //
|
---|
345 | // Note that in STOPWORDS mode a "unique extension" means a unique
|
---|
346 | // extension that ends on a content symbol. Although we may extend
|
---|
347 | // such a phrase by 2 or more symbols, we will only extend it by a
|
---|
348 | // single content symbol.
|
---|
349 |
|
---|
350 | int Phrase::expandUniqueSuffixExtensionByOne() {
|
---|
351 | assert(suffixFound);
|
---|
352 | assert(uniqueSuffixExtension == 1);
|
---|
353 |
|
---|
354 | // in ANYPHRASE mode, expand the phrase by one symbol
|
---|
355 | if (phraseMode == ANYPHRASE) {
|
---|
356 | increaseSuffixLength(length + 1);
|
---|
357 | }
|
---|
358 | // in STOPWORDS mode, expand the phrase up to the next content symbol
|
---|
359 | else {
|
---|
360 | symbol *next = firstSuffix + length;
|
---|
361 | cellcount newlength = length + 1;
|
---|
362 |
|
---|
363 | while (*next < firstContentSymbol) {
|
---|
364 | next++;
|
---|
365 | newlength++;
|
---|
366 | }
|
---|
367 | increaseSuffixLength(newlength);
|
---|
368 | }
|
---|
369 |
|
---|
370 | uniqueSuffixExtension = -1;
|
---|
371 | return 0;
|
---|
372 | }
|
---|
373 |
|
---|
374 | int Phrase::expandUniquePrefixExtensionByOne() {
|
---|
375 | assert(prefixFound);
|
---|
376 | assert(uniquePrefixExtension == 1);
|
---|
377 |
|
---|
378 | // in ANYPHRASE mode, expand the phrase by one symbol
|
---|
379 | if (phraseMode == ANYPHRASE) {
|
---|
380 | increasePrefixLength(length + 1);
|
---|
381 | }
|
---|
382 | // in STOPWORDS mode, expand the phrase to the next content symbol
|
---|
383 | else {
|
---|
384 | symbol *next = firstPrefix - length;
|
---|
385 | cellcount newlength = length + 1;
|
---|
386 | while (*next < firstContentSymbol) {
|
---|
387 | next--;
|
---|
388 | newlength++;
|
---|
389 | }
|
---|
390 | increasePrefixLength(newlength);
|
---|
391 |
|
---|
392 | }
|
---|
393 |
|
---|
394 | uniquePrefixExtension = -1;
|
---|
395 | return 0;
|
---|
396 | }
|
---|
397 |
|
---|
398 |
|
---|
399 | // Find the phrase in a suffixArray
|
---|
400 | //
|
---|
401 | // Given a suffix array, find the first and last occurances of the phrase.
|
---|
402 | // The begin and end parameters identify which cells to search (inclusive).
|
---|
403 |
|
---|
404 | int Phrase::findFirstAndLastSuffix() {
|
---|
405 |
|
---|
406 | return findFirstAndLastSuffix(0, inputLength-1);
|
---|
407 | }
|
---|
408 |
|
---|
409 | int Phrase::findFirstAndLastSuffix(cellindex begin, cellindex end) {
|
---|
410 |
|
---|
411 | // First find *any* occurance of the phrase
|
---|
412 | assert(begin <= end);
|
---|
413 |
|
---|
414 | // if we're only searching one cell, it is very easy
|
---|
415 | if (begin == end) {
|
---|
416 | assert(compareSuffix(suffixArray[begin], length) == 0);
|
---|
417 | firstSuffix = lastSuffix = suffixArray[begin];
|
---|
418 | firstSuffixIndex = lastSuffixIndex = begin;
|
---|
419 | suffixFrequency = 1;
|
---|
420 | suffixFound = 1;
|
---|
421 | return 0;
|
---|
422 | }
|
---|
423 |
|
---|
424 | cellindex c;
|
---|
425 | int cmp;
|
---|
426 |
|
---|
427 | do {
|
---|
428 | c = (end + begin) / 2;
|
---|
429 | cmp = compareSuffix(suffixArray[c], length);
|
---|
430 | if (cmp == 1) {
|
---|
431 | // target phrase is lower than phrase at suffixArray[c]
|
---|
432 | begin = c + 1;
|
---|
433 | } else if (cmp == -1) {
|
---|
434 | // target phrase is higher than phrase at suffixArray[c]
|
---|
435 | end = c - 1;
|
---|
436 | }
|
---|
437 | } while (cmp != 0);
|
---|
438 |
|
---|
439 | cellindex lastbegin = c;
|
---|
440 | cellindex lastend = end;
|
---|
441 |
|
---|
442 | // Next find the first occurance of the phrase.
|
---|
443 | // We know that the first occurance must be between the
|
---|
444 | // current value of begin and the current value of c.
|
---|
445 | end = c;
|
---|
446 |
|
---|
447 | do {
|
---|
448 | if (begin == end) {
|
---|
449 | c = begin;
|
---|
450 | cmp = 0;
|
---|
451 | } else {
|
---|
452 | c = (begin + end) / 2;
|
---|
453 | cmp = compareSuffix(suffixArray[c], length);
|
---|
454 | if (cmp == 1) {
|
---|
455 | // target phrase is lower than phrase at suffixArray[c]
|
---|
456 | begin = c + 1;
|
---|
457 | } else {
|
---|
458 | assert(cmp == 0);
|
---|
459 | // target phrase is the same as the phrase at suffixArray[c]. However,
|
---|
460 | // to find the first occurance, suffixArray[c] must be the same as the
|
---|
461 | // phrase, but suffixArray[c-1] must be different.
|
---|
462 | if (c>0) {
|
---|
463 | cmp = compareSuffix(suffixArray[c-1], length);
|
---|
464 | if (cmp == 0) {
|
---|
465 | end = c - 1;
|
---|
466 | assert(end >= begin);
|
---|
467 | cmp = 1;
|
---|
468 | } else {
|
---|
469 | cmp = 0;
|
---|
470 | }
|
---|
471 | }
|
---|
472 | }
|
---|
473 | }
|
---|
474 | } while (cmp != 0);
|
---|
475 |
|
---|
476 | // we have found the first location
|
---|
477 | firstSuffixIndex = c;
|
---|
478 | firstSuffix = suffixArray[c];
|
---|
479 |
|
---|
480 | // Next find the last occurance of the phrase.
|
---|
481 | // We previously stored some bounds for its loccation.
|
---|
482 | begin = lastbegin;
|
---|
483 | end = lastend;
|
---|
484 |
|
---|
485 | do {
|
---|
486 | if (begin == end) {
|
---|
487 | c = begin;
|
---|
488 | cmp = 0;
|
---|
489 | } else {
|
---|
490 | c = (begin + end) / 2;
|
---|
491 | cmp = compareSuffix(suffixArray[c], length);
|
---|
492 | if (cmp == -1) {
|
---|
493 | // target phrase is greater than phrase at suffixArray[c]
|
---|
494 | end = c - 1;
|
---|
495 | } else {
|
---|
496 | assert(cmp == 0);
|
---|
497 | // target phrase is the same as the phrase at suffixArray[c]. However,
|
---|
498 | // to find the last occurance, one of two additional condiditons must be met:
|
---|
499 | // either c must be the very last cell in the array, or
|
---|
500 | // suffixArray[c+1] must be different from phrase
|
---|
501 | if (c < inputLength-1) {
|
---|
502 | cmp = compareSuffix(suffixArray[c+1], length);
|
---|
503 | if (cmp == 0) {
|
---|
504 | begin = c + 1;
|
---|
505 | cmp = -1;
|
---|
506 | } else {
|
---|
507 | cmp = 0;
|
---|
508 | }
|
---|
509 | }
|
---|
510 | }
|
---|
511 | }
|
---|
512 | } while (cmp != 0);
|
---|
513 |
|
---|
514 | lastSuffixIndex = c;
|
---|
515 | lastSuffix = suffixArray[c];
|
---|
516 | suffixFrequency = lastSuffixIndex - firstSuffixIndex + 1;
|
---|
517 | suffixFound = 1;
|
---|
518 |
|
---|
519 | return 0;
|
---|
520 | }
|
---|
521 |
|
---|
522 |
|
---|
523 | // Find the phrase in a prefix array
|
---|
524 | //
|
---|
525 | // Given a prefix array, find the first and last occurances of the phrase.
|
---|
526 | // The begin and end parameters identify which cells to search (inclusive).
|
---|
527 |
|
---|
528 | int Phrase::findFirstAndLastPrefix() {
|
---|
529 | return findFirstAndLastPrefix(0, inputLength-1);
|
---|
530 | }
|
---|
531 |
|
---|
532 | int Phrase::findFirstAndLastPrefix(cellindex begin, cellindex end) {
|
---|
533 |
|
---|
534 | // First find *any* occurance of the phrase
|
---|
535 | assert(begin <= end);
|
---|
536 |
|
---|
537 | // if we're only searching one cell, it is very easy
|
---|
538 | if (begin == end) {
|
---|
539 | assert(comparePrefix(prefixArray[begin], length) == 0);
|
---|
540 | firstPrefix = lastPrefix = prefixArray[begin];
|
---|
541 | firstPrefixIndex = lastPrefixIndex = begin;
|
---|
542 | prefixFrequency = 1;
|
---|
543 | prefixFound = 1;
|
---|
544 | return 0;
|
---|
545 | }
|
---|
546 |
|
---|
547 | cellindex c;
|
---|
548 | int cmp;
|
---|
549 |
|
---|
550 | do {
|
---|
551 | c = (end + begin) / 2;
|
---|
552 | cmp = comparePrefix(prefixArray[c], length);
|
---|
553 | if (cmp == 1) {
|
---|
554 | begin = c + 1;
|
---|
555 | } else if (cmp == -1) {
|
---|
556 | end = c - 1;
|
---|
557 | }
|
---|
558 | } while (cmp != 0);
|
---|
559 |
|
---|
560 | cellindex lastbegin = c;
|
---|
561 | cellindex lastend = end;
|
---|
562 |
|
---|
563 | // Next find the first occurance of the phrase.
|
---|
564 | // We know that the first occurance must be between the
|
---|
565 | // current value of begin and the cureent value of c.
|
---|
566 | end = c;
|
---|
567 |
|
---|
568 | do {
|
---|
569 | if (begin == end) {
|
---|
570 | c = begin;
|
---|
571 | cmp = 0;
|
---|
572 | } else {
|
---|
573 | c = (begin + end) / 2;
|
---|
574 | cmp = comparePrefix(prefixArray[c], length);
|
---|
575 | if (cmp == 1) {
|
---|
576 | // target phrase is lower than phrase at prefixArray[c]
|
---|
577 | begin = c + 1;
|
---|
578 | } else {
|
---|
579 | assert(cmp == 0);
|
---|
580 | // target phrase is the same as the phrase at prefixArray[c]. However, to
|
---|
581 | // find the first occurance, one of two additional condiditons must be met:
|
---|
582 | // either c == the first cell in prefix array
|
---|
583 | // or prefixArray[c-1] != phrase.
|
---|
584 | if (c > 0) {
|
---|
585 | cmp = comparePrefix(prefixArray[c-1], length);
|
---|
586 | if (cmp == 0) {
|
---|
587 | end = c - 1;
|
---|
588 | assert(end >= begin);
|
---|
589 | cmp = 1;
|
---|
590 | } else {
|
---|
591 | cmp = 0;
|
---|
592 | }
|
---|
593 | }
|
---|
594 | }
|
---|
595 | }
|
---|
596 | } while (cmp != 0);
|
---|
597 |
|
---|
598 | // we have found the first location
|
---|
599 | firstPrefixIndex = c;
|
---|
600 | firstPrefix = prefixArray[c];
|
---|
601 |
|
---|
602 | // Next find the last occurance of the phrase.
|
---|
603 | // We previously stored some bounds for its loccation.
|
---|
604 | begin = lastbegin;
|
---|
605 | end = lastend;
|
---|
606 |
|
---|
607 | do {
|
---|
608 | if (begin == end) {
|
---|
609 | c = begin;
|
---|
610 | cmp = 0;
|
---|
611 | } else {
|
---|
612 | c = (begin + end) / 2;
|
---|
613 | cmp = comparePrefix(prefixArray[c], length);
|
---|
614 | if (cmp == -1) {
|
---|
615 | // target phrase is greater than phrase at prefixArray[c]
|
---|
616 | end = c - 1;
|
---|
617 | } else {
|
---|
618 | assert(cmp == 0);
|
---|
619 | // target phrase is the same as the phrase at prefixArray[c]. However,
|
---|
620 | // to find the last occurance, prefixArray[c] must be the same as the
|
---|
621 | // phrase, but prefixArray[c+1] must be different.
|
---|
622 | cmp = comparePrefix(prefixArray[c+1], length);
|
---|
623 | if (cmp == 0) {
|
---|
624 | begin = c + 1;
|
---|
625 | cmp = -1;
|
---|
626 | } else {
|
---|
627 | cmp = 0;
|
---|
628 | }
|
---|
629 | }
|
---|
630 | }
|
---|
631 | } while (cmp != 0);
|
---|
632 |
|
---|
633 | lastPrefixIndex = c;
|
---|
634 | lastPrefix = prefixArray[c];
|
---|
635 | prefixFrequency = lastPrefixIndex - firstPrefixIndex + 1;
|
---|
636 | prefixFound = 1;
|
---|
637 |
|
---|
638 | return 0;
|
---|
639 | }
|
---|
640 |
|
---|
641 |
|
---|
642 | // Calculate a set of initial suffix/prefix candidates
|
---|
643 | //
|
---|
644 | // Expand the phrase to find the initial
|
---|
645 | // set of candidates that may be extensions of that phrase,
|
---|
646 | // and add them to the end of the results vector
|
---|
647 |
|
---|
648 | void Phrase::initialSuffixCandidates(vector<Phrase> &results) {
|
---|
649 |
|
---|
650 | ensureSuffixFound();
|
---|
651 | Phrase next;
|
---|
652 | cellindex i = firstSuffixIndex;
|
---|
653 |
|
---|
654 | // Find all the expansions of Phrase p
|
---|
655 | while (i <= lastSuffixIndex) {
|
---|
656 |
|
---|
657 | // create new phrase exactly one longer than the current one
|
---|
658 | next = newPhraseShortestSuffixExpansion(i);
|
---|
659 |
|
---|
660 | // If the expansion occurs more than once and is not delimited, expand it
|
---|
661 | if ((*(next.back) > LASTDELIMITER) && (next.suffixFrequency >= 2)) {
|
---|
662 | next.expandWhileUniqueSuffixExtension();
|
---|
663 | results.push_back(next);
|
---|
664 | }
|
---|
665 |
|
---|
666 | // Move onto the next expansion
|
---|
667 | i = next.lastSuffixIndex + 1;
|
---|
668 | }
|
---|
669 | }
|
---|
670 |
|
---|
671 |
|
---|
672 | void Phrase::initialPrefixCandidates(vector<Phrase> &results) {
|
---|
673 |
|
---|
674 | ensurePrefixFound();
|
---|
675 | Phrase next;
|
---|
676 | cellindex i = firstPrefixIndex;
|
---|
677 |
|
---|
678 | // Find all the expansions of Phrase p
|
---|
679 | while (i <= lastPrefixIndex) {
|
---|
680 |
|
---|
681 | // create new phrase exactly one longer than the current one
|
---|
682 | next = newPhraseShortestPrefixExpansion(i);
|
---|
683 |
|
---|
684 | // If the expansion occurs more than once and is not delimited, expand it
|
---|
685 | if ((*(next.forward) > LASTDELIMITER) && (next.prefixFrequency >= 2)) {
|
---|
686 | next.expandWhileUniquePrefixExtension();
|
---|
687 | results.push_back(next);
|
---|
688 | }
|
---|
689 |
|
---|
690 | // Move onto the next expansion
|
---|
691 | i = next.lastPrefixIndex + 1;
|
---|
692 | }
|
---|
693 | }
|
---|
694 |
|
---|
695 |
|
---|
696 | // Create a new phrase that is longer then the current one, but as short
|
---|
697 | // as possible. In ANYPHRASE mode this means the new phrase is exactly
|
---|
698 | // one symbol longer than the current one, but in STOPWORD mode the expansion
|
---|
699 | // must introduce a new content symbol
|
---|
700 | //
|
---|
701 | // Note that we never expand across delimiters; if we encounter a delimiter
|
---|
702 | // then this becomes the "end" symbol of the phrase. The calling function
|
---|
703 | // should check for this condition.
|
---|
704 |
|
---|
705 | Phrase Phrase::newPhraseShortestSuffixExpansion(cellindex i) {
|
---|
706 |
|
---|
707 | // create a new phrase exactly one symbol longer than the current one
|
---|
708 | Phrase p(suffixArray[i], length + 1, SUFFIX);
|
---|
709 |
|
---|
710 | if (phraseMode == STOPWORDS) {
|
---|
711 | // in STOPWORDS mode we must introduce a new content symbol
|
---|
712 | while ((*(p.back) >= firstStopSymbol) &&
|
---|
713 | (*(p.back) <= lastStopSymbol)) {
|
---|
714 | p.increaseSuffixLength(p.length + 1);
|
---|
715 | }
|
---|
716 | }
|
---|
717 |
|
---|
718 | p.findFirstAndLastSuffix(i, lastSuffixIndex);
|
---|
719 | return p;
|
---|
720 | }
|
---|
721 |
|
---|
722 | Phrase Phrase::newPhraseShortestPrefixExpansion(cellindex i) {
|
---|
723 |
|
---|
724 | // create a new phrase exactly one symbol longer than the current one
|
---|
725 | Phrase p(prefixArray[i], length + 1, PREFIX);
|
---|
726 |
|
---|
727 | if (phraseMode == STOPWORDS) {
|
---|
728 | // in STOPWORDS mode we must introduce a new content symbol
|
---|
729 | while ((*(p.forward) >= firstStopSymbol) &&
|
---|
730 | (*(p.forward) <= lastStopSymbol)) {
|
---|
731 | p.increasePrefixLength(p.length + 1);
|
---|
732 | }
|
---|
733 | }
|
---|
734 |
|
---|
735 | p.findFirstAndLastPrefix(i, lastPrefixIndex);
|
---|
736 | return p;
|
---|
737 | }
|
---|
738 |
|
---|
739 |
|
---|
740 | // Expand a phrase until it no longer has a unique suffix extension
|
---|
741 | //
|
---|
742 | // If the phrase only occurs once in the suffix array, do nothing.
|
---|
743 | //
|
---|
744 | // If we are in stopwords mode then we expand if there is a unique
|
---|
745 | // suffix expansion and this expansion is a content word. We do this
|
---|
746 | // with the content_len variable, which holds the length of the longest
|
---|
747 | // phrase ending in a content word.
|
---|
748 |
|
---|
749 | int Phrase::expandWhileUniqueSuffixExtension() {
|
---|
750 | ensureSuffixFound();
|
---|
751 |
|
---|
752 | // if the phrase occurs only once, do nothing
|
---|
753 | if (suffixFrequency < 2)
|
---|
754 | return 0;
|
---|
755 |
|
---|
756 | // if we already know there's no unique suffix extension, do nothing
|
---|
757 | if (uniqueSuffixExtension == 0)
|
---|
758 | return 0;
|
---|
759 |
|
---|
760 | // count the length over which the cells are the same
|
---|
761 | // and no delimiter is crossed.
|
---|
762 | symbol *fst = firstSuffix;
|
---|
763 | symbol *lst = lastSuffix;
|
---|
764 | cellindex len = 0;
|
---|
765 | cellindex content_len = 0;
|
---|
766 |
|
---|
767 | // Calculate the length over which the phrases match
|
---|
768 | while((*fst == *lst) && (*fst > LASTDELIMITER)) {
|
---|
769 | len++;
|
---|
770 | if (*fst > lastStopSymbol) content_len = len;
|
---|
771 | fst++;
|
---|
772 | lst++;
|
---|
773 | }
|
---|
774 |
|
---|
775 | // in ANYPHRASE mode, expand the phrase for as long as possible
|
---|
776 | if (phraseMode == ANYPHRASE) {
|
---|
777 | if (len > length) {
|
---|
778 | increaseSuffixLength(len);
|
---|
779 | }
|
---|
780 | }
|
---|
781 | // in STOPWORDS mode, expand only as far as the last contentword
|
---|
782 | else {
|
---|
783 | if (content_len > length) {
|
---|
784 | increaseSuffixLength(content_len);
|
---|
785 | }
|
---|
786 | }
|
---|
787 |
|
---|
788 | uniqueSuffixExtension = 0;
|
---|
789 | return 0;
|
---|
790 | }
|
---|
791 |
|
---|
792 | int Phrase::expandWhileUniquePrefixExtension() {
|
---|
793 | ensurePrefixFound();
|
---|
794 |
|
---|
795 | // if the phrase occurs only once, do nothing
|
---|
796 | if (prefixFrequency < 2)
|
---|
797 | return 0;
|
---|
798 |
|
---|
799 | // if we already know there's no unique extension, do nothing
|
---|
800 | if (uniquePrefixExtension == 0)
|
---|
801 | return 0;
|
---|
802 |
|
---|
803 | // count the length over which the cells are the same
|
---|
804 | // and no delimiter is crossed.
|
---|
805 | symbol *fst = firstPrefix;
|
---|
806 | symbol *lst = lastPrefix;
|
---|
807 | cellindex len = 0;
|
---|
808 | cellindex content_len = 0;
|
---|
809 |
|
---|
810 | // Calculate the length over which the phrases match
|
---|
811 | while((*fst == *lst) && (*fst > LASTDELIMITER)) {
|
---|
812 | len++;
|
---|
813 | if (*fst > lastStopSymbol) content_len = len;
|
---|
814 | fst--;
|
---|
815 | lst--;
|
---|
816 | }
|
---|
817 |
|
---|
818 | // in ANYPHRASE mode, expand the phrase for as long as possible
|
---|
819 | if (phraseMode == ANYPHRASE) {
|
---|
820 | if (len > length) {
|
---|
821 | increasePrefixLength(len);
|
---|
822 | }
|
---|
823 | }
|
---|
824 | // in STOPWORDS mode, expand only as far as the last contentword
|
---|
825 | else {
|
---|
826 | if (content_len > length) {
|
---|
827 | increasePrefixLength(content_len);
|
---|
828 | }
|
---|
829 | }
|
---|
830 |
|
---|
831 | uniquePrefixExtension = 0;
|
---|
832 | return 0;
|
---|
833 | }
|
---|
834 |
|
---|
835 | // Compare the length of two phrases
|
---|
836 | //
|
---|
837 | // Given two phrases, return true if the first is shorter/longer,
|
---|
838 | // otherwise return false. For use in various sort functions.
|
---|
839 |
|
---|
840 | bool isShorter(Phrase p1, Phrase p2) {
|
---|
841 | if (p1.length < p2.length) {
|
---|
842 | return true;
|
---|
843 | }
|
---|
844 | return false;
|
---|
845 | }
|
---|
846 |
|
---|
847 | bool isLonger(Phrase p1, Phrase p2) {
|
---|
848 | if (p1.length > p2.length) {
|
---|
849 | return true;
|
---|
850 | }
|
---|
851 | return false;
|
---|
852 | }
|
---|
853 |
|
---|
854 |
|
---|