1 | /**********************************************************************
|
---|
2 | *
|
---|
3 | * phrase.cpp -- implementation of the phrase object used by suffix.cpp
|
---|
4 | *
|
---|
5 | * Copyright 2000 Gordon W. Paynter
|
---|
6 | * Copyright 2000 The New Zealand Digital Library Project
|
---|
7 | *
|
---|
8 | * A component of the Greenstone digital library software
|
---|
9 | * from the New Zealand Digital Library Project at the
|
---|
10 | * University of Waikato, New Zealand.
|
---|
11 | *
|
---|
12 | * This program is free software; you can redistribute it and/or modify
|
---|
13 | * it under the terms of the GNU General Public License as published by
|
---|
14 | * the Free Software Foundation; either version 2 of the License, or
|
---|
15 | * (at your option) any later version.
|
---|
16 | *
|
---|
17 | * This program is distributed in the hope that it will be useful,
|
---|
18 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
19 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
20 | * GNU General Public License for more details.
|
---|
21 | *
|
---|
22 | * You should have received a copy of the GNU General Public License
|
---|
23 | * along with this program; if not, write to the Free Software
|
---|
24 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
25 | *
|
---|
26 | *********************************************************************/
|
---|
27 |
|
---|
28 | #include <assert.h>
|
---|
29 | #include <stdio.h>
|
---|
30 |
|
---|
31 | #if defined(GSDL_USE_STL_H)
|
---|
32 | # include <vector.h>
|
---|
33 | #else
|
---|
34 | # include <vector>
|
---|
35 | #endif
|
---|
36 |
|
---|
37 | #if defined(GSDL_USE_IOS_H)
|
---|
38 | # include <iostream.h>
|
---|
39 | #else
|
---|
40 | # include <iostream>
|
---|
41 | #endif
|
---|
42 |
|
---|
43 | #include "suffix.h"
|
---|
44 | #include "phrase.h"
|
---|
45 |
|
---|
46 |
|
---|
47 | // Phrase constructor functions
|
---|
48 |
|
---|
49 | Phrase::Phrase(symbol *words, cellcount size, int direction) {
|
---|
50 | empty();
|
---|
51 | length = size;
|
---|
52 |
|
---|
53 | if (direction == SUFFIX) {
|
---|
54 | forward = words;
|
---|
55 | back = forward + size - 1;
|
---|
56 | } else {
|
---|
57 | back = words;
|
---|
58 | forward = back - size + 1;
|
---|
59 | }
|
---|
60 | }
|
---|
61 |
|
---|
62 |
|
---|
63 | Phrase::Phrase() {
|
---|
64 | empty();
|
---|
65 | }
|
---|
66 |
|
---|
67 |
|
---|
68 | Phrase::Phrase(const Phrase &p) {
|
---|
69 | forward = p.forward;
|
---|
70 | back = p.back;
|
---|
71 | length = p.length;
|
---|
72 |
|
---|
73 | suffixFound = p.suffixFound;
|
---|
74 | prefixFound = p.prefixFound;
|
---|
75 |
|
---|
76 | firstSuffix = p.firstSuffix;
|
---|
77 | lastSuffix = p.lastSuffix;
|
---|
78 | firstSuffixIndex = p.firstSuffixIndex;
|
---|
79 | lastSuffixIndex = p.lastSuffixIndex;
|
---|
80 | suffixFrequency = p.suffixFrequency;
|
---|
81 |
|
---|
82 | firstPrefix = p.firstPrefix;
|
---|
83 | lastPrefix = p.lastPrefix;
|
---|
84 | firstPrefixIndex = p.firstPrefixIndex;
|
---|
85 | lastPrefixIndex = p.lastPrefixIndex;
|
---|
86 | prefixFrequency = p.prefixFrequency;
|
---|
87 |
|
---|
88 | uniqueSuffixExtension = p.uniqueSuffixExtension;
|
---|
89 | uniquePrefixExtension = p.uniquePrefixExtension;
|
---|
90 | }
|
---|
91 |
|
---|
92 |
|
---|
93 | // Empty the contents of a phrase
|
---|
94 |
|
---|
95 | int Phrase::empty() {
|
---|
96 |
|
---|
97 | forward = back = NULL;
|
---|
98 | length = 0;
|
---|
99 |
|
---|
100 | suffixFound = prefixFound = 0;
|
---|
101 | firstSuffix = firstPrefix = NULL;
|
---|
102 | lastSuffix = lastPrefix = NULL;
|
---|
103 |
|
---|
104 | firstSuffixIndex = lastSuffixIndex = suffixFrequency = 0;
|
---|
105 | firstPrefixIndex = lastPrefixIndex = prefixFrequency = 0;
|
---|
106 |
|
---|
107 | uniqueSuffixExtension = uniquePrefixExtension = -1;
|
---|
108 |
|
---|
109 | return 0;
|
---|
110 | }
|
---|
111 |
|
---|
112 | // Remove the details of the location where a suffix/prefix
|
---|
113 | // is found in the array.
|
---|
114 |
|
---|
115 | int Phrase::clearSuffix() {
|
---|
116 | suffixFound = 0;
|
---|
117 | firstSuffix = lastSuffix = NULL;
|
---|
118 | firstSuffixIndex = lastSuffixIndex = suffixFrequency = 0;
|
---|
119 | uniqueSuffixExtension = -1;
|
---|
120 | return 0;
|
---|
121 | }
|
---|
122 |
|
---|
123 | int Phrase::clearPrefix() {
|
---|
124 | prefixFound = 0;
|
---|
125 | firstPrefix = lastPrefix = NULL;
|
---|
126 | firstPrefixIndex = lastPrefixIndex = prefixFrequency = 0;
|
---|
127 | uniquePrefixExtension = -1;
|
---|
128 | return 0;
|
---|
129 | }
|
---|
130 |
|
---|
131 | // Increase the length of s phrase
|
---|
132 | //
|
---|
133 | // We increase the length of a string "in place". When we expand
|
---|
134 | // the suffix, however, we invalidate the prefix data, and vice-versa.
|
---|
135 |
|
---|
136 | int Phrase::increaseSuffixLength(cellcount l) {
|
---|
137 | length = l;
|
---|
138 | clearPrefix();
|
---|
139 | back = forward + l - 1;
|
---|
140 | return 0;
|
---|
141 | }
|
---|
142 |
|
---|
143 | int Phrase::increasePrefixLength(cellcount l) {
|
---|
144 | length = l;
|
---|
145 | clearSuffix();
|
---|
146 | forward = back - l + 1;
|
---|
147 | return 0;
|
---|
148 | }
|
---|
149 |
|
---|
150 | // Shorten a phrase by one symbol
|
---|
151 | int Phrase::shortenByOneAtSuffix() {
|
---|
152 | --length;
|
---|
153 | --back;
|
---|
154 | if (phraseMode==STOPWORDS) {
|
---|
155 | while (*back >=firstStopSymbol && *back <= lastStopSymbol) {
|
---|
156 | --length;
|
---|
157 | --back;
|
---|
158 | }
|
---|
159 | }
|
---|
160 | clearSuffix();
|
---|
161 | clearPrefix();
|
---|
162 | return 0;
|
---|
163 | }
|
---|
164 |
|
---|
165 | int Phrase::shortenByOneAtPrefix() {
|
---|
166 | --length;
|
---|
167 | ++forward;
|
---|
168 | if (phraseMode==STOPWORDS) {
|
---|
169 | while (*forward >=firstStopSymbol && *forward <= lastStopSymbol) {
|
---|
170 | --length;
|
---|
171 | ++forward;
|
---|
172 | }
|
---|
173 | }
|
---|
174 | clearSuffix();
|
---|
175 | clearPrefix();
|
---|
176 | return 0;
|
---|
177 | }
|
---|
178 |
|
---|
179 |
|
---|
180 | // Output a phrase to a stream
|
---|
181 | ostream &operator<<(ostream &stream, const Phrase &phrase)
|
---|
182 | {
|
---|
183 | assert(phrase.forward);
|
---|
184 | symbol *s = phrase.forward;
|
---|
185 |
|
---|
186 | stream << "s" << *s++;
|
---|
187 | for (cellcount i = 1; i < phrase.length; ++i)
|
---|
188 | stream << " s" << *s++;
|
---|
189 |
|
---|
190 | return stream;
|
---|
191 | }
|
---|
192 |
|
---|
193 |
|
---|
194 | // Convert the phrase to a string
|
---|
195 | // Note thgat you have to delete the memory yourself.
|
---|
196 | char *Phrase::toString()
|
---|
197 | {
|
---|
198 | assert(forward);
|
---|
199 |
|
---|
200 | char *str;
|
---|
201 | str = new char[length*20];
|
---|
202 | symbol *s = forward;
|
---|
203 | sprintf(str, "s%d", *s++);
|
---|
204 |
|
---|
205 | for (cellcount i = 1; i < length; ++i) {
|
---|
206 | sprintf(str, "%s s%d", str, *s++);
|
---|
207 | }
|
---|
208 |
|
---|
209 | return str;
|
---|
210 | }
|
---|
211 |
|
---|
212 |
|
---|
213 | // Compare a phrase to an another array of symbols
|
---|
214 | //
|
---|
215 | // Given an array of words with a specified length,
|
---|
216 | // compare those words to this phrase.
|
---|
217 | //
|
---|
218 | // Return 0 if the two phrases are the same over length length
|
---|
219 | // Return 1 if the phrase is "greater" than tyhe words, or
|
---|
220 | // Return -1 if the phrase is less.
|
---|
221 |
|
---|
222 | int Phrase::compareSuffix(symbol *words, cellcount length) {
|
---|
223 | assert(forward);
|
---|
224 | symbol *p = forward;
|
---|
225 |
|
---|
226 | for (cellcount i = 0; i < length; ++i) {
|
---|
227 | if (*p > *words) {
|
---|
228 | return 1;
|
---|
229 | } else if (*p < *words) {
|
---|
230 | return -1;
|
---|
231 | } else {
|
---|
232 | *p++;
|
---|
233 | *words++;
|
---|
234 | }
|
---|
235 | }
|
---|
236 |
|
---|
237 | return 0;
|
---|
238 |
|
---|
239 | }
|
---|
240 |
|
---|
241 | int Phrase::comparePrefix(symbol *words, cellcount length) {
|
---|
242 |
|
---|
243 | assert(back);
|
---|
244 | symbol *p = back;
|
---|
245 |
|
---|
246 | for (cellcount i = 0; i < length; ++i) {
|
---|
247 | if (*p > *words) {
|
---|
248 | return -1;
|
---|
249 | } else if (*p < *words) {
|
---|
250 | return 1;
|
---|
251 | } else {
|
---|
252 | *p--;
|
---|
253 | *words--;
|
---|
254 | }
|
---|
255 | }
|
---|
256 |
|
---|
257 | return 0;
|
---|
258 |
|
---|
259 | }
|
---|
260 |
|
---|
261 |
|
---|
262 | // Does the phrase have a unique suffix/prefix extension?
|
---|
263 | //
|
---|
264 | // For suffix:
|
---|
265 | // If uniqueSuffixExtensions is 1, then phrase has some unique expansion.
|
---|
266 | // If it is 0 then it does not. If it is -1 then we don't know, and have
|
---|
267 | // to calculate it from the suffix array. Same goes for prefixes.
|
---|
268 |
|
---|
269 | int Phrase::hasUniqueSuffixExtension() {
|
---|
270 |
|
---|
271 | // if we don't know, then work it out
|
---|
272 | if (uniqueSuffixExtension == -1) {
|
---|
273 |
|
---|
274 | ensureSuffixFound();
|
---|
275 |
|
---|
276 | // pointers to the phrase's first and last occurances in te suffixArray
|
---|
277 | symbol *fst = firstSuffix + length;
|
---|
278 | symbol *lst = lastSuffix + length;
|
---|
279 |
|
---|
280 | // in ANYPHRASE mode, simply check the next symbol
|
---|
281 | if (phraseMode == ANYPHRASE) {
|
---|
282 | if ((*fst == *lst) && (*fst > LASTDELIMITER)) {
|
---|
283 | uniqueSuffixExtension = 1;
|
---|
284 | } else {
|
---|
285 | uniqueSuffixExtension = 0;
|
---|
286 | }
|
---|
287 | }
|
---|
288 | // in STOPWORDS mode, make sure there is a unique next content symbol
|
---|
289 | else {
|
---|
290 | uniqueSuffixExtension = 0;
|
---|
291 | while ((*fst == *lst) && (*fst > LASTDELIMITER)) {
|
---|
292 | if (*fst >= firstContentSymbol) {
|
---|
293 | uniqueSuffixExtension = 1;
|
---|
294 | break;
|
---|
295 | }
|
---|
296 | ++fst;
|
---|
297 | ++lst;
|
---|
298 | }
|
---|
299 | }
|
---|
300 | }
|
---|
301 |
|
---|
302 | return uniqueSuffixExtension;
|
---|
303 | }
|
---|
304 |
|
---|
305 | int Phrase::hasUniquePrefixExtension() {
|
---|
306 |
|
---|
307 | // if we don't know, then work it out
|
---|
308 | if (uniquePrefixExtension == -1) {
|
---|
309 |
|
---|
310 | ensurePrefixFound();
|
---|
311 |
|
---|
312 | // pointers to the phrase's first and last occurances in the prefixArray
|
---|
313 | symbol *fst = firstPrefix - length;
|
---|
314 | symbol *lst = lastPrefix - length;
|
---|
315 |
|
---|
316 | // in ANYPHRASE mode, simply check the next symbol
|
---|
317 | if (phraseMode == ANYPHRASE) {
|
---|
318 | if ((*fst == *lst) && (*fst > LASTDELIMITER)) {
|
---|
319 | uniquePrefixExtension = 1;
|
---|
320 | } else {
|
---|
321 | uniquePrefixExtension = 0;
|
---|
322 | }
|
---|
323 | }
|
---|
324 | // in STOPWORDS mode, make sure there is a unique next content symbol
|
---|
325 | else {
|
---|
326 | uniquePrefixExtension = 0;
|
---|
327 | while ((*fst == *lst) && (*fst > LASTDELIMITER)) {
|
---|
328 | if (*fst >= firstContentSymbol) {
|
---|
329 | uniquePrefixExtension = 1;
|
---|
330 | break;
|
---|
331 | }
|
---|
332 | --fst;
|
---|
333 | --lst;
|
---|
334 | }
|
---|
335 | }
|
---|
336 | }
|
---|
337 |
|
---|
338 | return uniquePrefixExtension;
|
---|
339 | }
|
---|
340 |
|
---|
341 |
|
---|
342 | // Expand a phrase with a unique suffix/prefix by 1 symbol
|
---|
343 | //
|
---|
344 | // Note that in STOPWORDS mode a "unique extension" means a unique
|
---|
345 | // extension that ends on a content symbol. Although we may extend
|
---|
346 | // such a phrase by 2 or more symbols, we will only extend it by a
|
---|
347 | // single content symbol.
|
---|
348 |
|
---|
349 | int Phrase::expandUniqueSuffixExtensionByOne() {
|
---|
350 | assert(suffixFound);
|
---|
351 | assert(uniqueSuffixExtension == 1);
|
---|
352 |
|
---|
353 | // in ANYPHRASE mode, expand the phrase by one symbol
|
---|
354 | if (phraseMode == ANYPHRASE) {
|
---|
355 | increaseSuffixLength(length + 1);
|
---|
356 | }
|
---|
357 | // in STOPWORDS mode, expand the phrase up to the next content symbol
|
---|
358 | else {
|
---|
359 | symbol *next = firstSuffix + length;
|
---|
360 | cellcount newlength = length + 1;
|
---|
361 |
|
---|
362 | while (*next < firstContentSymbol) {
|
---|
363 | ++next;
|
---|
364 | ++newlength;
|
---|
365 | }
|
---|
366 | increaseSuffixLength(newlength);
|
---|
367 | }
|
---|
368 |
|
---|
369 | uniqueSuffixExtension = -1;
|
---|
370 | return 0;
|
---|
371 | }
|
---|
372 |
|
---|
373 | int Phrase::expandUniquePrefixExtensionByOne() {
|
---|
374 | assert(prefixFound);
|
---|
375 | assert(uniquePrefixExtension == 1);
|
---|
376 |
|
---|
377 | // in ANYPHRASE mode, expand the phrase by one symbol
|
---|
378 | if (phraseMode == ANYPHRASE) {
|
---|
379 | increasePrefixLength(length + 1);
|
---|
380 | }
|
---|
381 | // in STOPWORDS mode, expand the phrase to the next content symbol
|
---|
382 | else {
|
---|
383 | symbol *next = firstPrefix - length;
|
---|
384 | cellcount newlength = length + 1;
|
---|
385 | while (*next < firstContentSymbol) {
|
---|
386 | --next;
|
---|
387 | ++newlength;
|
---|
388 | }
|
---|
389 | increasePrefixLength(newlength);
|
---|
390 |
|
---|
391 | }
|
---|
392 |
|
---|
393 | uniquePrefixExtension = -1;
|
---|
394 | return 0;
|
---|
395 | }
|
---|
396 |
|
---|
397 |
|
---|
398 | // Find the phrase in a suffixArray
|
---|
399 | //
|
---|
400 | // Given a suffix array, find the first and last occurances of the phrase.
|
---|
401 | // The begin and end parameters identify which cells to search (inclusive).
|
---|
402 |
|
---|
403 | int Phrase::findFirstAndLastSuffix() {
|
---|
404 |
|
---|
405 | return findFirstAndLastSuffix(0, inputLength-1);
|
---|
406 | }
|
---|
407 |
|
---|
408 | int Phrase::findFirstAndLastSuffix(cellindex begin, cellindex end) {
|
---|
409 |
|
---|
410 | // First find *any* occurance of the phrase
|
---|
411 | assert(begin <= end);
|
---|
412 |
|
---|
413 | // if we're only searching one cell, it is very easy
|
---|
414 | if (begin == end) {
|
---|
415 | assert(compareSuffix(suffixArray[begin], length) == 0);
|
---|
416 | firstSuffix = lastSuffix = suffixArray[begin];
|
---|
417 | firstSuffixIndex = lastSuffixIndex = begin;
|
---|
418 | suffixFrequency = 1;
|
---|
419 | suffixFound = 1;
|
---|
420 | return 0;
|
---|
421 | }
|
---|
422 |
|
---|
423 | cellindex c;
|
---|
424 | int cmp;
|
---|
425 |
|
---|
426 | do {
|
---|
427 | c = (end + begin) / 2;
|
---|
428 | cmp = compareSuffix(suffixArray[c], length);
|
---|
429 | if (cmp == 1) {
|
---|
430 | // target phrase is lower than phrase at suffixArray[c]
|
---|
431 | begin = c + 1;
|
---|
432 | } else if (cmp == -1) {
|
---|
433 | // target phrase is higher than phrase at suffixArray[c]
|
---|
434 | end = c - 1;
|
---|
435 | }
|
---|
436 | } while (cmp != 0);
|
---|
437 |
|
---|
438 | cellindex lastbegin = c;
|
---|
439 | cellindex lastend = end;
|
---|
440 |
|
---|
441 | // Next find the first occurance of the phrase.
|
---|
442 | // We know that the first occurance must be between the
|
---|
443 | // current value of begin and the current value of c.
|
---|
444 | end = c;
|
---|
445 |
|
---|
446 | do {
|
---|
447 | if (begin == end) {
|
---|
448 | c = begin;
|
---|
449 | cmp = 0;
|
---|
450 | } else {
|
---|
451 | c = (begin + end) / 2;
|
---|
452 | cmp = compareSuffix(suffixArray[c], length);
|
---|
453 | if (cmp == 1) {
|
---|
454 | // target phrase is lower than phrase at suffixArray[c]
|
---|
455 | begin = c + 1;
|
---|
456 | } else {
|
---|
457 | assert(cmp == 0);
|
---|
458 | // target phrase is the same as the phrase at suffixArray[c]. However,
|
---|
459 | // to find the first occurance, suffixArray[c] must be the same as the
|
---|
460 | // phrase, but suffixArray[c-1] must be different.
|
---|
461 | if (c>0) {
|
---|
462 | cmp = compareSuffix(suffixArray[c-1], length);
|
---|
463 | if (cmp == 0) {
|
---|
464 | end = c - 1;
|
---|
465 | assert(end >= begin);
|
---|
466 | cmp = 1;
|
---|
467 | } else {
|
---|
468 | cmp = 0;
|
---|
469 | }
|
---|
470 | }
|
---|
471 | }
|
---|
472 | }
|
---|
473 | } while (cmp != 0);
|
---|
474 |
|
---|
475 | // we have found the first location
|
---|
476 | firstSuffixIndex = c;
|
---|
477 | firstSuffix = suffixArray[c];
|
---|
478 |
|
---|
479 | // Next find the last occurance of the phrase.
|
---|
480 | // We previously stored some bounds for its loccation.
|
---|
481 | begin = lastbegin;
|
---|
482 | end = lastend;
|
---|
483 |
|
---|
484 | do {
|
---|
485 | if (begin == end) {
|
---|
486 | c = begin;
|
---|
487 | cmp = 0;
|
---|
488 | } else {
|
---|
489 | c = (begin + end) / 2;
|
---|
490 | cmp = compareSuffix(suffixArray[c], length);
|
---|
491 | if (cmp == -1) {
|
---|
492 | // target phrase is greater than phrase at suffixArray[c]
|
---|
493 | end = c - 1;
|
---|
494 | } else {
|
---|
495 | assert(cmp == 0);
|
---|
496 | // target phrase is the same as the phrase at suffixArray[c]. However,
|
---|
497 | // to find the last occurance, one of two additional condiditons must be met:
|
---|
498 | // either c must be the very last cell in the array, or
|
---|
499 | // suffixArray[c+1] must be different from phrase
|
---|
500 | if (c < inputLength-1) {
|
---|
501 | cmp = compareSuffix(suffixArray[c+1], length);
|
---|
502 | if (cmp == 0) {
|
---|
503 | begin = c + 1;
|
---|
504 | cmp = -1;
|
---|
505 | } else {
|
---|
506 | cmp = 0;
|
---|
507 | }
|
---|
508 | }
|
---|
509 | }
|
---|
510 | }
|
---|
511 | } while (cmp != 0);
|
---|
512 |
|
---|
513 | lastSuffixIndex = c;
|
---|
514 | lastSuffix = suffixArray[c];
|
---|
515 | suffixFrequency = lastSuffixIndex - firstSuffixIndex + 1;
|
---|
516 | suffixFound = 1;
|
---|
517 |
|
---|
518 | return 0;
|
---|
519 | }
|
---|
520 |
|
---|
521 |
|
---|
522 | // Find the phrase in a prefix array
|
---|
523 | //
|
---|
524 | // Given a prefix array, find the first and last occurances of the phrase.
|
---|
525 | // The begin and end parameters identify which cells to search (inclusive).
|
---|
526 |
|
---|
527 | int Phrase::findFirstAndLastPrefix() {
|
---|
528 | return findFirstAndLastPrefix(0, inputLength-1);
|
---|
529 | }
|
---|
530 |
|
---|
531 | int Phrase::findFirstAndLastPrefix(cellindex begin, cellindex end) {
|
---|
532 |
|
---|
533 | // First find *any* occurance of the phrase
|
---|
534 | assert(begin <= end);
|
---|
535 |
|
---|
536 | // if we're only searching one cell, it is very easy
|
---|
537 | if (begin == end) {
|
---|
538 | assert(comparePrefix(prefixArray[begin], length) == 0);
|
---|
539 | firstPrefix = lastPrefix = prefixArray[begin];
|
---|
540 | firstPrefixIndex = lastPrefixIndex = begin;
|
---|
541 | prefixFrequency = 1;
|
---|
542 | prefixFound = 1;
|
---|
543 | return 0;
|
---|
544 | }
|
---|
545 |
|
---|
546 | cellindex c;
|
---|
547 | int cmp;
|
---|
548 |
|
---|
549 | do {
|
---|
550 | c = (end + begin) / 2;
|
---|
551 | cmp = comparePrefix(prefixArray[c], length);
|
---|
552 | if (cmp == 1) {
|
---|
553 | begin = c + 1;
|
---|
554 | } else if (cmp == -1) {
|
---|
555 | end = c - 1;
|
---|
556 | }
|
---|
557 | } while (cmp != 0);
|
---|
558 |
|
---|
559 | cellindex lastbegin = c;
|
---|
560 | cellindex lastend = end;
|
---|
561 |
|
---|
562 | // Next find the first occurance of the phrase.
|
---|
563 | // We know that the first occurance must be between the
|
---|
564 | // current value of begin and the cureent value of c.
|
---|
565 | end = c;
|
---|
566 |
|
---|
567 | do {
|
---|
568 | if (begin == end) {
|
---|
569 | c = begin;
|
---|
570 | cmp = 0;
|
---|
571 | } else {
|
---|
572 | c = (begin + end) / 2;
|
---|
573 | cmp = comparePrefix(prefixArray[c], length);
|
---|
574 | if (cmp == 1) {
|
---|
575 | // target phrase is lower than phrase at prefixArray[c]
|
---|
576 | begin = c + 1;
|
---|
577 | } else {
|
---|
578 | assert(cmp == 0);
|
---|
579 | // target phrase is the same as the phrase at prefixArray[c]. However, to
|
---|
580 | // find the first occurance, one of two additional condiditons must be met:
|
---|
581 | // either c == the first cell in prefix array
|
---|
582 | // or prefixArray[c-1] != phrase.
|
---|
583 | if (c > 0) {
|
---|
584 | cmp = comparePrefix(prefixArray[c-1], length);
|
---|
585 | if (cmp == 0) {
|
---|
586 | end = c - 1;
|
---|
587 | assert(end >= begin);
|
---|
588 | cmp = 1;
|
---|
589 | } else {
|
---|
590 | cmp = 0;
|
---|
591 | }
|
---|
592 | }
|
---|
593 | }
|
---|
594 | }
|
---|
595 | } while (cmp != 0);
|
---|
596 |
|
---|
597 | // we have found the first location
|
---|
598 | firstPrefixIndex = c;
|
---|
599 | firstPrefix = prefixArray[c];
|
---|
600 |
|
---|
601 | // Next find the last occurance of the phrase.
|
---|
602 | // We previously stored some bounds for its loccation.
|
---|
603 | begin = lastbegin;
|
---|
604 | end = lastend;
|
---|
605 |
|
---|
606 | do {
|
---|
607 | if (begin == end) {
|
---|
608 | c = begin;
|
---|
609 | cmp = 0;
|
---|
610 | } else {
|
---|
611 | c = (begin + end) / 2;
|
---|
612 | cmp = comparePrefix(prefixArray[c], length);
|
---|
613 | if (cmp == -1) {
|
---|
614 | // target phrase is greater than phrase at prefixArray[c]
|
---|
615 | end = c - 1;
|
---|
616 | } else {
|
---|
617 | assert(cmp == 0);
|
---|
618 | // target phrase is the same as the phrase at prefixArray[c]. However,
|
---|
619 | // to find the last occurance, prefixArray[c] must be the same as the
|
---|
620 | // phrase, but prefixArray[c+1] must be different.
|
---|
621 | cmp = comparePrefix(prefixArray[c+1], length);
|
---|
622 | if (cmp == 0) {
|
---|
623 | begin = c + 1;
|
---|
624 | cmp = -1;
|
---|
625 | } else {
|
---|
626 | cmp = 0;
|
---|
627 | }
|
---|
628 | }
|
---|
629 | }
|
---|
630 | } while (cmp != 0);
|
---|
631 |
|
---|
632 | lastPrefixIndex = c;
|
---|
633 | lastPrefix = prefixArray[c];
|
---|
634 | prefixFrequency = lastPrefixIndex - firstPrefixIndex + 1;
|
---|
635 | prefixFound = 1;
|
---|
636 |
|
---|
637 | return 0;
|
---|
638 | }
|
---|
639 |
|
---|
640 |
|
---|
641 | // Calculate a set of initial suffix/prefix candidates
|
---|
642 | //
|
---|
643 | // Expand the phrase to find the initial
|
---|
644 | // set of candidates that may be extensions of that phrase,
|
---|
645 | // and add them to the end of the results vector
|
---|
646 |
|
---|
647 | void Phrase::initialSuffixCandidates(vector<Phrase> &results) {
|
---|
648 |
|
---|
649 | ensureSuffixFound();
|
---|
650 | Phrase next;
|
---|
651 | cellindex i = firstSuffixIndex;
|
---|
652 |
|
---|
653 | // Find all the expansions of Phrase p
|
---|
654 | while (i <= lastSuffixIndex) {
|
---|
655 |
|
---|
656 | // create new phrase exactly one longer than the current one
|
---|
657 | next = newPhraseShortestSuffixExpansion(i);
|
---|
658 |
|
---|
659 | // If the expansion occurs more than once and is not delimited, expand it
|
---|
660 | if ((*(next.back) > LASTDELIMITER) && (next.suffixFrequency >= minOccurs)) {
|
---|
661 | next.expandWhileUniqueSuffixExtension();
|
---|
662 | results.push_back(next);
|
---|
663 | }
|
---|
664 |
|
---|
665 | // Move onto the next expansion
|
---|
666 | i = next.lastSuffixIndex + 1;
|
---|
667 | }
|
---|
668 | }
|
---|
669 |
|
---|
670 |
|
---|
671 | void Phrase::initialPrefixCandidates(vector<Phrase> &results) {
|
---|
672 |
|
---|
673 | ensurePrefixFound();
|
---|
674 | Phrase next;
|
---|
675 | cellindex i = firstPrefixIndex;
|
---|
676 |
|
---|
677 | // Find all the expansions of Phrase p
|
---|
678 | while (i <= lastPrefixIndex) {
|
---|
679 |
|
---|
680 | // create new phrase exactly one longer than the current one
|
---|
681 | next = newPhraseShortestPrefixExpansion(i);
|
---|
682 |
|
---|
683 | // If the expansion occurs more than once and is not delimited, expand it
|
---|
684 | if ((*(next.forward) > LASTDELIMITER) && (next.prefixFrequency >= minOccurs)) {
|
---|
685 | next.expandWhileUniquePrefixExtension();
|
---|
686 | results.push_back(next);
|
---|
687 | }
|
---|
688 |
|
---|
689 | // Move onto the next expansion
|
---|
690 | i = next.lastPrefixIndex + 1;
|
---|
691 | }
|
---|
692 | }
|
---|
693 |
|
---|
694 |
|
---|
695 | // Create a new phrase that is longer then the current one, but as short
|
---|
696 | // as possible. In ANYPHRASE mode this means the new phrase is exactly
|
---|
697 | // one symbol longer than the current one, but in STOPWORD mode the expansion
|
---|
698 | // must introduce a new content symbol
|
---|
699 | //
|
---|
700 | // Note that we never expand across delimiters; if we encounter a delimiter
|
---|
701 | // then this becomes the "end" symbol of the phrase. The calling function
|
---|
702 | // should check for this condition.
|
---|
703 |
|
---|
704 | Phrase Phrase::newPhraseShortestSuffixExpansion(cellindex i) {
|
---|
705 |
|
---|
706 | // create a new phrase exactly one symbol longer than the current one
|
---|
707 | Phrase p(suffixArray[i], length + 1, SUFFIX);
|
---|
708 |
|
---|
709 | if (phraseMode == STOPWORDS) {
|
---|
710 | // in STOPWORDS mode we must introduce a new content symbol
|
---|
711 | while ((*(p.back) >= firstStopSymbol) &&
|
---|
712 | (*(p.back) <= lastStopSymbol)) {
|
---|
713 | p.increaseSuffixLength(p.length + 1);
|
---|
714 | }
|
---|
715 | }
|
---|
716 |
|
---|
717 | p.findFirstAndLastSuffix(i, lastSuffixIndex);
|
---|
718 | return p;
|
---|
719 | }
|
---|
720 |
|
---|
721 | Phrase Phrase::newPhraseShortestPrefixExpansion(cellindex i) {
|
---|
722 |
|
---|
723 | // create a new phrase exactly one symbol longer than the current one
|
---|
724 | Phrase p(prefixArray[i], length + 1, PREFIX);
|
---|
725 |
|
---|
726 | if (phraseMode == STOPWORDS) {
|
---|
727 | // in STOPWORDS mode we must introduce a new content symbol
|
---|
728 | while ((*(p.forward) >= firstStopSymbol) &&
|
---|
729 | (*(p.forward) <= lastStopSymbol)) {
|
---|
730 | p.increasePrefixLength(p.length + 1);
|
---|
731 | }
|
---|
732 | }
|
---|
733 |
|
---|
734 | p.findFirstAndLastPrefix(i, lastPrefixIndex);
|
---|
735 | return p;
|
---|
736 | }
|
---|
737 |
|
---|
738 |
|
---|
739 | // Expand a phrase until it no longer has a unique suffix extension
|
---|
740 | //
|
---|
741 | // If the phrase only occurs once in the suffix array, do nothing.
|
---|
742 | //
|
---|
743 | // If we are in stopwords mode then we expand if there is a unique
|
---|
744 | // suffix expansion and this expansion is a content word. We do this
|
---|
745 | // with the content_len variable, which holds the length of the longest
|
---|
746 | // phrase ending in a content word.
|
---|
747 |
|
---|
748 | int Phrase::expandWhileUniqueSuffixExtension() {
|
---|
749 | ensureSuffixFound();
|
---|
750 |
|
---|
751 | // if the phrase occurs only once, do nothing
|
---|
752 | if (suffixFrequency < minOccurs)
|
---|
753 | return 0;
|
---|
754 |
|
---|
755 | // if we already know there's no unique suffix extension, do nothing
|
---|
756 | if (uniqueSuffixExtension == 0)
|
---|
757 | return 0;
|
---|
758 |
|
---|
759 | // count the length over which the cells are the same
|
---|
760 | // and no delimiter is crossed.
|
---|
761 | symbol *fst = firstSuffix;
|
---|
762 | symbol *lst = lastSuffix;
|
---|
763 | cellindex len = 0;
|
---|
764 | cellindex content_len = 0;
|
---|
765 |
|
---|
766 | // Calculate the length over which the phrases match
|
---|
767 | while((*fst == *lst) && (*fst > LASTDELIMITER)) {
|
---|
768 | ++len;
|
---|
769 | if (*fst > lastStopSymbol) content_len = len;
|
---|
770 | ++fst;
|
---|
771 | ++lst;
|
---|
772 | }
|
---|
773 |
|
---|
774 | // in ANYPHRASE mode, expand the phrase for as long as possible
|
---|
775 | if (phraseMode == ANYPHRASE) {
|
---|
776 | if (len > length) {
|
---|
777 | increaseSuffixLength(len);
|
---|
778 | }
|
---|
779 | }
|
---|
780 | // in STOPWORDS mode, expand only as far as the last contentword
|
---|
781 | else {
|
---|
782 | if (content_len > length) {
|
---|
783 | increaseSuffixLength(content_len);
|
---|
784 | }
|
---|
785 | }
|
---|
786 |
|
---|
787 | uniqueSuffixExtension = 0;
|
---|
788 | return 0;
|
---|
789 | }
|
---|
790 |
|
---|
791 | int Phrase::expandWhileUniquePrefixExtension() {
|
---|
792 | ensurePrefixFound();
|
---|
793 |
|
---|
794 | // if the phrase occurs only once, do nothing
|
---|
795 | if (prefixFrequency < minOccurs)
|
---|
796 | return 0;
|
---|
797 |
|
---|
798 | // if we already know there's no unique extension, do nothing
|
---|
799 | if (uniquePrefixExtension == 0)
|
---|
800 | return 0;
|
---|
801 |
|
---|
802 | // count the length over which the cells are the same
|
---|
803 | // and no delimiter is crossed.
|
---|
804 | symbol *fst = firstPrefix;
|
---|
805 | symbol *lst = lastPrefix;
|
---|
806 | cellindex len = 0;
|
---|
807 | cellindex content_len = 0;
|
---|
808 |
|
---|
809 | // Calculate the length over which the phrases match
|
---|
810 | while((*fst == *lst) && (*fst > LASTDELIMITER)) {
|
---|
811 | ++len;
|
---|
812 | if (*fst > lastStopSymbol) content_len = len;
|
---|
813 | --fst;
|
---|
814 | --lst;
|
---|
815 | }
|
---|
816 |
|
---|
817 | // in ANYPHRASE mode, expand the phrase for as long as possible
|
---|
818 | if (phraseMode == ANYPHRASE) {
|
---|
819 | if (len > length) {
|
---|
820 | increasePrefixLength(len);
|
---|
821 | }
|
---|
822 | }
|
---|
823 | // in STOPWORDS mode, expand only as far as the last contentword
|
---|
824 | else {
|
---|
825 | if (content_len > length) {
|
---|
826 | increasePrefixLength(content_len);
|
---|
827 | }
|
---|
828 | }
|
---|
829 |
|
---|
830 | uniquePrefixExtension = 0;
|
---|
831 | return 0;
|
---|
832 | }
|
---|
833 |
|
---|
834 | // Compare the length of two phrases
|
---|
835 | //
|
---|
836 | // Given two phrases, return true if the first is shorter/longer,
|
---|
837 | // otherwise return false. For use in various sort functions.
|
---|
838 |
|
---|
839 | bool isShorter(Phrase p1, Phrase p2) {
|
---|
840 | if (p1.length < p2.length) {
|
---|
841 | return true;
|
---|
842 | }
|
---|
843 | return false;
|
---|
844 | }
|
---|
845 |
|
---|
846 | bool isLonger(Phrase p1, Phrase p2) {
|
---|
847 | if (p1.length > p2.length) {
|
---|
848 | return true;
|
---|
849 | }
|
---|
850 | return false;
|
---|
851 | }
|
---|
852 |
|
---|
853 |
|
---|