[1562] | 1 | /**********************************************************************
|
---|
| 2 | *
|
---|
| 3 | * phrase.cpp -- implementation of the phrase object used by suffix.cpp
|
---|
| 4 | *
|
---|
[1631] | 5 | * Copyright 2000 Gordon W. Paynter
|
---|
| 6 | * Copyright 2000 The New Zealand Digital Library Project
|
---|
[1562] | 7 | *
|
---|
| 8 | * A component of the Greenstone digital library software
|
---|
| 9 | * from the New Zealand Digital Library Project at the
|
---|
| 10 | * University of Waikato, New Zealand.
|
---|
| 11 | *
|
---|
| 12 | * This program is free software; you can redistribute it and/or modify
|
---|
| 13 | * it under the terms of the GNU General Public License as published by
|
---|
| 14 | * the Free Software Foundation; either version 2 of the License, or
|
---|
| 15 | * (at your option) any later version.
|
---|
| 16 | *
|
---|
| 17 | * This program is distributed in the hope that it will be useful,
|
---|
| 18 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
| 19 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
| 20 | * GNU General Public License for more details.
|
---|
| 21 | *
|
---|
| 22 | * You should have received a copy of the GNU General Public License
|
---|
| 23 | * along with this program; if not, write to the Free Software
|
---|
| 24 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
| 25 | *
|
---|
| 26 | *********************************************************************/
|
---|
| 27 |
|
---|
| 28 | #include <iostream.h>
|
---|
| 29 | #include <assert.h>
|
---|
| 30 | #include <vector.h>
|
---|
| 31 | #include <stdio.h>
|
---|
| 32 |
|
---|
| 33 | #include "suffix.h"
|
---|
| 34 | #include "phrase.h"
|
---|
| 35 |
|
---|
| 36 | // Phrase constructor functions
|
---|
| 37 |
|
---|
| 38 | Phrase::Phrase(symbol *words, cellcount size, int direction) {
|
---|
| 39 | empty();
|
---|
| 40 | length = size;
|
---|
| 41 |
|
---|
| 42 | if (direction == SUFFIX) {
|
---|
| 43 | forward = words;
|
---|
| 44 | back = forward + size - 1;
|
---|
| 45 | } else {
|
---|
| 46 | back = words;
|
---|
| 47 | forward = back - size + 1;
|
---|
| 48 | }
|
---|
| 49 | }
|
---|
| 50 |
|
---|
| 51 | Phrase::Phrase() {
|
---|
| 52 | empty();
|
---|
| 53 | }
|
---|
| 54 |
|
---|
| 55 |
|
---|
| 56 | // Empty the contents of a phrase
|
---|
| 57 |
|
---|
| 58 | int Phrase::empty() {
|
---|
| 59 |
|
---|
| 60 | forward = back = NULL;
|
---|
| 61 | length = 0;
|
---|
| 62 |
|
---|
| 63 | suffixFound = prefixFound = 0;
|
---|
| 64 | firstSuffix = firstPrefix = NULL;
|
---|
| 65 | lastSuffix = lastPrefix = NULL;
|
---|
| 66 | suffixFrequency = prefixFrequency = 0;
|
---|
| 67 | uniqueSuffixExtension = uniquePrefixExtension = -1;
|
---|
| 68 |
|
---|
| 69 | return 0;
|
---|
| 70 | }
|
---|
| 71 |
|
---|
| 72 | // Remove the details of the location where a suffix/prefix
|
---|
| 73 | // is found in the array.
|
---|
| 74 |
|
---|
| 75 | int Phrase::clearSuffix() {
|
---|
| 76 | suffixFound = 0;
|
---|
| 77 | firstSuffix = NULL;
|
---|
| 78 | lastSuffix = NULL;
|
---|
| 79 | suffixFrequency = 0;
|
---|
| 80 | uniqueSuffixExtension = -1;
|
---|
| 81 | return 0;
|
---|
| 82 | }
|
---|
| 83 |
|
---|
| 84 | int Phrase::clearPrefix() {
|
---|
| 85 | prefixFound = 0;
|
---|
| 86 | firstPrefix = NULL;
|
---|
| 87 | lastPrefix = NULL;
|
---|
| 88 | prefixFrequency = 0;
|
---|
| 89 | uniquePrefixExtension = -1;
|
---|
| 90 | return 0;
|
---|
| 91 | }
|
---|
| 92 |
|
---|
| 93 | // Increase the length of s phrase
|
---|
| 94 | //
|
---|
| 95 | // We increase the length of a string "in place". When we expand
|
---|
| 96 | // the suffix, however, we invalidate the prefix data, and vice-versa.
|
---|
| 97 |
|
---|
| 98 | int Phrase::increaseSuffixLength(cellcount l) {
|
---|
| 99 | length = l;
|
---|
| 100 | clearPrefix();
|
---|
| 101 | back = forward + l - 1;
|
---|
| 102 | return 0;
|
---|
| 103 | }
|
---|
| 104 |
|
---|
| 105 | int Phrase::increasePrefixLength(cellcount l) {
|
---|
| 106 | length = l;
|
---|
| 107 | clearSuffix();
|
---|
| 108 | forward = back - l + 1;
|
---|
| 109 | return 0;
|
---|
| 110 | }
|
---|
| 111 |
|
---|
| 112 |
|
---|
| 113 | // Convert the phrase to a string
|
---|
| 114 | char *Phrase::toString() {
|
---|
| 115 |
|
---|
| 116 | assert(forward);
|
---|
| 117 |
|
---|
| 118 | char *str;
|
---|
| 119 | str = new char[length*20];
|
---|
| 120 | symbol *s = forward;
|
---|
| 121 | sprintf(str, "s%d", *s++);
|
---|
| 122 |
|
---|
| 123 | for (cellcount i = 1; i < length; i++) {
|
---|
| 124 | sprintf(str, "%s s%d", str, *s++);
|
---|
| 125 | }
|
---|
| 126 |
|
---|
| 127 | return str;
|
---|
| 128 | }
|
---|
| 129 |
|
---|
| 130 |
|
---|
| 131 | // Compare a phrase to an another array of symbols
|
---|
| 132 | //
|
---|
| 133 | // Given an array of words with a specified length,
|
---|
| 134 | // compare those words to this phrase.
|
---|
| 135 | //
|
---|
| 136 | // Return 0 if the two phrases are the same over length length
|
---|
| 137 | // Return 1 if the phrase is "greater" than tyhe words, or
|
---|
| 138 | // Return -1 if the phrase is less.
|
---|
| 139 |
|
---|
| 140 | int Phrase::compareSuffix(symbol *words, cellcount length) {
|
---|
| 141 | assert(forward);
|
---|
| 142 | symbol *p = forward;
|
---|
| 143 |
|
---|
| 144 | for (cellcount i = 0; i < length; i++) {
|
---|
| 145 | if (*p > *words) {
|
---|
| 146 | return 1;
|
---|
| 147 | } else if (*p < *words) {
|
---|
| 148 | return -1;
|
---|
| 149 | } else {
|
---|
| 150 | *p++;
|
---|
| 151 | *words++;
|
---|
| 152 | }
|
---|
| 153 | }
|
---|
| 154 |
|
---|
| 155 | return 0;
|
---|
| 156 |
|
---|
| 157 | }
|
---|
| 158 |
|
---|
| 159 | int Phrase::comparePrefix(symbol *words, cellcount length) {
|
---|
| 160 |
|
---|
| 161 | assert(back);
|
---|
| 162 | symbol *p = back;
|
---|
| 163 |
|
---|
| 164 | for (cellcount i = 0; i < length; i++) {
|
---|
| 165 | if (*p > *words) {
|
---|
| 166 | return -1;
|
---|
| 167 | } else if (*p < *words) {
|
---|
| 168 | return 1;
|
---|
| 169 | } else {
|
---|
| 170 | *p--;
|
---|
| 171 | *words--;
|
---|
| 172 | }
|
---|
| 173 | }
|
---|
| 174 |
|
---|
| 175 | return 0;
|
---|
| 176 |
|
---|
| 177 | }
|
---|
| 178 |
|
---|
| 179 |
|
---|
| 180 | // Does the phrase have a unique suffix/prefix extension?
|
---|
| 181 | //
|
---|
[1873] | 182 | // For suffix:
|
---|
| 183 | // If uniqueSuffixExtensions is 1, then phrase has some unique expansion.
|
---|
| 184 | // If it is 0 then it does not. If it is -1 then we don't know, and have
|
---|
| 185 | // to calculate it from the suffix array. Same goes for prefixes.
|
---|
[1562] | 186 |
|
---|
| 187 | int Phrase::hasUniqueSuffixExtension() {
|
---|
| 188 |
|
---|
| 189 | // if we don't know, then work it out
|
---|
| 190 | if (uniqueSuffixExtension == -1) {
|
---|
| 191 |
|
---|
[1873] | 192 | ensureSuffixFound();
|
---|
[1562] | 193 |
|
---|
| 194 | // pointers to the phrase's first and last occurances in te suffixArray
|
---|
| 195 | symbol *fst = firstSuffix + length;
|
---|
| 196 | symbol *lst = lastSuffix + length;
|
---|
| 197 |
|
---|
| 198 | // in ANYPHRASE mode, simply check the next symbol
|
---|
| 199 | if (phraseMode == ANYPHRASE) {
|
---|
| 200 | if ((*fst == *lst) && (*fst > LASTDELIMITER)) {
|
---|
| 201 | uniqueSuffixExtension = 1;
|
---|
| 202 | } else {
|
---|
| 203 | uniqueSuffixExtension = 0;
|
---|
| 204 | }
|
---|
| 205 | }
|
---|
| 206 | // in STOPWORDS mode, make sure there is a unique next content symbol
|
---|
| 207 | else {
|
---|
| 208 | uniqueSuffixExtension = 0;
|
---|
| 209 | while ((*fst == *lst) && (*fst > LASTDELIMITER)) {
|
---|
| 210 | if (*fst >= firstContentSymbol) {
|
---|
| 211 | uniqueSuffixExtension = 1;
|
---|
| 212 | break;
|
---|
| 213 | }
|
---|
| 214 | fst++;
|
---|
| 215 | lst++;
|
---|
| 216 | }
|
---|
| 217 | }
|
---|
| 218 | }
|
---|
| 219 |
|
---|
| 220 | return uniqueSuffixExtension;
|
---|
| 221 | }
|
---|
| 222 |
|
---|
| 223 | int Phrase::hasUniquePrefixExtension() {
|
---|
| 224 |
|
---|
| 225 | // if we don't know, then work it out
|
---|
| 226 | if (uniquePrefixExtension == -1) {
|
---|
| 227 |
|
---|
| 228 | ensurePrefixFound();
|
---|
| 229 |
|
---|
[1873] | 230 | // pointers to the phrase's first and last occurances in the prefixArray
|
---|
[1562] | 231 | symbol *fst = firstPrefix - length;
|
---|
| 232 | symbol *lst = lastPrefix - length;
|
---|
| 233 |
|
---|
| 234 | // in ANYPHRASE mode, simply check the next symbol
|
---|
| 235 | if (phraseMode == ANYPHRASE) {
|
---|
| 236 | if ((*fst == *lst) && (*fst > LASTDELIMITER)) {
|
---|
| 237 | uniquePrefixExtension = 1;
|
---|
| 238 | } else {
|
---|
| 239 | uniquePrefixExtension = 0;
|
---|
| 240 | }
|
---|
| 241 | }
|
---|
| 242 | // in STOPWORDS mode, make sure there is a unique next content symbol
|
---|
| 243 | else {
|
---|
| 244 | uniquePrefixExtension = 0;
|
---|
| 245 | while ((*fst == *lst) && (*fst > LASTDELIMITER)) {
|
---|
| 246 | if (*fst >= firstContentSymbol) {
|
---|
| 247 | uniquePrefixExtension = 1;
|
---|
| 248 | break;
|
---|
| 249 | }
|
---|
| 250 | fst--;
|
---|
| 251 | lst--;
|
---|
| 252 | }
|
---|
| 253 | }
|
---|
| 254 | }
|
---|
| 255 |
|
---|
| 256 | return uniquePrefixExtension;
|
---|
| 257 | }
|
---|
| 258 |
|
---|
| 259 |
|
---|
[1873] | 260 | // Expand a phrase with a unique suffix/prefix by 1 symbol
|
---|
[1562] | 261 | //
|
---|
| 262 | // Note that in STOPWORDS mode a "unique extension" means a unique
|
---|
| 263 | // extension that ends on a content symbol. Although we may extend
|
---|
| 264 | // such a phrase by 2 or more symbols, we will only extend it by a
|
---|
| 265 | // single content symbol.
|
---|
| 266 |
|
---|
| 267 | int Phrase::expandUniqueSuffixExtensionByOne() {
|
---|
| 268 | assert(suffixFound);
|
---|
| 269 | assert(uniqueSuffixExtension == 1);
|
---|
| 270 |
|
---|
| 271 | // in ANYPHRASE mode, expand the phrase by one symbol
|
---|
| 272 | if (phraseMode == ANYPHRASE) {
|
---|
| 273 | increaseSuffixLength(length + 1);
|
---|
| 274 | }
|
---|
| 275 | // in STOPWORDS mode, expand the phrase up to the next content symbol
|
---|
| 276 | else {
|
---|
| 277 | symbol *next = firstSuffix + length;
|
---|
| 278 | cellcount newlength = length + 1;
|
---|
| 279 |
|
---|
| 280 | while (*next < firstContentSymbol) {
|
---|
| 281 | next++;
|
---|
| 282 | newlength++;
|
---|
| 283 | }
|
---|
| 284 | increaseSuffixLength(newlength);
|
---|
| 285 | }
|
---|
| 286 |
|
---|
| 287 | uniqueSuffixExtension = -1;
|
---|
| 288 | return 0;
|
---|
| 289 | }
|
---|
| 290 |
|
---|
| 291 | int Phrase::expandUniquePrefixExtensionByOne() {
|
---|
| 292 | assert(prefixFound);
|
---|
| 293 | assert(uniquePrefixExtension == 1);
|
---|
| 294 |
|
---|
| 295 | // in ANYPHRASE mode, expand the phrase by one symbol
|
---|
| 296 | if (phraseMode == ANYPHRASE) {
|
---|
| 297 | increasePrefixLength(length + 1);
|
---|
| 298 | }
|
---|
| 299 | // in STOPWORDS mode, expand the phrase to the next content symbol
|
---|
| 300 | else {
|
---|
| 301 | symbol *next = firstPrefix - length;
|
---|
| 302 | cellcount newlength = length + 1;
|
---|
| 303 | while (*next < firstContentSymbol) {
|
---|
| 304 | next--;
|
---|
| 305 | newlength++;
|
---|
| 306 | }
|
---|
| 307 | increasePrefixLength(newlength);
|
---|
| 308 |
|
---|
| 309 | }
|
---|
| 310 |
|
---|
| 311 | uniquePrefixExtension = -1;
|
---|
| 312 | return 0;
|
---|
| 313 | }
|
---|
| 314 |
|
---|
| 315 |
|
---|
| 316 | // Find the phrase in a suffixArray
|
---|
| 317 | //
|
---|
| 318 | // Given a suffix array, find the first and last occurances of the phrase.
|
---|
| 319 | // The begin and end parameters identify which cells to search (inclusive).
|
---|
| 320 |
|
---|
| 321 | int Phrase::findFirstAndLastSuffix() {
|
---|
| 322 | return findFirstAndLastSuffix(0, inputLength-1);
|
---|
| 323 | }
|
---|
| 324 |
|
---|
| 325 | int Phrase::findFirstAndLastSuffix(cellindex begin, cellindex end) {
|
---|
| 326 |
|
---|
| 327 | // First find *any* occurance of the phrase
|
---|
| 328 | assert(begin <= end);
|
---|
| 329 |
|
---|
| 330 | // cout << "findFirstAndLastSuffix (" << begin << " - " << end << ") : " << toString() << endl;
|
---|
| 331 |
|
---|
| 332 | // if we're only searching one cell, it is very easy
|
---|
| 333 | if (begin == end) {
|
---|
| 334 | assert(compareSuffix(suffixArray[begin], length) == 0);
|
---|
| 335 | firstSuffix = lastSuffix = suffixArray[begin];
|
---|
| 336 | firstSuffixIndex = lastSuffixIndex = begin;
|
---|
| 337 | suffixFrequency = 1;
|
---|
| 338 | suffixFound = 1;
|
---|
| 339 | return 0;
|
---|
| 340 | }
|
---|
| 341 |
|
---|
| 342 | cellindex c;
|
---|
| 343 | int cmp;
|
---|
| 344 |
|
---|
| 345 | do {
|
---|
| 346 | // cout << "Find anywhere in " << begin << " - " << end << endl;
|
---|
| 347 | c = (end + begin) / 2;
|
---|
| 348 | cmp = compareSuffix(suffixArray[c], length);
|
---|
| 349 | if (cmp == 1) {
|
---|
| 350 | // target phrase is lower than phrase at suffixArray[c]
|
---|
| 351 | begin = c + 1;
|
---|
| 352 | } else if (cmp == -1) {
|
---|
| 353 | // target phrase is higher than phrase at suffixArray[c]
|
---|
| 354 | end = c - 1;
|
---|
| 355 | }
|
---|
| 356 | } while (cmp != 0);
|
---|
| 357 |
|
---|
| 358 | cellindex lastbegin = c;
|
---|
| 359 | cellindex lastend = end;
|
---|
| 360 |
|
---|
| 361 | // Next find the first occurance of the phrase.
|
---|
| 362 | // We know that the first occurance must be between the
|
---|
| 363 | // current value of begin and the current value of c.
|
---|
| 364 | end = c;
|
---|
| 365 |
|
---|
| 366 | do {
|
---|
| 367 | // cout << "first-searching with " << begin << " - " << end << endl;
|
---|
| 368 | if (begin == end) {
|
---|
| 369 | c = begin;
|
---|
| 370 | cmp = 0;
|
---|
| 371 | } else {
|
---|
| 372 | c = (begin + end) / 2;
|
---|
| 373 | cmp = compareSuffix(suffixArray[c], length);
|
---|
| 374 | if (cmp == 1) {
|
---|
| 375 | // target phrase is lower than phrase at suffixArray[c]
|
---|
| 376 | begin = c + 1;
|
---|
| 377 | } else {
|
---|
| 378 | assert(cmp == 0);
|
---|
| 379 | // target phrase is the same as the phrase at suffixArray[c]. However,
|
---|
| 380 | // to find the first occurance, suffixArray[c] must be the same as the
|
---|
| 381 | // phrase, but suffixArray[c-1] must be different.
|
---|
| 382 | cmp = compareSuffix(suffixArray[c-1], length);
|
---|
| 383 | if (cmp == 0) {
|
---|
| 384 | end = c - 1;
|
---|
| 385 | assert(end >= begin);
|
---|
| 386 | cmp = 1;
|
---|
| 387 | } else {
|
---|
| 388 | cmp = 0;
|
---|
| 389 | }
|
---|
| 390 | }
|
---|
| 391 | }
|
---|
| 392 | } while (cmp != 0);
|
---|
| 393 |
|
---|
| 394 | // we have found the first location
|
---|
| 395 | firstSuffixIndex = c;
|
---|
| 396 | firstSuffix = suffixArray[c];
|
---|
| 397 |
|
---|
| 398 | // Next find the last occurance of the phrase.
|
---|
| 399 | // We previously stored some bounds for its loccation.
|
---|
| 400 | begin = lastbegin;
|
---|
| 401 | end = lastend;
|
---|
| 402 |
|
---|
| 403 | do {
|
---|
| 404 | // cout << "last-searching with " << begin << " - " << end << endl;
|
---|
| 405 | if (begin == end) {
|
---|
| 406 | c = begin;
|
---|
| 407 | cmp = 0;
|
---|
| 408 | } else {
|
---|
| 409 | c = (begin + end) / 2;
|
---|
| 410 | cmp = compareSuffix(suffixArray[c], length);
|
---|
| 411 | if (cmp == -1) {
|
---|
| 412 | // target phrase is greater than phrase at suffixArray[c]
|
---|
| 413 | end = c - 1;
|
---|
| 414 | } else {
|
---|
| 415 | assert(cmp == 0);
|
---|
| 416 | // target phrase is the same as the phrase at suffixArray[c]. However,
|
---|
| 417 | // to find the last occurance, one of two additional condiditons must be met:
|
---|
| 418 | // either c must be the very last cell in the array, or
|
---|
| 419 | // suffixArray[c+1] must be different from phrase
|
---|
| 420 | if (c < inputLength-1) {
|
---|
| 421 | cmp = compareSuffix(suffixArray[c+1], length);
|
---|
| 422 | if (cmp == 0) {
|
---|
| 423 | begin = c + 1;
|
---|
| 424 | cmp = -1;
|
---|
| 425 | } else {
|
---|
| 426 | cmp = 0;
|
---|
| 427 | }
|
---|
| 428 | }
|
---|
| 429 | }
|
---|
| 430 | }
|
---|
| 431 | } while (cmp != 0);
|
---|
| 432 |
|
---|
| 433 | lastSuffixIndex = c;
|
---|
| 434 | lastSuffix = suffixArray[c];
|
---|
| 435 | suffixFrequency = lastSuffixIndex - firstSuffixIndex + 1;
|
---|
| 436 | suffixFound = 1;
|
---|
| 437 |
|
---|
| 438 | return 0;
|
---|
| 439 | }
|
---|
| 440 |
|
---|
| 441 |
|
---|
| 442 | // Find the phrase in a prefix array
|
---|
| 443 | //
|
---|
| 444 | // Given a prefix array, find the first and last occurances of the phrase.
|
---|
| 445 | // The begin and end parameters identify which cells to search (inclusive).
|
---|
| 446 |
|
---|
| 447 | int Phrase::findFirstAndLastPrefix() {
|
---|
| 448 | return findFirstAndLastPrefix(0, inputLength-1);
|
---|
| 449 | }
|
---|
| 450 |
|
---|
| 451 | int Phrase::findFirstAndLastPrefix(cellindex begin, cellindex end) {
|
---|
| 452 |
|
---|
| 453 | // First find *any* occurance of the phrase
|
---|
| 454 | assert(begin <= end);
|
---|
| 455 |
|
---|
| 456 | // cout << "findFirstAndLastPrefix (" << begin << " - " << end << ") : " << toString() << endl;
|
---|
| 457 |
|
---|
| 458 | // if we're only searching one cell, it is very easy
|
---|
| 459 | if (begin == end) {
|
---|
| 460 | assert(comparePrefix(prefixArray[begin], length) == 0);
|
---|
| 461 | firstPrefix = lastPrefix = prefixArray[begin];
|
---|
| 462 | firstPrefixIndex = lastPrefixIndex = begin;
|
---|
| 463 | prefixFrequency = 1;
|
---|
| 464 | prefixFound = 1;
|
---|
| 465 | return 0;
|
---|
| 466 | }
|
---|
| 467 |
|
---|
| 468 | cellindex c;
|
---|
| 469 | int cmp;
|
---|
| 470 |
|
---|
| 471 | do {
|
---|
| 472 | // cout << "Find anywhere in prefixarray " << begin << " - " << end << endl;
|
---|
| 473 | c = (end + begin) / 2;
|
---|
| 474 | cmp = comparePrefix(prefixArray[c], length);
|
---|
| 475 | if (cmp == 1) {
|
---|
| 476 | begin = c + 1;
|
---|
| 477 | } else if (cmp == -1) {
|
---|
| 478 | end = c - 1;
|
---|
| 479 | }
|
---|
| 480 | } while (cmp != 0);
|
---|
| 481 |
|
---|
| 482 | cellindex lastbegin = c;
|
---|
| 483 | cellindex lastend = end;
|
---|
| 484 |
|
---|
| 485 | // Next find the first occurance of the phrase.
|
---|
| 486 | // We know that the first occurance must be between the
|
---|
| 487 | // current value of begin and the cureent value of c.
|
---|
| 488 | end = c;
|
---|
| 489 |
|
---|
| 490 | do {
|
---|
| 491 | // cout << "first-prefix-searching with " << begin << " - " << end << endl;
|
---|
| 492 | if (begin == end) {
|
---|
| 493 | c = begin;
|
---|
| 494 | cmp = 0;
|
---|
| 495 | } else {
|
---|
| 496 | c = (begin + end) / 2;
|
---|
| 497 | cmp = comparePrefix(prefixArray[c], length);
|
---|
| 498 | // cout << "cmp: " << cmp << ", c: " << c << endl;
|
---|
| 499 | if (cmp == 1) {
|
---|
| 500 | // target phrase is lower than phrase at prefixArray[c]
|
---|
| 501 | begin = c + 1;
|
---|
| 502 | } else {
|
---|
| 503 | assert(cmp == 0);
|
---|
| 504 | // target phrase is the same as the phrase at prefixArray[c]. However, to
|
---|
| 505 | // find the first occurance, one of two additional condiditons must be met:
|
---|
| 506 | // either c == the first cell in prefix array
|
---|
| 507 | // or prefixArray[c-1] != phrase.
|
---|
| 508 | if (c > 0) {
|
---|
| 509 | cmp = comparePrefix(prefixArray[c-1], length);
|
---|
| 510 | if (cmp == 0) {
|
---|
| 511 | end = c - 1;
|
---|
| 512 | assert(end >= begin);
|
---|
| 513 | cmp = 1;
|
---|
| 514 | } else {
|
---|
| 515 | cmp = 0;
|
---|
| 516 | }
|
---|
| 517 | }
|
---|
| 518 | }
|
---|
| 519 | }
|
---|
| 520 | } while (cmp != 0);
|
---|
| 521 |
|
---|
| 522 | // we have found the first location
|
---|
| 523 | firstPrefixIndex = c;
|
---|
| 524 | firstPrefix = prefixArray[c];
|
---|
| 525 |
|
---|
| 526 | // Next find the last occurance of the phrase.
|
---|
| 527 | // We previously stored some bounds for its loccation.
|
---|
| 528 | begin = lastbegin;
|
---|
| 529 | end = lastend;
|
---|
| 530 |
|
---|
| 531 | do {
|
---|
| 532 | // cout << "last-prefix-searching with " << begin << " - " << end << endl;
|
---|
| 533 | if (begin == end) {
|
---|
| 534 | c = begin;
|
---|
| 535 | cmp = 0;
|
---|
| 536 | } else {
|
---|
| 537 | c = (begin + end) / 2;
|
---|
| 538 | cmp = comparePrefix(prefixArray[c], length);
|
---|
| 539 | if (cmp == -1) {
|
---|
| 540 | // target phrase is greater than phrase at prefixArray[c]
|
---|
| 541 | end = c - 1;
|
---|
| 542 | } else {
|
---|
| 543 | assert(cmp == 0);
|
---|
| 544 | // target phrase is the same as the phrase at prefixArray[c]. However,
|
---|
| 545 | // to find the last occurance, prefixArray[c] must be the same as the
|
---|
| 546 | // phrase, but prefixArray[c+1] must be different.
|
---|
| 547 | cmp = comparePrefix(prefixArray[c+1], length);
|
---|
| 548 | if (cmp == 0) {
|
---|
| 549 | begin = c + 1;
|
---|
| 550 | cmp = -1;
|
---|
| 551 | } else {
|
---|
| 552 | cmp = 0;
|
---|
| 553 | }
|
---|
| 554 | }
|
---|
| 555 | }
|
---|
| 556 | } while (cmp != 0);
|
---|
| 557 |
|
---|
| 558 | lastPrefixIndex = c;
|
---|
| 559 | lastPrefix = prefixArray[c];
|
---|
| 560 | prefixFrequency = lastPrefixIndex - firstPrefixIndex + 1;
|
---|
| 561 | prefixFound = 1;
|
---|
| 562 |
|
---|
| 563 | return 0;
|
---|
| 564 | }
|
---|
| 565 |
|
---|
| 566 |
|
---|
| 567 | // Ensure that the phrase has been found in the suffix & prefix arrays
|
---|
| 568 |
|
---|
| 569 | int Phrase::ensureSuffixFound() {
|
---|
| 570 | if (!suffixFound) {
|
---|
| 571 | findFirstAndLastSuffix();
|
---|
| 572 | }
|
---|
| 573 | return 0;
|
---|
| 574 | }
|
---|
| 575 |
|
---|
| 576 | int Phrase::ensurePrefixFound() {
|
---|
| 577 | if (!prefixFound) {
|
---|
| 578 | findFirstAndLastPrefix();
|
---|
| 579 | }
|
---|
| 580 | return 0;
|
---|
| 581 | }
|
---|
| 582 |
|
---|
| 583 |
|
---|
| 584 | // Calculate a set of initial suffix/prefix candidates
|
---|
| 585 | //
|
---|
| 586 | // Expand the phrase to find the initial
|
---|
| 587 | // set of candidates that may be extensions of that phrase,
|
---|
| 588 | // and add them to the end of the results vector
|
---|
| 589 |
|
---|
| 590 | int Phrase::initialSuffixCandidates(vector<Phrase> &results) {
|
---|
| 591 |
|
---|
| 592 | ensureSuffixFound();
|
---|
| 593 | Phrase next;
|
---|
| 594 | cellindex i = firstSuffixIndex;
|
---|
| 595 |
|
---|
| 596 | // Find all the expansions of Phrase p
|
---|
| 597 | while (i <= lastSuffixIndex) {
|
---|
| 598 |
|
---|
| 599 | // create new phrase exactly one longer than the current one
|
---|
| 600 | next = newPhraseShortestSuffixExpansion(i);
|
---|
| 601 |
|
---|
| 602 | // If the expansion occurs more than once and is not delimited, expand it
|
---|
| 603 | if ((*(next.back) > LASTDELIMITER) && (next.suffixFrequency >= 2)) {
|
---|
| 604 | next.expandWhileUniqueSuffixExtension();
|
---|
| 605 | results.push_back(next);
|
---|
| 606 | }
|
---|
| 607 |
|
---|
| 608 | // Move onto the next expansion
|
---|
| 609 | i = next.lastSuffixIndex + 1;
|
---|
| 610 |
|
---|
| 611 | }
|
---|
| 612 | return 0;
|
---|
| 613 | }
|
---|
| 614 |
|
---|
| 615 |
|
---|
| 616 | int Phrase::initialPrefixCandidates(vector<Phrase> &results) {
|
---|
| 617 |
|
---|
| 618 | ensurePrefixFound();
|
---|
| 619 | Phrase next;
|
---|
| 620 | cellindex i = firstPrefixIndex;
|
---|
| 621 |
|
---|
| 622 | // Find all the expansions of Phrase p
|
---|
| 623 | while (i <= lastPrefixIndex) {
|
---|
| 624 |
|
---|
| 625 | // create new phrase exactly one longer than the current one
|
---|
| 626 | next = newPhraseShortestPrefixExpansion(i);
|
---|
| 627 |
|
---|
| 628 | // If the expansion occurs more than once and is not delimited, expand it
|
---|
| 629 | if ((*(next.forward) > LASTDELIMITER) && (next.prefixFrequency >= 2)) {
|
---|
| 630 | next.expandWhileUniquePrefixExtension();
|
---|
| 631 | results.push_back(next);
|
---|
| 632 | }
|
---|
| 633 |
|
---|
| 634 | // Move onto the next expansion
|
---|
| 635 | i = next.lastPrefixIndex + 1;
|
---|
| 636 |
|
---|
| 637 | }
|
---|
| 638 | return 0;
|
---|
| 639 | }
|
---|
| 640 |
|
---|
| 641 |
|
---|
| 642 | // Create a new phrase that is longer then the current one, but as short
|
---|
| 643 | // as possible. In ANYPHRASE mode this means the new phrase is exactly
|
---|
| 644 | // one symbol longer than the current one, but in STOPWORD mode the expansion
|
---|
| 645 | // must introduce a new content symbol
|
---|
| 646 | //
|
---|
| 647 | // Note that we never expand across delimiters; if we encounter a delimiter
|
---|
| 648 | // then this becomes the "end" symbol of the phrase. The calling function
|
---|
| 649 | // should check for this condition.
|
---|
| 650 |
|
---|
| 651 | Phrase Phrase::newPhraseShortestSuffixExpansion(cellindex i) {
|
---|
| 652 |
|
---|
| 653 | // create a new phrase exactly one symbol longer than the current one
|
---|
| 654 | Phrase p(suffixArray[i], length + 1, SUFFIX);
|
---|
| 655 |
|
---|
| 656 | if (phraseMode == STOPWORDS) {
|
---|
| 657 | // in STOPWORDS mode we must introduce a new content symbol
|
---|
| 658 | while ((*(p.back) >= firstStopSymbol) &&
|
---|
| 659 | (*(p.back) <= lastStopSymbol)) {
|
---|
| 660 | p.increaseSuffixLength(p.length + 1);
|
---|
| 661 | }
|
---|
| 662 | }
|
---|
| 663 |
|
---|
| 664 | p.findFirstAndLastSuffix(i, lastSuffixIndex);
|
---|
| 665 | return p;
|
---|
| 666 | }
|
---|
| 667 |
|
---|
| 668 | Phrase Phrase::newPhraseShortestPrefixExpansion(cellindex i) {
|
---|
| 669 |
|
---|
| 670 | // create a new phrase exactly one symbol longer than the current one
|
---|
| 671 | Phrase p(prefixArray[i], length + 1, PREFIX);
|
---|
| 672 |
|
---|
| 673 | if (phraseMode == STOPWORDS) {
|
---|
| 674 | // in STOPWORDS mode we must introduce a new content symbol
|
---|
| 675 | while ((*(p.forward) >= firstStopSymbol) &&
|
---|
| 676 | (*(p.forward) <= lastStopSymbol)) {
|
---|
| 677 | p.increasePrefixLength(p.length + 1);
|
---|
| 678 | }
|
---|
| 679 | }
|
---|
| 680 |
|
---|
| 681 | p.findFirstAndLastPrefix(i, lastPrefixIndex);
|
---|
| 682 | return p;
|
---|
| 683 | }
|
---|
| 684 |
|
---|
| 685 |
|
---|
| 686 | // Expand a phrase until it no longer has a unique suffix extension
|
---|
| 687 | //
|
---|
| 688 | // If the phrase only occurs once in the suffix array, do nothing.
|
---|
| 689 | //
|
---|
| 690 | // If we are in stopwords mode then we expand if there is a unique
|
---|
| 691 | // suffix expansion and this expansion is a content word. We do this
|
---|
| 692 | // with the content_len variable, which holds the length of the longest
|
---|
| 693 | // phrase ending in a content word.
|
---|
| 694 |
|
---|
| 695 | int Phrase::expandWhileUniqueSuffixExtension() {
|
---|
| 696 | assert(forward);
|
---|
| 697 | assert(suffixFound);
|
---|
| 698 |
|
---|
| 699 | // if the phrase occurs only once, do nothing
|
---|
| 700 | if (suffixFrequency < 2) {
|
---|
| 701 | return 0;
|
---|
| 702 | }
|
---|
| 703 |
|
---|
| 704 | // count the length over which the cells are the same
|
---|
| 705 | // and no delimiter is crossed.
|
---|
| 706 | symbol *fst = firstSuffix;
|
---|
| 707 | symbol *lst = lastSuffix;
|
---|
| 708 | cellindex len = 0;
|
---|
| 709 | cellindex content_len = 0;
|
---|
| 710 |
|
---|
| 711 | // Calculate the length over which the phrases match
|
---|
| 712 | while((*fst == *lst) && (*fst > LASTDELIMITER)) {
|
---|
| 713 | len++;
|
---|
| 714 | if (*fst > lastStopSymbol) content_len = len;
|
---|
| 715 | fst++;
|
---|
| 716 | lst++;
|
---|
| 717 | }
|
---|
| 718 |
|
---|
| 719 | // in ANYPHRASE mode, expand the phrase for as long as possible
|
---|
| 720 | if (phraseMode == ANYPHRASE) {
|
---|
| 721 | if (len > length) {
|
---|
| 722 | increaseSuffixLength(len);
|
---|
| 723 | }
|
---|
| 724 | }
|
---|
| 725 | // in STOPWORDS mode, expand only as far as the last contentword
|
---|
| 726 | else {
|
---|
| 727 | if (content_len > length) {
|
---|
| 728 | increaseSuffixLength(content_len);
|
---|
| 729 | }
|
---|
| 730 | }
|
---|
| 731 |
|
---|
| 732 | uniqueSuffixExtension = 0;
|
---|
| 733 | return 0;
|
---|
| 734 | }
|
---|
| 735 |
|
---|
| 736 | int Phrase::expandWhileUniquePrefixExtension() {
|
---|
| 737 | assert(prefixFound);
|
---|
| 738 |
|
---|
| 739 | // if the phrase occurs only once, do nothing
|
---|
| 740 | if (prefixFrequency < 2) {
|
---|
| 741 | return 0;
|
---|
| 742 | }
|
---|
| 743 |
|
---|
| 744 | // count the length over which the cells are the same
|
---|
| 745 | // and no delimiter is crossed.
|
---|
| 746 | symbol *fst = firstPrefix;
|
---|
| 747 | symbol *lst = lastPrefix;
|
---|
| 748 | cellindex len = 0;
|
---|
| 749 | cellindex content_len = 0;
|
---|
| 750 |
|
---|
| 751 | // Calculate the length over which the phrases match
|
---|
| 752 | while((*fst == *lst) && (*fst > LASTDELIMITER)) {
|
---|
| 753 | len++;
|
---|
| 754 | if (*fst > lastStopSymbol) content_len = len;
|
---|
| 755 | fst--;
|
---|
| 756 | lst--;
|
---|
| 757 | }
|
---|
| 758 |
|
---|
| 759 | // in ANYPHRASE mode, expand the phrase for as long as possible
|
---|
| 760 | if (phraseMode == ANYPHRASE) {
|
---|
| 761 | if (len > length) {
|
---|
| 762 | increasePrefixLength(len);
|
---|
| 763 | }
|
---|
| 764 | }
|
---|
| 765 | // in STOPWORDS mode, expand only as far as the last contentword
|
---|
| 766 | else {
|
---|
| 767 | if (content_len > length) {
|
---|
| 768 | increasePrefixLength(content_len);
|
---|
| 769 | }
|
---|
| 770 | }
|
---|
| 771 |
|
---|
| 772 | uniquePrefixExtension = 0;
|
---|
| 773 | return 0;
|
---|
| 774 | }
|
---|
| 775 |
|
---|
| 776 |
|
---|
| 777 |
|
---|
| 778 |
|
---|
| 779 |
|
---|
| 780 |
|
---|
| 781 |
|
---|
| 782 |
|
---|
| 783 |
|
---|
| 784 |
|
---|
| 785 |
|
---|
| 786 |
|
---|
| 787 | // Compare the length of two phrases
|
---|
| 788 | //
|
---|
[1873] | 789 | // Given two phrases, return true if the first is shorter/longer,
|
---|
| 790 | // otherwise return false. For use in various sort functions.
|
---|
[1562] | 791 |
|
---|
| 792 | bool isShorter(Phrase p1, Phrase p2) {
|
---|
| 793 | if (p1.length < p2.length) {
|
---|
| 794 | return true;
|
---|
| 795 | }
|
---|
| 796 | return false;
|
---|
| 797 | }
|
---|
| 798 |
|
---|
[1873] | 799 | bool isLonger(Phrase p1, Phrase p2) {
|
---|
| 800 | if (p1.length > p2.length) {
|
---|
| 801 | return true;
|
---|
| 802 | }
|
---|
| 803 | return false;
|
---|
| 804 | }
|
---|
[1562] | 805 |
|
---|
[1873] | 806 |
|
---|