[1562] | 1 | /**********************************************************************
|
---|
| 2 | *
|
---|
[2801] | 3 | * Suffix.cpp -- Extract the repeated phrases in the input with suffix
|
---|
| 4 | * and prefix arrays (cgn & gwp's simpler algorithm,
|
---|
| 5 | * and kjm's improvements).
|
---|
[1562] | 6 | *
|
---|
[1873] | 7 | * Copyright 2000 Gordon W. Paynter
|
---|
[1631] | 8 | * Copyright 2000 The New Zealand Digital Library Project
|
---|
[1562] | 9 | *
|
---|
| 10 | * A component of the Greenstone digital library software
|
---|
| 11 | * from the New Zealand Digital Library Project at the
|
---|
| 12 | * University of Waikato, New Zealand.
|
---|
| 13 | *
|
---|
| 14 | * This program is free software; you can redistribute it and/or modify
|
---|
| 15 | * it under the terms of the GNU General Public License as published by
|
---|
| 16 | * the Free Software Foundation; either version 2 of the License, or
|
---|
| 17 | * (at your option) any later version.
|
---|
| 18 | *
|
---|
| 19 | * This program is distributed in the hope that it will be useful,
|
---|
| 20 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
| 21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
| 22 | * GNU General Public License for more details.
|
---|
| 23 | *
|
---|
| 24 | * You should have received a copy of the GNU General Public License
|
---|
| 25 | * along with this program; if not, write to the Free Software
|
---|
| 26 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
| 27 | *
|
---|
| 28 | *********************************************************************/
|
---|
| 29 |
|
---|
| 30 | #include <assert.h>
|
---|
| 31 | #include <math.h>
|
---|
| 32 | #include <stdio.h>
|
---|
| 33 | #include <stdlib.h>
|
---|
| 34 | #include <string.h>
|
---|
| 35 |
|
---|
[2487] | 36 | #if defined(GSDL_USE_IOS_H)
|
---|
| 37 | # include <fstream.h>
|
---|
| 38 | # include <iostream.h>
|
---|
| 39 | #else
|
---|
| 40 | # include <fstream>
|
---|
| 41 | # include <iostream>
|
---|
| 42 | #endif
|
---|
[1562] | 43 |
|
---|
[2487] | 44 | #if defined(GSDL_USE_STL_H)
|
---|
| 45 | # if defined(GSDL_USE_ALGO_H)
|
---|
| 46 | # include <algo.h>
|
---|
| 47 | # else
|
---|
| 48 | # include <algorithm.h>
|
---|
| 49 | # endif
|
---|
| 50 | # include <vector.h>
|
---|
| 51 | #else
|
---|
| 52 | # include <algorithm>
|
---|
| 53 | # include <vector>
|
---|
| 54 | #endif
|
---|
| 55 |
|
---|
[1562] | 56 | #include "suffix.h"
|
---|
| 57 | #include "phrase.h"
|
---|
| 58 |
|
---|
| 59 | // Global variables declared in suffix.h
|
---|
| 60 | cellcount inputLength;
|
---|
| 61 |
|
---|
| 62 | symbol *symbols;
|
---|
| 63 | symbol **suffixArray;
|
---|
[2801] | 64 | symbol **prefixArray;
|
---|
[1562] | 65 | check *suffixCheck;
|
---|
| 66 |
|
---|
[2807] | 67 | // the length of the check array
|
---|
| 68 | cellcount checkLength;
|
---|
[1562] | 69 |
|
---|
| 70 | // How many documents are in this collection?
|
---|
| 71 | cellcount numberOfDocuments;
|
---|
| 72 | symbol **documentArray;
|
---|
| 73 |
|
---|
[2801] | 74 |
|
---|
[1562] | 75 | // Do we accept any phrase, or do we eliminate those ending with stopwords ?
|
---|
| 76 | int phraseMode = ANYPHRASE; //STOPWORDS;
|
---|
| 77 |
|
---|
[2801] | 78 |
|
---|
[1562] | 79 | // The filestem of the collection's phindex directory
|
---|
| 80 | char collection[FILENAME_MAX];
|
---|
| 81 |
|
---|
| 82 |
|
---|
| 83 | // The ranges of the stopword and content-word symbols for the collection
|
---|
| 84 | symbol firstStopSymbol = 0;
|
---|
| 85 | symbol lastStopSymbol = 0;
|
---|
| 86 | symbol firstContentSymbol = 0;
|
---|
| 87 | symbol lastContentSymbol = 0;
|
---|
| 88 |
|
---|
| 89 |
|
---|
[2801] | 90 | // Some useful comparison functions, defined below.
|
---|
| 91 | int suffixCompare(const void *, const void *);
|
---|
| 92 | int prefixCompare(const void *, const void *);
|
---|
| 93 | int pointerCompare(const void *, const void *);
|
---|
[1562] | 94 |
|
---|
[2807] | 95 | // some bit manipulation functions for the check arrays, defined below
|
---|
| 96 | int getSuffixCheck(cellindex suff);
|
---|
| 97 | void setSuffixCheck(cellindex suff);
|
---|
| 98 |
|
---|
[2801] | 99 | // Functions for implementing "phrase memory". These let us "remember"
|
---|
| 100 | // each phrase that we've expanded without using too much memory.
|
---|
[1562] | 101 | void initialisePhraseMemory();
|
---|
| 102 | void rememberThisPhrase(cellindex index, cellcount length);
|
---|
| 103 | bool isPhraseStored(cellindex index, cellcount length);
|
---|
| 104 | void deletePhraseMemory();
|
---|
| 105 |
|
---|
| 106 |
|
---|
[1618] | 107 | // how much output do we want?
|
---|
| 108 | int verbosity = 1;
|
---|
[1562] | 109 |
|
---|
[1618] | 110 |
|
---|
[2801] | 111 | // Get a phrase's expansions
|
---|
[1562] | 112 | //
|
---|
[2801] | 113 | // Get the set of "minimal", "maximal", non-unique expansions of a
|
---|
| 114 | // phrase p, using the simpler algorithm that Craig and Gordon came up
|
---|
| 115 | // with at Google.
|
---|
[1562] | 116 | //
|
---|
| 117 | // Returns a vector of Expansions.
|
---|
| 118 |
|
---|
[2801] | 119 | void getExpansions(Phrase &p, vector<Phrase> &results) {
|
---|
[1562] | 120 |
|
---|
[2801] | 121 | // 1. Get the initial candidates
|
---|
[1562] | 122 | vector<Phrase> candidates;
|
---|
| 123 | p.initialSuffixCandidates(candidates);
|
---|
[2801] | 124 | int suffcands = candidates.size();
|
---|
[1562] | 125 | p.initialPrefixCandidates(candidates);
|
---|
| 126 |
|
---|
[2801] | 127 | if (candidates.size() == 0)
|
---|
| 128 | return;
|
---|
[1562] | 129 |
|
---|
[2801] | 130 | vector<Phrase>::iterator i;
|
---|
| 131 | for (i = candidates.begin(); i != candidates.end(), suffcands>0; ++i, --suffcands) {
|
---|
| 132 | i->expandWhileUniquePrefixExtension();
|
---|
| 133 | i->ensureSuffixFound();
|
---|
| 134 | }
|
---|
| 135 | for (i; i != candidates.end(); ++i) {
|
---|
| 136 | i->expandWhileUniqueSuffixExtension();
|
---|
| 137 | }
|
---|
[1562] | 138 |
|
---|
[2801] | 139 | // 3. Ensure minimality: ignore phrases whose subphrases are also found
|
---|
| 140 | results.clear();
|
---|
[1562] | 141 |
|
---|
[2801] | 142 | // Initialise the candidates, check array, and various variables.
|
---|
| 143 | sort(candidates.begin(), candidates.end(), isShorter);
|
---|
[2807] | 144 | memset(suffixCheck, 0, sizeof(check)*checkLength);
|
---|
[2801] | 145 | unsigned minimum_length = candidates.begin()->length;
|
---|
| 146 |
|
---|
| 147 | // Try to add each candidate to the results set, ignoring the non-minimal
|
---|
| 148 | for (vector<Phrase>::iterator candidate = candidates.begin();
|
---|
| 149 | candidate != candidates.end(); candidate++) {
|
---|
[1562] | 150 |
|
---|
[2801] | 151 | // Make a copy of candidate to mutilate while performing sub-phrase checks
|
---|
| 152 | Phrase temp_phrase(*candidate);
|
---|
| 153 | bool shorter_found = false;
|
---|
| 154 |
|
---|
| 155 | // Check for shorter and shorter versions of the tenporary phrase
|
---|
| 156 | while (temp_phrase.length >= minimum_length && !shorter_found) {
|
---|
| 157 | temp_phrase.ensureSuffixFound();
|
---|
[2807] | 158 | //if (suffixCheck[temp_phrase.firstSuffixIndex] == 0)
|
---|
| 159 | if (getSuffixCheck(temp_phrase.firstSuffixIndex)==0)
|
---|
[2801] | 160 | temp_phrase.shortenByOneAtPrefix();
|
---|
| 161 | else
|
---|
| 162 | shorter_found = true;
|
---|
[1562] | 163 |
|
---|
[2801] | 164 | // Possible efficiency here: we can finish if the prefix of c
|
---|
| 165 | // and temp_phrase are the same for candidate->length symbols.
|
---|
[1562] | 166 | }
|
---|
| 167 |
|
---|
[2801] | 168 | if (!shorter_found) {
|
---|
| 169 | results.push_back(*candidate);
|
---|
| 170 | candidate->ensureSuffixFound();
|
---|
| 171 | for (cellcount k = candidate->firstSuffixIndex; k <= candidate->lastSuffixIndex; ++k)
|
---|
[2807] | 172 | //suffixCheck[k] = candidate->length;
|
---|
| 173 | setSuffixCheck(k);
|
---|
[1562] | 174 | }
|
---|
| 175 | }
|
---|
| 176 | }
|
---|
| 177 |
|
---|
| 178 |
|
---|
| 179 | // suffixCompare
|
---|
| 180 | //
|
---|
| 181 | // Compare two pointers into a suffix array. We use this in the
|
---|
| 182 | // qsort function, so the input are pointers to pointers.
|
---|
| 183 | //
|
---|
| 184 | // Return -1 if (a < b), otherwise (a > b) so return +1,
|
---|
| 185 |
|
---|
| 186 | int suffixCompare(const void *cpa, const void *cpb) {
|
---|
| 187 |
|
---|
| 188 | // Cast then dereference pointers to suffix array elements
|
---|
| 189 | symbol *pa = (symbol *) cpa;
|
---|
| 190 | symbol *pb = (symbol *) cpb;
|
---|
| 191 | pa = (symbol *) *pa;
|
---|
| 192 | pb = (symbol *) *pb;
|
---|
| 193 |
|
---|
| 194 | // If the two elements are the same, examine the next one
|
---|
| 195 | while (*pa == *pb) {
|
---|
| 196 | *pa++;
|
---|
| 197 | *pb++;
|
---|
| 198 | }
|
---|
| 199 |
|
---|
| 200 | // Make the copmparison and return
|
---|
| 201 | if ( *pa < *pb) {
|
---|
| 202 | return -1;
|
---|
| 203 | } else {
|
---|
| 204 | return +1;
|
---|
| 205 | }
|
---|
| 206 | }
|
---|
| 207 |
|
---|
| 208 |
|
---|
| 209 | // prefixCompare
|
---|
| 210 | //
|
---|
| 211 | // Compare two pointers into a prefix array. We use this in the
|
---|
| 212 | // qsort function, so the input are pointers to pointers.
|
---|
| 213 | //
|
---|
| 214 | // Return -1 if (a > b), otherwise (a < b) so return +1,
|
---|
| 215 |
|
---|
| 216 | int prefixCompare(const void *cpa, const void *cpb) {
|
---|
| 217 |
|
---|
| 218 | // Cast then dereference pointers to prefix array elements
|
---|
| 219 | symbol *pa = (symbol *) cpa;
|
---|
| 220 | symbol *pb = (symbol *) cpb;
|
---|
| 221 | pa = (symbol *) *pa;
|
---|
| 222 | pb = (symbol *) *pb;
|
---|
| 223 |
|
---|
| 224 | // If the two elements are the same, examine the next one
|
---|
| 225 | while (*pa == *pb) {
|
---|
| 226 | *pa--;
|
---|
| 227 | *pb--;
|
---|
| 228 | }
|
---|
| 229 |
|
---|
| 230 | // Make the copmparison and return
|
---|
| 231 | if ( *pa > *pb) {
|
---|
| 232 | return -1;
|
---|
| 233 | } else {
|
---|
| 234 | return +1;
|
---|
| 235 | }
|
---|
| 236 | }
|
---|
| 237 |
|
---|
| 238 | // simpleCompare
|
---|
| 239 | //
|
---|
| 240 | // Compare two pointers based on the memory location they point to.
|
---|
| 241 |
|
---|
| 242 | int pointerCompare( const void *pa, const void *pb ) {
|
---|
| 243 |
|
---|
| 244 | symbol **a = (symbol **) pa;
|
---|
| 245 | symbol **b = (symbol **) pb;
|
---|
| 246 |
|
---|
| 247 | if (*a < *b) {
|
---|
| 248 | return -1;
|
---|
| 249 | } else if (*a > *b) {
|
---|
| 250 | return 1;
|
---|
| 251 | } else {
|
---|
| 252 | return 0;
|
---|
| 253 | }
|
---|
| 254 | }
|
---|
| 255 |
|
---|
[2807] | 256 | int getSuffixCheck(cellindex suff) {
|
---|
| 257 | cellindex cell = suff >> 3;
|
---|
| 258 | check remainder = suff & 0x07; // the last 3 bits
|
---|
| 259 | if (suffixCheck[cell]& (1 << remainder)) {
|
---|
| 260 | return 1;
|
---|
| 261 | }
|
---|
| 262 | return 0;
|
---|
| 263 | }
|
---|
| 264 | void setSuffixCheck(cellindex suff) {
|
---|
| 265 | cellindex cell = suff >> 3;
|
---|
| 266 | check remainder = suff & 0x07; // the last 3 bits
|
---|
| 267 | suffixCheck[cell] |= (1 << remainder);
|
---|
[1562] | 268 |
|
---|
[2807] | 269 | }
|
---|
| 270 |
|
---|
[1882] | 271 | // Read the clauses.numbers file into the "symbols" array.
|
---|
[1562] | 272 | //
|
---|
[1882] | 273 | // Each number in the file is a symbol number; it is essential that
|
---|
| 274 | // the first symbol (and no others) be COLLECTIONSTART and the last
|
---|
| 275 | // symbol (and no others) be COLLECTIONEND.
|
---|
[1562] | 276 | //
|
---|
| 277 | // Return the number of numbers in the array.
|
---|
| 278 |
|
---|
[1882] | 279 | int readNumbers() {
|
---|
[1562] | 280 |
|
---|
| 281 | char filename[FILENAME_MAX];
|
---|
| 282 | sprintf(filename, "%s/clauses.numbers", collection);
|
---|
[1618] | 283 | if (verbosity) {
|
---|
[1882] | 284 | cout << "Reading numbers file: " << filename << endl;
|
---|
[1618] | 285 | }
|
---|
[1562] | 286 |
|
---|
| 287 | // Open the numbers file
|
---|
[2498] | 288 | ifstream inFile1(filename, ios::in);
|
---|
| 289 | if (!inFile1) {
|
---|
[1562] | 290 | cerr << "File " << filename << " could not be opened\n";
|
---|
| 291 | exit(1);
|
---|
| 292 | }
|
---|
| 293 |
|
---|
[1882] | 294 | // Count the number of symbols
|
---|
| 295 | inputLength = 0;
|
---|
| 296 | symbol word;
|
---|
[2498] | 297 | while (inFile1 >> word) {
|
---|
[1882] | 298 | inputLength++;
|
---|
| 299 | }
|
---|
[2498] | 300 | inFile1.close();
|
---|
[1882] | 301 |
|
---|
| 302 | // Allocate the symbbols array
|
---|
| 303 | if (verbosity > 1) {
|
---|
| 304 | cout << "Allocating symbol arrays for " << inputLength << " symbols" << endl;
|
---|
| 305 | }
|
---|
[2487] | 306 | symbols = new symbol[inputLength];
|
---|
[1882] | 307 | if (symbols == NULL) {
|
---|
| 308 | cerr << "Suffix error: not enough memory to hold " << inputLength
|
---|
| 309 | << " symbols." << endl;
|
---|
| 310 | exit(2);
|
---|
| 311 | }
|
---|
| 312 |
|
---|
[1562] | 313 | // Read the numbers file into the numbers array
|
---|
[1882] | 314 | if (verbosity > 2) {
|
---|
| 315 | cout << "Reading the numbers" << endl;
|
---|
| 316 | }
|
---|
[2498] | 317 |
|
---|
| 318 | ifstream inFile2(filename, ios::in);
|
---|
| 319 | if (!inFile2) {
|
---|
| 320 | cerr << "File " << filename << " could not be opened\n";
|
---|
| 321 | exit(1);
|
---|
| 322 | }
|
---|
[1882] | 323 | cellcount next = 0;
|
---|
[1562] | 324 | numberOfDocuments = 0;
|
---|
[2498] | 325 | while (inFile2 >> word) {
|
---|
[1882] | 326 | symbols[next++] = word;
|
---|
[1562] | 327 | if (word == DOCUMENTSTART) {
|
---|
| 328 | numberOfDocuments++;
|
---|
| 329 | }
|
---|
| 330 | }
|
---|
[2498] | 331 | inFile2.close();
|
---|
[1562] | 332 |
|
---|
| 333 | // Make sure the numbers file is intact
|
---|
[1882] | 334 | assert(symbols[0] == COLLECTIONSTART);
|
---|
| 335 | assert(symbols[next-1] == COLLECTIONEND);
|
---|
[1562] | 336 |
|
---|
[1882] | 337 | return inputLength;
|
---|
[1562] | 338 | }
|
---|
| 339 |
|
---|
| 340 |
|
---|
| 341 |
|
---|
| 342 | // Get Document Occurrance statistics
|
---|
| 343 | //
|
---|
| 344 | // Given a phrase, what documents does it occur in?
|
---|
| 345 |
|
---|
[2801] | 346 | cellcount getDocumentOccurrances(const Phrase &p, cellcount *frequency) {
|
---|
[1562] | 347 |
|
---|
[2801] | 348 | // cout << "searching for \""<< p << "\" in documents "
|
---|
[1562] | 349 | // << 0 << "-" << numberOfDocuments - 1 << endl;
|
---|
| 350 |
|
---|
| 351 | // The number of documents in which this phrase occurs
|
---|
| 352 | cellcount df = 0;
|
---|
| 353 |
|
---|
| 354 | // Initialise the document frequency array
|
---|
[2806] | 355 | // for (cellindex i = 0; i < numberOfDocuments; i++) {
|
---|
| 356 | // frequency[i] = 0;
|
---|
| 357 | //}
|
---|
| 358 | memset(frequency, 0, sizeof(cellcount)*numberOfDocuments);
|
---|
[1562] | 359 |
|
---|
| 360 | // variables used to facilitate the search
|
---|
| 361 | cellindex begin;
|
---|
| 362 | cellindex end;
|
---|
| 363 | cellindex d;
|
---|
| 364 | symbol *target;
|
---|
| 365 | bool found;
|
---|
| 366 |
|
---|
| 367 | // search for the document in which each occurence of the phrase is found
|
---|
[2487] | 368 | for (cellcount j = p.firstSuffixIndex; j <= p.lastSuffixIndex; j++) {
|
---|
[1562] | 369 |
|
---|
[2487] | 370 | // cout << "looking for phrase at suffixArray[" << j << "]\n";
|
---|
[1562] | 371 |
|
---|
[2487] | 372 | target = suffixArray[j];
|
---|
[1562] | 373 | begin = 0;
|
---|
| 374 | end = numberOfDocuments - 1;
|
---|
| 375 | found = false;
|
---|
| 376 |
|
---|
| 377 | // Search for the occurence of a document delimiter that target
|
---|
| 378 | // occurs immediately after.
|
---|
| 379 | // We do this by performing a binary chop search on documentArray.
|
---|
| 380 | while (!found) {
|
---|
| 381 |
|
---|
| 382 | // cout << "searching for " << (cellindex) target << " in "
|
---|
| 383 | // << begin << " - " << end << endl;
|
---|
| 384 |
|
---|
| 385 | assert (begin <= end);
|
---|
| 386 |
|
---|
| 387 | // If the beginning and end of the interval are the same,
|
---|
[1618] | 388 | // then we've found the correct document
|
---|
[1562] | 389 | if (begin == end) {
|
---|
| 390 | if (frequency[begin] == 0) {
|
---|
| 391 | df++;
|
---|
| 392 | }
|
---|
| 393 | frequency[begin]++;
|
---|
| 394 | found = true;
|
---|
| 395 | }
|
---|
| 396 |
|
---|
| 397 | // Otherwise, examine a new document midway through the begin-end
|
---|
| 398 | // interval and see if it is the one.
|
---|
| 399 | else {
|
---|
| 400 | d = (begin + end) / 2;
|
---|
| 401 | if (target > documentArray[d]) {
|
---|
| 402 | // If target addrss is greater than this, but thisi sthe last document,
|
---|
| 403 | // then this must be the one we want. Or, if target is greater than
|
---|
| 404 | // this one but less then the next, this must be the one we wnat.
|
---|
| 405 | if ((d == numberOfDocuments - 1) || (target < documentArray[d+1])) {
|
---|
| 406 | if (frequency[d] == 0) {
|
---|
| 407 | df++;
|
---|
| 408 | }
|
---|
| 409 | frequency[d]++;
|
---|
| 410 | found = true;
|
---|
| 411 | } else {
|
---|
| 412 | // otherwise we know to search later in the document set
|
---|
| 413 | begin = d + 1;
|
---|
| 414 | }
|
---|
| 415 | } else {
|
---|
| 416 | // search earlier in the document set
|
---|
| 417 | end = d - 1;
|
---|
| 418 | }
|
---|
| 419 | }
|
---|
| 420 | }
|
---|
| 421 | }
|
---|
| 422 | return df;
|
---|
| 423 | }
|
---|
| 424 |
|
---|
| 425 |
|
---|
| 426 |
|
---|
| 427 |
|
---|
| 428 |
|
---|
| 429 |
|
---|
| 430 | // phraseExpansionMemory : Which phrases have we expanded?
|
---|
| 431 | //
|
---|
| 432 | // A set of utilities for keeping track of which phrases we have expanded.
|
---|
| 433 | // We don't want to expand a phrase more than once, after all.
|
---|
| 434 | //
|
---|
| 435 | // This REALLY ought to be in its own class, but it works so that's okay.
|
---|
| 436 | //
|
---|
| 437 | // Phrases are identified by their firstSuffixPosition and length.
|
---|
| 438 | //
|
---|
| 439 | // Functions provided are:
|
---|
| 440 | // void initialisePhraseMemory()
|
---|
| 441 | // void rememberThisPhrase(index, length)
|
---|
| 442 | // bool isPhraseStored(index, length)
|
---|
| 443 | // void deletePhraseMemory()
|
---|
| 444 | //
|
---|
| 445 | // Internally, we will have two separate cases:
|
---|
| 446 | //
|
---|
| 447 | // Phrases of length 1-8:
|
---|
| 448 | // unsigned char phraseMemory[inputLength]
|
---|
| 449 | // is an array where each cell "remembers" the corresponding index in the
|
---|
| 450 | // suffixArray, and each of the 8 bits of the cell correspond to the phrases
|
---|
| 451 | // of length 1, 2... 8.
|
---|
| 452 | // Eventually, we will make this disk-based (i.e. store the array in a file).
|
---|
| 453 | //
|
---|
| 454 | // Phrases of length 9+:
|
---|
| 455 | // file hashTableFile
|
---|
| 456 | // file listOfEntries
|
---|
| 457 | // The first file is a hash table; each phrase maps to one of its cells, which
|
---|
| 458 | // contains either 0 (empty, no occurence) or a number which is an entry number
|
---|
| 459 | // in the second file. This file contains a "list" of entries. Each consists of
|
---|
| 460 | // three numbers: the suffixArray index of the phrase, the length of the phrase,
|
---|
| 461 | // and the entry number of the next phrase with the same hash.
|
---|
| 462 | //
|
---|
| 463 |
|
---|
| 464 |
|
---|
| 465 | unsigned char *phraseMemory;
|
---|
| 466 |
|
---|
| 467 | void initialiseLongPhraseMemory();
|
---|
| 468 | void rememberThisLongPhrase(cellindex index, cellcount length);
|
---|
| 469 | bool isLongPhraseStored(cellindex index, cellcount length);
|
---|
| 470 | void deleteLongPhraseMemory();
|
---|
| 471 |
|
---|
| 472 |
|
---|
| 473 | void initialisePhraseMemory() {
|
---|
| 474 |
|
---|
[2487] | 475 | phraseMemory = new unsigned char[inputLength];
|
---|
[1562] | 476 |
|
---|
| 477 | // to begin with, everything is empty
|
---|
[2806] | 478 | // for (cellcount i = 0; i < inputLength; i++) {
|
---|
| 479 | // phraseMemory[i] = 0;
|
---|
| 480 | //}
|
---|
| 481 | memset(phraseMemory, 0, sizeof(char)*inputLength);
|
---|
| 482 |
|
---|
[1562] | 483 | // intialise the hashTable of long phrases
|
---|
| 484 | initialiseLongPhraseMemory();
|
---|
| 485 |
|
---|
| 486 | }
|
---|
| 487 |
|
---|
| 488 | void rememberThisPhrase(cellindex index, cellcount length) {
|
---|
| 489 |
|
---|
| 490 | // if the phrase is very long, use the file-based system
|
---|
| 491 | if (length > 8) {
|
---|
| 492 | rememberThisLongPhrase(index, length);
|
---|
| 493 | return;
|
---|
| 494 | }
|
---|
| 495 |
|
---|
| 496 | // create a char with just the bit corresponding to length set
|
---|
| 497 | unsigned char newbit = 1;
|
---|
| 498 | for (cellcount i = 1; i < length; i++) {
|
---|
| 499 | newbit <<= 1;
|
---|
| 500 | }
|
---|
| 501 |
|
---|
| 502 | // set that bit in the memory array at position index
|
---|
| 503 | phraseMemory[index] |= newbit;
|
---|
| 504 | }
|
---|
| 505 |
|
---|
| 506 |
|
---|
| 507 | bool isPhraseStored(cellindex index, cellcount length) {
|
---|
| 508 |
|
---|
| 509 | // if the phrase is very long, use the file-based system
|
---|
| 510 | if (length > 8) {
|
---|
| 511 | return isLongPhraseStored(index, length);
|
---|
| 512 | }
|
---|
| 513 |
|
---|
| 514 | // create a char with just the bit corresponding to length set
|
---|
| 515 | unsigned char newbit = 1;
|
---|
| 516 | for (cellcount i = 1; i < length; i++) {
|
---|
| 517 | newbit <<= 1;
|
---|
| 518 | }
|
---|
| 519 |
|
---|
| 520 | // retrurn true if that bit is set in memory arrayat position index
|
---|
| 521 | return (phraseMemory[index] & newbit);
|
---|
| 522 | }
|
---|
| 523 |
|
---|
| 524 | void deletePhraseMemory() {
|
---|
| 525 | delete phraseMemory;
|
---|
| 526 | deleteLongPhraseMemory();
|
---|
| 527 | }
|
---|
| 528 |
|
---|
| 529 |
|
---|
| 530 |
|
---|
| 531 | // Files etc used to store "long" equavlents of the above
|
---|
| 532 |
|
---|
| 533 | fstream hashTableFile;
|
---|
| 534 | char hashTableFileName[FILENAME_MAX];
|
---|
| 535 | fstream listOfEntries;
|
---|
| 536 | char listOfEntriesName[FILENAME_MAX];
|
---|
| 537 | cellindex nextEntryNumber;
|
---|
| 538 |
|
---|
| 539 | const cellcount bigPrime = 7919;
|
---|
| 540 |
|
---|
| 541 |
|
---|
| 542 | void initialiseLongPhraseMemory() {
|
---|
| 543 |
|
---|
| 544 | cellindex example = 0;
|
---|
| 545 |
|
---|
| 546 | sprintf(hashTableFileName, "%s/hashTable", collection);
|
---|
| 547 | sprintf(listOfEntriesName, "%s/hashLists", collection);
|
---|
| 548 |
|
---|
| 549 |
|
---|
| 550 | // create the new hashtable
|
---|
[1618] | 551 | if (verbosity > 1) {
|
---|
| 552 | cout << "Initialising hashTable: " << hashTableFileName << endl;
|
---|
| 553 | }
|
---|
[1562] | 554 | hashTableFile.open(hashTableFileName, ios::in | ios::out);
|
---|
| 555 | for (cellcount i = 0; i < bigPrime; i++) {
|
---|
| 556 | hashTableFile.write((char *) &example, sizeof(example));
|
---|
| 557 | }
|
---|
| 558 |
|
---|
| 559 | // create the list of phrases
|
---|
[1618] | 560 | if (verbosity > 1) {
|
---|
| 561 | cout << "Initialising list of hashtable entries: " << listOfEntriesName << endl;
|
---|
| 562 | }
|
---|
[1562] | 563 | listOfEntries.open(listOfEntriesName, ios::in | ios::out);
|
---|
| 564 | listOfEntries.write((char *) &example, sizeof(example));
|
---|
| 565 | listOfEntries.write((char *) &example, sizeof(example));
|
---|
| 566 | listOfEntries.write((char *) &example, sizeof(example));
|
---|
| 567 | nextEntryNumber = 1;
|
---|
| 568 | }
|
---|
| 569 |
|
---|
| 570 |
|
---|
| 571 | void rememberThisLongPhrase(cellindex index, cellcount length) {
|
---|
| 572 |
|
---|
| 573 | // cout << "rememberThisLongPhrase(" << index << ", " << length << ")\n";
|
---|
| 574 |
|
---|
| 575 | cellindex hashOffset = ((index + length) % bigPrime) * sizeof(cellindex);
|
---|
| 576 | cellindex pointer;
|
---|
| 577 | cellindex zero = 0;
|
---|
| 578 | cellindex readp = 0;
|
---|
| 579 | cellindex readi = 0;
|
---|
| 580 | cellindex readl = 0;
|
---|
| 581 |
|
---|
| 582 | hashTableFile.seekg(hashOffset);
|
---|
| 583 | hashTableFile.read((char *) &pointer, sizeof(cellindex));
|
---|
| 584 |
|
---|
| 585 | if (pointer == 0) {
|
---|
| 586 | // There is no entry at all in the hash table for this entry
|
---|
| 587 | // so create one
|
---|
| 588 |
|
---|
| 589 | pointer = nextEntryNumber++;
|
---|
| 590 | hashTableFile.seekg(hashOffset);
|
---|
| 591 | hashTableFile.write((char *) &pointer, sizeof(cellindex));
|
---|
| 592 |
|
---|
| 593 | listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
|
---|
| 594 | listOfEntries.write((char *) &zero, sizeof(cellindex));
|
---|
| 595 | listOfEntries.write((char *) &index, sizeof(cellindex));
|
---|
| 596 | listOfEntries.write((char *) &length, sizeof(cellindex));
|
---|
| 597 |
|
---|
| 598 | } else {
|
---|
| 599 | // There is a list starting at this hash value, so the phrase may
|
---|
| 600 | // be already remembered, or it might need to be appended
|
---|
| 601 |
|
---|
| 602 | while (pointer != 0) {
|
---|
| 603 | // Read the entry pointed to by pointer
|
---|
| 604 | listOfEntries.seekg(pointer * sizeof(cellindex) * 3);
|
---|
| 605 | listOfEntries.read((char *) &readp, sizeof(cellindex));
|
---|
| 606 | listOfEntries.read((char *) &readi, sizeof(cellindex));
|
---|
| 607 | listOfEntries.read((char *) &readl, sizeof(cellindex));
|
---|
| 608 |
|
---|
| 609 | // cout << "read " << pointer << ", " << readp << ", " << readi << ", " << readl << endl;
|
---|
| 610 |
|
---|
| 611 | if ((readi == index) && (readl = length)) {
|
---|
| 612 | // we've found that we've already stored it
|
---|
| 613 | return;
|
---|
| 614 | } else if (readp == 0) {
|
---|
| 615 | // we're reached the end of the list. Add a new entry.
|
---|
| 616 | listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
|
---|
| 617 | listOfEntries.write((char *) &nextEntryNumber, sizeof(cellindex));
|
---|
| 618 | pointer = nextEntryNumber++;
|
---|
| 619 |
|
---|
| 620 | listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
|
---|
| 621 | listOfEntries.write((char *) &zero, sizeof(cellindex));
|
---|
| 622 | listOfEntries.write((char *) &index, sizeof(cellindex));
|
---|
| 623 | listOfEntries.write((char *) &length, sizeof(cellindex));
|
---|
| 624 | return;
|
---|
| 625 | } else {
|
---|
| 626 | // go on to the next node
|
---|
| 627 | pointer = readp;
|
---|
| 628 | }
|
---|
| 629 | }
|
---|
| 630 | }
|
---|
| 631 |
|
---|
| 632 |
|
---|
| 633 | }
|
---|
| 634 |
|
---|
[2801] | 635 |
|
---|
[1562] | 636 | bool isLongPhraseStored(cellindex index, cellcount length) {
|
---|
| 637 |
|
---|
| 638 | // cout << "isLongPhraseExpanded(" << index << ", " << length << ")\n";
|
---|
| 639 |
|
---|
| 640 | cellindex hashOffset = ((index + length) % bigPrime) * sizeof(cellindex);
|
---|
| 641 | cellindex pointer;
|
---|
| 642 | cellindex readp = 0;
|
---|
| 643 | cellindex readi = 0;
|
---|
| 644 | cellindex readl = 0;
|
---|
| 645 |
|
---|
| 646 | // Find the phrase in the hashFile
|
---|
| 647 | hashTableFile.seekg(hashOffset);
|
---|
| 648 | hashTableFile.read((char *) &pointer, sizeof(cellindex));
|
---|
| 649 |
|
---|
| 650 | if (pointer == 0) {
|
---|
| 651 | // There is no entry at all in the hash table for this entry
|
---|
| 652 | // so nothing is stored
|
---|
| 653 | return false;
|
---|
| 654 |
|
---|
| 655 | } else {
|
---|
| 656 | // There is a list starting at this hash value, so the phrase may
|
---|
| 657 | // be already remembered in that list
|
---|
| 658 | while (pointer != 0) {
|
---|
| 659 | // Read the entry pointed to by pointer
|
---|
| 660 | listOfEntries.seekg(pointer * sizeof(cellindex) * 3);
|
---|
| 661 | listOfEntries.read((char *) &readp, sizeof(cellindex));
|
---|
| 662 | listOfEntries.read((char *) &readi, sizeof(cellindex));
|
---|
| 663 | listOfEntries.read((char *) &readl, sizeof(cellindex));
|
---|
| 664 |
|
---|
| 665 | if ((readi == index) && (readl = length)) {
|
---|
| 666 | // we've found the phrase stored here
|
---|
| 667 | return true;
|
---|
| 668 | } else {
|
---|
| 669 | // go on to the next node
|
---|
| 670 | pointer = readp;
|
---|
| 671 | }
|
---|
| 672 | }
|
---|
| 673 | }
|
---|
| 674 | return false;
|
---|
| 675 | }
|
---|
| 676 |
|
---|
[2801] | 677 |
|
---|
[1562] | 678 | void deleteLongPhraseMemory() {
|
---|
| 679 | // remove the hash & other files
|
---|
| 680 |
|
---|
| 681 | hashTableFile.close();
|
---|
| 682 | listOfEntries.close();
|
---|
| 683 | remove(hashTableFileName);
|
---|
| 684 | remove(listOfEntriesName);
|
---|
| 685 |
|
---|
| 686 | }
|
---|
| 687 |
|
---|
| 688 |
|
---|
| 689 | // Read the collection statistics file
|
---|
[2801] | 690 | //
|
---|
[1562] | 691 | void readStatistics() {
|
---|
| 692 |
|
---|
| 693 | // open the statistics file
|
---|
| 694 | char filename[FILENAME_MAX];
|
---|
| 695 | sprintf(filename, "%s/clauses.stats", collection);
|
---|
| 696 |
|
---|
| 697 | // Open the file
|
---|
| 698 | ifstream inFile(filename, ios::in);
|
---|
| 699 | if (!inFile) {
|
---|
| 700 | cerr << "File " << filename << " could not be opened\n";
|
---|
| 701 | exit(1);
|
---|
| 702 | }
|
---|
| 703 |
|
---|
| 704 | // Read the numbers file into the numbers array
|
---|
| 705 | char key[1000];
|
---|
| 706 | symbol value;
|
---|
| 707 | while (inFile >> key >> value){
|
---|
| 708 | if (strcmp(key, "first_stopword") == 0) {
|
---|
| 709 | firstStopSymbol = value;
|
---|
| 710 | } else if (strcmp(key, "last_stopword") == 0) {
|
---|
| 711 | lastStopSymbol = value;
|
---|
| 712 | } else if (strcmp(key, "first_contentword") == 0) {
|
---|
| 713 | firstContentSymbol = value;
|
---|
| 714 | } else if (strcmp(key, "last_contentword") == 0) {
|
---|
| 715 | lastContentSymbol = value;
|
---|
| 716 | }
|
---|
| 717 | }
|
---|
| 718 | inFile.close();
|
---|
| 719 |
|
---|
| 720 | // Make sure we have the information we need
|
---|
| 721 | if (!(firstStopSymbol && lastStopSymbol && firstContentSymbol && lastContentSymbol)) {
|
---|
| 722 | cerr << "Statistics file incomplete" << endl;
|
---|
| 723 | exit(1);
|
---|
| 724 | }
|
---|
| 725 | }
|
---|
| 726 |
|
---|
[2801] | 727 | cellcount getContentCount(symbol firstContent) {
|
---|
[1562] | 728 |
|
---|
[2801] | 729 | cellcount content=0;
|
---|
| 730 | for (cellcount i=0; i<inputLength; i++) {
|
---|
| 731 | if (symbols[i]>=firstContent) content++;
|
---|
| 732 | }
|
---|
[1562] | 733 |
|
---|
[2801] | 734 | return content;
|
---|
| 735 | }
|
---|
[1562] | 736 |
|
---|
[2801] | 737 | int main (int argc, char * argv[]) {
|
---|
[1562] | 738 |
|
---|
[2801] | 739 | // Command-line arguments
|
---|
| 740 | // argv[1] is the phindex directory
|
---|
| 741 | // argv[2] is the maximum array symbol length (optional)
|
---|
| 742 | // argv[3] is the mode, where 1 is stopword mode (optional)
|
---|
| 743 | if (argc < 2) {
|
---|
| 744 | cerr << "Usage: " << argv[0] << " phind-directory mode [verbosity]" << endl;
|
---|
| 745 | exit(1);
|
---|
| 746 | }
|
---|
| 747 |
|
---|
| 748 | // collection directory
|
---|
| 749 | strcpy(collection, argv[1]);
|
---|
| 750 |
|
---|
| 751 | // mode parameter
|
---|
| 752 | phraseMode = atoi(argv[2]);
|
---|
| 753 | assert((phraseMode == STOPWORDS) || (phraseMode == ANYPHRASE));
|
---|
| 754 |
|
---|
| 755 | // optional verbosity parameter
|
---|
| 756 | if (argc == 4) {
|
---|
| 757 | verbosity = atoi(argv[3]);
|
---|
| 758 | assert (verbosity >= 0);
|
---|
| 759 | }
|
---|
| 760 |
|
---|
| 761 | if (verbosity) {
|
---|
| 762 | cout << "suffix: the phrase extraction program" << endl;
|
---|
| 763 | }
|
---|
| 764 |
|
---|
| 765 | if (verbosity > 1) {
|
---|
| 766 | if (phraseMode == STOPWORDS) {
|
---|
| 767 | cout << "Stopwords mode: no phrase may begin or end with a stopword" << endl;
|
---|
| 768 | } else {
|
---|
| 769 | cout << "AllPhrase mode: extract every phrase that occurs more than once" << endl;
|
---|
| 770 | }
|
---|
| 771 | }
|
---|
| 772 |
|
---|
| 773 | // Read the statistics file
|
---|
| 774 | readStatistics();
|
---|
| 775 |
|
---|
| 776 | // Read the numbers file
|
---|
| 777 | readNumbers();
|
---|
| 778 |
|
---|
| 779 | if (numberOfDocuments == 0) {
|
---|
| 780 | cerr << "There are no documents in this collection!" << endl;
|
---|
| 781 | exit(1);
|
---|
| 782 | }
|
---|
| 783 |
|
---|
| 784 | symbol firstContent;
|
---|
| 785 | if (phraseMode==STOPWORDS) firstContent=firstContentSymbol;
|
---|
| 786 | else firstContent = firstStopSymbol;
|
---|
| 787 |
|
---|
| 788 | cellcount contentLength = 0;
|
---|
| 789 | contentLength = getContentCount(firstContent);
|
---|
| 790 |
|
---|
| 791 | // Create the suffix & prefix arrays
|
---|
| 792 | suffixArray = new symbol *[contentLength];
|
---|
| 793 | prefixArray = new symbol *[contentLength];
|
---|
| 794 |
|
---|
| 795 | cellcount here=0;
|
---|
| 796 | // Initialise prefix and suffix arrays, only use the needed suffixes
|
---|
| 797 | for (cellcount j = 0; j < inputLength; j++) {
|
---|
| 798 | if (symbols[j]>=firstContent) {
|
---|
| 799 | suffixArray[here] = &symbols[j];
|
---|
| 800 | prefixArray[here] = &symbols[j];
|
---|
| 801 | here++;
|
---|
| 802 | }
|
---|
| 803 | }
|
---|
| 804 | qsort(suffixArray, contentLength, sizeof(symbol *), suffixCompare);
|
---|
| 805 | qsort(prefixArray, contentLength, sizeof(symbol *), prefixCompare);
|
---|
| 806 |
|
---|
[2807] | 807 | checkLength = contentLength/8 + 1;
|
---|
| 808 | suffixCheck = new check[checkLength];
|
---|
[2801] | 809 | if (suffixCheck == NULL) {
|
---|
| 810 | cerr << "Suffix error: not enough memory to hold " << inputLength << " symbols." << endl;
|
---|
| 811 | exit(2);
|
---|
| 812 | }
|
---|
[2807] | 813 | memset(suffixCheck, 0, sizeof(check)*checkLength);
|
---|
[2801] | 814 |
|
---|
| 815 | cout <<"\ngenerating the phrase hierarchy\n\n";
|
---|
| 816 |
|
---|
| 817 | // Create the document arrays
|
---|
| 818 | if (verbosity > 1) {
|
---|
| 819 | cout << "Allocating document arrays for " << numberOfDocuments << " documents" << endl;
|
---|
| 820 | }
|
---|
| 821 |
|
---|
| 822 | // The document frequecy array is used to count the number of times
|
---|
| 823 | // each phrase occurs in each document. The number of documents in
|
---|
| 824 | // which a phrase occurs is stored in df.
|
---|
| 825 | frequency *documentFrequency = new frequency[numberOfDocuments];
|
---|
| 826 | frequency df;
|
---|
| 827 |
|
---|
| 828 | // documentArray will be searched in order to discover which document
|
---|
| 829 | // each phrase occurs in.
|
---|
| 830 | documentArray = new symbol *[numberOfDocuments];
|
---|
| 831 |
|
---|
| 832 | // just scan through the input text to find the doc starts
|
---|
| 833 | cellindex d = 0;
|
---|
| 834 | for (cellindex i=0; i<inputLength; i++) {
|
---|
| 835 | if (symbols[i] == DOCUMENTSTART) {
|
---|
| 836 | documentArray[d] = &symbols[i];
|
---|
| 837 | d++;
|
---|
| 838 | }
|
---|
| 839 | }
|
---|
| 840 |
|
---|
| 841 | // the phrases stuff is expecting inputLength to be the length of the
|
---|
| 842 | // suffix array, so change it.
|
---|
| 843 | inputLength = contentLength;
|
---|
| 844 |
|
---|
| 845 | // Extract phrases
|
---|
| 846 | //
|
---|
| 847 | // We will make several passesover the data, in each case considering
|
---|
| 848 | // a set of input phrases and generating a set of output phrases, which
|
---|
| 849 | // we will expancd in later passes.
|
---|
| 850 | //
|
---|
| 851 | // The input phrases in the first pass will be the vocabulary.
|
---|
| 852 | // In later passes, the input phrases will be the output phrases of the
|
---|
| 853 | // previous pass.
|
---|
| 854 | //
|
---|
| 855 | // In each pass we will consider each input phrase in turn. If we
|
---|
| 856 | // have seen it before, we will ignore it. Otherwise, we will expand
|
---|
| 857 | // it and add its expansions to the set of output phrases.
|
---|
| 858 |
|
---|
| 859 | // Store the phrase data in the phrases file
|
---|
| 860 | char phraseDataName[FILENAME_MAX];
|
---|
| 861 | sprintf(phraseDataName, "%s/phrases", collection);
|
---|
| 862 | ofstream phraseData(phraseDataName, ios::out);
|
---|
| 863 | if (!phraseData) {
|
---|
| 864 | cout << "File " << phraseDataName << " could not be opened\n";
|
---|
| 865 | exit(1);
|
---|
| 866 | }
|
---|
| 867 |
|
---|
| 868 | // Count the number of phrases output
|
---|
| 869 | unsigned long int phraseCounter = 0;
|
---|
| 870 |
|
---|
| 871 | // Set up the phrase expansion memory.
|
---|
| 872 | // We need this so that we don't expand a phrase more than once
|
---|
| 873 | initialisePhraseMemory();
|
---|
| 874 |
|
---|
| 875 | // The current pass numebr
|
---|
| 876 | int phrasePass = 1;
|
---|
| 877 |
|
---|
| 878 |
|
---|
| 879 | // PASS NUMBER 1
|
---|
| 880 | if (verbosity > 1) {
|
---|
| 881 | cout << "Starting pass " << phrasePass << endl;
|
---|
| 882 | }
|
---|
| 883 |
|
---|
| 884 | ofstream outPhrase;
|
---|
| 885 | char outPhraseName[FILENAME_MAX];
|
---|
| 886 | unsigned long int outPhraseCounter = 0;
|
---|
| 887 |
|
---|
| 888 | // On the first pass, simply work through the vocabulary
|
---|
| 889 | sprintf(outPhraseName, "%s/outPhrase.1", collection);
|
---|
| 890 | outPhrase.open(outPhraseName, ios::out);
|
---|
| 891 | if (!outPhrase) {
|
---|
| 892 | cerr << "File " << outPhraseName << " could not be opened\n";
|
---|
| 893 | exit(1);
|
---|
| 894 | }
|
---|
| 895 |
|
---|
| 896 | // Iterate over the different symbols by working through the suffix array
|
---|
| 897 | vector<Phrase> result;
|
---|
| 898 | cellindex ij = 0;
|
---|
| 899 | char *tmpString;
|
---|
| 900 |
|
---|
| 901 | Phrase p;
|
---|
| 902 | while (ij < inputLength) {
|
---|
| 903 |
|
---|
| 904 | // make a new phrase of length 1
|
---|
| 905 | p = Phrase(suffixArray[ij], 1, SUFFIX);
|
---|
| 906 | p.findFirstAndLastSuffix(ij, inputLength-1);
|
---|
| 907 |
|
---|
| 908 | // We ignore this symbol if it occurs only once, if it is a delimiter,
|
---|
| 909 | // of if we are in stopwords mode and it is a stopword
|
---|
| 910 | // - in this new version, only need to check freq
|
---|
| 911 | // We could imagine a new mode/command-line option, which is like
|
---|
| 912 | // STOPWORDS but without this restrictrion. This would let you browse
|
---|
| 913 | // from "the" to "the AGRIS" for example, but not from "AGRIS" to
|
---|
| 914 | // "the AGRIS" (where the is a stopword and AGRIS a content word).
|
---|
| 915 | // The system used to work like this; it is easy to implement, but
|
---|
| 916 | // it explodes the size of the indexes. So: would it be useful?
|
---|
| 917 | if (p.suffixFrequency > 1) {
|
---|
| 918 | // Get minimal expansions of the phrase
|
---|
| 919 | getExpansions(p, result);
|
---|
| 920 |
|
---|
| 921 | if (!result.empty()) {
|
---|
| 922 |
|
---|
| 923 | // Remember that we have expanded this phrase
|
---|
| 924 | rememberThisPhrase(ij, 1);
|
---|
| 925 |
|
---|
| 926 | // write the phrase text
|
---|
| 927 | phraseData << ij << "-1:" << p << ":" << p.suffixFrequency << ":"
|
---|
| 928 | << result.size() << ":";
|
---|
| 929 |
|
---|
| 930 | // write the results
|
---|
| 931 | for (cellcount k = 0; k < result.size(); k++) {
|
---|
| 932 | if (k) {
|
---|
| 933 | phraseData << ",";
|
---|
| 934 | }
|
---|
| 935 | phraseData << result[k].firstSuffixIndex << "-" << result[k].length;
|
---|
| 936 | outPhrase << result[k].firstSuffixIndex << " " << result[k].length << endl;
|
---|
| 937 | outPhraseCounter++;
|
---|
| 938 | }
|
---|
| 939 | result.clear();
|
---|
| 940 |
|
---|
| 941 | // Write the documents in which this phrase occurs
|
---|
| 942 | df = getDocumentOccurrances(p, documentFrequency);
|
---|
| 943 | phraseData << ":" << df << ":";
|
---|
| 944 |
|
---|
| 945 | // write the documents
|
---|
| 946 | for (cellcount m = 0, first = 1; m < numberOfDocuments; m++) {
|
---|
| 947 | if (documentFrequency[m]) {
|
---|
| 948 | if (first) {
|
---|
| 949 | first = 0;
|
---|
| 950 | } else {
|
---|
| 951 | phraseData << ";";
|
---|
| 952 | }
|
---|
| 953 | // Output the document number. Note that here we've numbered the
|
---|
| 954 | // N documents from 0 to N-1, but later they'll be 1-N. Thus we
|
---|
| 955 | // add 1 to the document id when we output it.
|
---|
| 956 | phraseData << "d" << (m+1);
|
---|
| 957 | // Next, output the frequency with which the document occurs, but
|
---|
| 958 | // only if it is > 1.
|
---|
| 959 | if (documentFrequency[m] > 1) {
|
---|
| 960 | phraseData << "," << documentFrequency[m];
|
---|
| 961 | }
|
---|
| 962 | }
|
---|
| 963 | }
|
---|
| 964 |
|
---|
| 965 | phraseData << endl;
|
---|
| 966 | phraseCounter++;
|
---|
| 967 |
|
---|
| 968 | // feedback
|
---|
| 969 | if (verbosity) {
|
---|
| 970 | if (phraseCounter % 1000 == 0) {
|
---|
| 971 | cout << "phrase " << phraseCounter << ": "
|
---|
| 972 | << "cell " << p.firstSuffixIndex << " - " << p << endl;
|
---|
| 973 | }
|
---|
| 974 | }
|
---|
| 975 | }
|
---|
| 976 | }
|
---|
| 977 | ij = p.lastSuffixIndex + 1;
|
---|
| 978 | }
|
---|
| 979 | outPhrase.close();
|
---|
| 980 |
|
---|
| 981 | // REMAINING PASSES
|
---|
| 982 | // The previous outPhrase file forms the input to each new pass
|
---|
| 983 | cellcount start, length;
|
---|
| 984 | while (outPhraseCounter > 0) {
|
---|
| 985 |
|
---|
| 986 | // Start a new pass
|
---|
| 987 | phrasePass++;
|
---|
| 988 | if (verbosity) {
|
---|
| 989 | cout << "Starting pass " << phrasePass << endl;
|
---|
| 990 | }
|
---|
| 991 |
|
---|
| 992 | // Open the input file
|
---|
| 993 | char inPhraseName[FILENAME_MAX];
|
---|
| 994 | sprintf(inPhraseName, "%s/outPhrase.%d", collection, phrasePass - 1);
|
---|
| 995 | ifstream inPhrase (inPhraseName, ios::in);
|
---|
| 996 | if (!inPhrase) {
|
---|
| 997 | cerr << "File " << inPhraseName << " could not be opened\n";
|
---|
| 998 | exit(1);
|
---|
| 999 | }
|
---|
| 1000 |
|
---|
| 1001 | // Open the output file
|
---|
| 1002 | sprintf(outPhraseName, "%s/outPhrase.%d", collection, phrasePass);
|
---|
| 1003 | outPhrase.open(outPhraseName, ios::out);
|
---|
| 1004 | if (!outPhrase) {
|
---|
| 1005 | cerr << "File " << outPhraseName << " could not be opened\n";
|
---|
| 1006 | exit(1);
|
---|
| 1007 | }
|
---|
| 1008 | outPhraseCounter = 0;
|
---|
| 1009 |
|
---|
| 1010 | // Process each phrase
|
---|
| 1011 | while(inPhrase >> start >> length) {
|
---|
| 1012 |
|
---|
| 1013 | // Ignore the phrase if we have expanded it before
|
---|
| 1014 | if (isPhraseStored(start, length))
|
---|
| 1015 | continue;
|
---|
| 1016 |
|
---|
| 1017 | // Remember that we have examined this phrase
|
---|
| 1018 | rememberThisPhrase(start, length);
|
---|
| 1019 |
|
---|
| 1020 | // Find the phrase in the suffixarray
|
---|
| 1021 | p = Phrase(suffixArray[start], length, SUFFIX);
|
---|
| 1022 | p.findFirstAndLastSuffix(start, inputLength-1);
|
---|
| 1023 |
|
---|
| 1024 | // Ignore the phrase if it only occurs once
|
---|
| 1025 | if (p.suffixFrequency < 2)
|
---|
| 1026 | continue;
|
---|
| 1027 |
|
---|
| 1028 | // Write the phrase text;
|
---|
| 1029 | phraseData << start << "-" << length << ":" << p << ":" << p.suffixFrequency << ":";
|
---|
| 1030 |
|
---|
| 1031 | // Expand the phrase, if it is fewer than 8 words long
|
---|
| 1032 | if (length <= 8) {
|
---|
| 1033 |
|
---|
| 1034 | // Get the minimal expansions for this phrase
|
---|
| 1035 | getExpansions(p, result);
|
---|
| 1036 |
|
---|
| 1037 | // write the results
|
---|
| 1038 | phraseData << result.size() << ":";
|
---|
| 1039 |
|
---|
| 1040 | for (cellcount i = 0; i < result.size(); i++) {
|
---|
| 1041 | if (i) {
|
---|
| 1042 | phraseData << ",";
|
---|
| 1043 | }
|
---|
| 1044 | phraseData << result[i].firstSuffixIndex << "-" << result[i].length;
|
---|
| 1045 | outPhrase << result[i].firstSuffixIndex << " " << result[i].length << endl;
|
---|
| 1046 | outPhraseCounter++;
|
---|
| 1047 | }
|
---|
| 1048 | result.clear();
|
---|
| 1049 |
|
---|
| 1050 | } else {
|
---|
| 1051 | // phrase is too long to expand further
|
---|
| 1052 | phraseData << "0:";
|
---|
| 1053 | }
|
---|
| 1054 |
|
---|
| 1055 | // Write the documents in which this phrase occurs
|
---|
| 1056 | df = getDocumentOccurrances(p, documentFrequency);
|
---|
| 1057 | phraseData << ":" << df << ":";
|
---|
| 1058 |
|
---|
| 1059 | // write the documents
|
---|
| 1060 | for (cellcount i = 0, first = 1; i < numberOfDocuments; i++) {
|
---|
| 1061 | if (documentFrequency[i]) {
|
---|
| 1062 | if (first) {
|
---|
| 1063 | first = 0;
|
---|
| 1064 | } else {
|
---|
| 1065 | phraseData << ";";
|
---|
| 1066 | }
|
---|
| 1067 | // Output the document number. Note that here we've numbered the
|
---|
| 1068 | // N documents from 0 to N-1, but later they'll be 1-N. Thus we
|
---|
| 1069 | // add 1 to the document id when we output it.
|
---|
| 1070 | phraseData << "d" << (i+1);
|
---|
| 1071 | // Next, output the frequency with which the document occurs, but
|
---|
| 1072 | // only if it is > 1.
|
---|
| 1073 | if (documentFrequency[i] > 1) {
|
---|
| 1074 | phraseData << "," << documentFrequency[i];
|
---|
| 1075 | }
|
---|
| 1076 | }
|
---|
| 1077 | }
|
---|
| 1078 |
|
---|
| 1079 | phraseData << endl;
|
---|
| 1080 | phraseCounter++;
|
---|
| 1081 |
|
---|
| 1082 | // feedback
|
---|
| 1083 | if (verbosity) {
|
---|
| 1084 | if (phraseCounter % 1000 == 0) {
|
---|
| 1085 | cout << "phrase " << phraseCounter << ": "<< "start " << start
|
---|
| 1086 | << ", length " << length << " - " << p << endl;
|
---|
| 1087 | }
|
---|
| 1088 | }
|
---|
| 1089 |
|
---|
| 1090 | }
|
---|
| 1091 |
|
---|
| 1092 | inPhrase.close();
|
---|
| 1093 | outPhrase.close();
|
---|
| 1094 | }
|
---|
| 1095 |
|
---|
| 1096 | phraseData.close();
|
---|
| 1097 | deletePhraseMemory();
|
---|
| 1098 |
|
---|
| 1099 | delete [] documentFrequency;
|
---|
| 1100 | delete [] symbols;
|
---|
| 1101 | delete [] suffixArray;
|
---|
| 1102 | delete [] prefixArray;
|
---|
| 1103 | delete [] suffixCheck;
|
---|
| 1104 | delete [] documentArray;
|
---|
| 1105 |
|
---|
| 1106 |
|
---|
| 1107 |
|
---|
| 1108 | cout << endl << "Done: " << phraseCounter << " phrases in " << phraseDataName << endl;
|
---|
| 1109 | return 0;
|
---|
| 1110 | }
|
---|
| 1111 |
|
---|
| 1112 |
|
---|
| 1113 |
|
---|
| 1114 |
|
---|
| 1115 |
|
---|