source: trunk/gsdl/src/phind/generate/suffix.cpp@ 2867

Last change on this file since 2867 was 2867, checked in by paynter, 22 years ago

Moved all the suffixCheck functionality into the check.h header and
inlined it.

  • Property svn:keywords set to Author Date Id Revision
File size: 30.0 KB
1/**********************************************************************
2 *
3 * Suffix.cpp -- Extract the repeated phrases in the input with suffix
4 * and prefix arrays (cgn & gwp's simpler algorithm,
5 * and kjm's improvements).
6 *
7 * Copyright 2000 Gordon W. Paynter
8 * Copyright 2000 The New Zealand Digital Library Project
9 *
10 * A component of the Greenstone digital library software from the
11 * New Zealand Digital Library Project at the University of Waikato,
12 * New Zealand.
13 *
14 * This program is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU General Public License as
16 * published by the Free Software Foundation; either version 2 of
17 * the License, or (at your option) any later version.
18 *
19 * This program is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 * GNU General Public License for more details.
23 *
24 * You should have received a copy of the GNU General Public License
25 * along with this program; if not, write to the Free Software
26 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27 *
28 *********************************************************************/
29
30#include <assert.h>
31#include <math.h>
32#include <stdio.h>
33#include <stdlib.h>
34#include <string.h>
35
36#if defined(GSDL_USE_IOS_H)
37# include <fstream.h>
38# include <iostream.h>
39#else
40# include <fstream>
41# include <iostream>
42#endif
43
44#if defined(GSDL_USE_STL_H)
45# if defined(GSDL_USE_ALGO_H)
46# include <algo.h>
47# else
48# include <algorithm.h>
49# endif
50# include <vector.h>
51#else
52# include <algorithm>
53# include <vector>
54#endif
55
56#include "suffix.h"
57#include "phrase.h"
58#include "check.h"
59
60// Global variables declared in suffix.h
61cellcount inputLength;
62
63symbol *symbols;
64symbol **suffixArray;
65symbol **prefixArray;
66
67// How many documents are in this collection?
68cellcount numberOfDocuments;
69symbol **documentArray;
70
71
72// Do we accept any phrase, or do we eliminate those ending with stopwords ?
73int phraseMode = ANYPHRASE; //STOPWORDS;
74
75
76// The filestem of the collection's phindex directory
77char collection[FILENAME_MAX];
78
79
80// The ranges of the stopword and content-word symbols for the collection
81symbol firstStopSymbol = 0;
82symbol lastStopSymbol = 0;
83symbol firstContentSymbol = 0;
84symbol lastContentSymbol = 0;
85
86
87// Some useful comparison functions, defined below.
88int suffixCompare(const void *, const void *);
89int prefixCompare(const void *, const void *);
90int pointerCompare(const void *, const void *);
91
92
93// Functions for implementing "phrase memory". These let us "remember"
94// each phrase that we've expanded without using too much memory.
95void initialisePhraseMemory();
96void rememberThisPhrase(cellindex index, cellcount length);
97bool isPhraseStored(cellindex index, cellcount length);
98void deletePhraseMemory();
99
100
101// how much output do we want?
102int verbosity = 1;
103
104
105// Get a phrase's expansions
106//
107// Get the set of "minimal", "maximal", non-unique expansions of a
108// phrase p, using the simpler algorithm that Craig and Gordon came up
109// with at Google.
110//
111// The expansions are returned in the results vector.
112
113void getExpansions(Phrase &p, vector<Phrase> &results) {
114
115 // 1. Get the initial candidates
116 vector<Phrase> candidates;
117 p.initialSuffixCandidates(candidates);
118 int suffcands = candidates.size();
119 p.initialPrefixCandidates(candidates);
120
121 if (candidates.size() == 0)
122 return;
123 // 2. Expand each candidate phrase
124 vector<Phrase>::iterator i;
125 for (i = candidates.begin(); (i != candidates.end()) && (suffcands > 0); ++i, --suffcands) {
126 i->expandWhileUniquePrefixExtension();
127 i->ensureSuffixFound();
128 }
129 for ( ; i != candidates.end(); ++i) {
130 i->expandWhileUniqueSuffixExtension();
131 }
132
133 // 3. Ensure minimality: ignore phrases whose subphrases are also found
134 results.clear();
135
136 // Initialise the candidates, check array, and various variables.
137 sort(candidates.begin(), candidates.end(), isShorter);
138 unsigned minimum_length = candidates.begin()->length;
139 clearSuffixCheck();
140
141 // Try to add each candidate to the results set, ignoring the non-minimal
142 for (vector<Phrase>::iterator candidate = candidates.begin();
143 candidate != candidates.end(); candidate++) {
144
145 // Make a copy of candidate to mutilate while performing sub-phrase checks
146 Phrase temp_phrase(*candidate);
147 bool shorter_found = false;
148
149 // Check for shorter and shorter versions of the temporary phrase
150 while (temp_phrase.length >= minimum_length && !shorter_found) {
151 temp_phrase.ensureSuffixFound();
152 if (getSuffixCheck(temp_phrase.firstSuffixIndex)==0)
153 temp_phrase.shortenByOneAtPrefix();
154 else
155 shorter_found = true;
156
157 // Possible efficiency here: we can finish if the prefix of c
158 // and temp_phrase are the same for candidate->length symbols.
159 }
160
161 // If no shorter phrase is found, use this one
162 if (!shorter_found) {
163 results.push_back(*candidate);
164 candidate->ensureSuffixFound();
165 setSuffixCheck(candidate->firstSuffixIndex, candidate->lastSuffixIndex);
166 }
167 }
168}
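// Illustrative sketch, excluded from the build: the minimality loop above keeps
// a candidate only if none of its tails -- obtained by repeatedly dropping the
// first symbol, as shortenByOneAtPrefix() does -- has already been accepted.
// The same rule over plain strings, with a std::set standing in for the
// suffixCheck array; keepMinimal and shorterString are hypothetical names.
#if 0
#include <algorithm>
#include <set>
#include <string>
#include <vector>

static bool shorterString(const std::string &a, const std::string &b) {
  return a.size() < b.size();
}

static std::vector<std::string> keepMinimal(std::vector<std::string> candidates) {
  std::vector<std::string> results;
  std::set<std::string> accepted;

  // Process shorter candidates first so they can block longer ones.
  std::sort(candidates.begin(), candidates.end(), shorterString);

  for (size_t c = 0; c < candidates.size(); ++c) {
    bool shorter_found = false;
    // Check every tail of the candidate, shortening at the front each time.
    for (std::string tail = candidates[c]; !tail.empty() && !shorter_found;
         tail.erase(0, 1)) {
      if (accepted.count(tail)) shorter_found = true;
    }
    if (!shorter_found) {
      results.push_back(candidates[c]);
      accepted.insert(candidates[c]);
    }
  }
  return results;
}
#endif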
169
170
171// suffixCompare
172//
173// Compare two pointers into a suffix array. We use this in the
174// qsort function, so the input are pointers to pointers.
175//
176// Return -1 if (a < b), otherwise (a > b) so return +1,
177
178int suffixCompare(const void *cpa, const void *cpb) {
179
180 // Cast then dereference pointers to suffix array elements
181 symbol **ppa = (symbol **) cpa;
182 symbol **ppb = (symbol **) cpb;
183 symbol *pa = *ppa;
184 symbol *pb = *ppb;
185
186 // If the two elements are the same, examine the next one
187 while (*pa == *pb) {
188 pa++;
189 pb++;
190 }
191
192 // Make the comparison and return
193 if ( *pa < *pb) {
194 return -1;
195 } else {
196 return +1;
197 }
198}
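// Illustrative sketch, excluded from the build: qsort() hands the comparator
// the addresses of two array elements, and because the elements of suffixArray
// are themselves pointers (symbol *), the comparator receives pointers to
// pointers. A self-contained example of the same calling convention over int
// pointers; intPtrCompare and sortPointersByValue are hypothetical names.
#if 0
#include <stdlib.h>

static int intPtrCompare(const void *cpa, const void *cpb) {
  const int *pa = *(const int **) cpa;   // each array element is an int *
  const int *pb = *(const int **) cpb;
  if (*pa < *pb) return -1;
  if (*pa > *pb) return 1;
  return 0;
}

static void sortPointersByValue() {
  int data[4] = {3, 1, 4, 1};
  int *ptrs[4] = {&data[0], &data[1], &data[2], &data[3]};
  // After this call ptrs[] is ordered by the values the pointers refer to.
  qsort(ptrs, 4, sizeof(int *), intPtrCompare);
}
#endif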
199
200
201// prefixCompare
202//
203// Compare two pointers into a prefix array. We use this in the
204// qsort function, so the input are pointers to pointers.
205//
206// Return -1 if (a > b), otherwise (a < b) so return +1.
207
208int prefixCompare(const void *cpa, const void *cpb) {
209
210 // Cast then dereference pointers to prefix array elements
211 symbol **ppa = (symbol **) cpa;
212 symbol **ppb = (symbol **) cpb;
213 symbol *pa = *ppa;
214 symbol *pb = *ppb;
215
216 // If the two elements are the same, examine the next one
217 while (*pa == *pb) {
218 pa--;
219 pb--;
220 }
221
222 // Make the comparison and return
223 if ( *pa > *pb) {
224 return -1;
225 } else {
226 return +1;
227 }
228}
229
230// simpleCompare
231//
232// Compare two pointers based on the memory location they point to.
233
234int pointerCompare( const void *pa, const void *pb ) {
235
236 symbol **a = (symbol **) pa;
237 symbol **b = (symbol **) pb;
238
239 if (*a < *b) {
240 return -1;
241 } else if (*a > *b) {
242 return 1;
243 } else {
244 return 0;
245 }
246}
247
248
249// Read the clauses.numbers file into the "symbols" array.
250//
251// Each number in the file is a symbol number; it is essential that
252// the first symbol (and no others) be COLLECTIONSTART and the last
253// symbol (and no others) be COLLECTIONEND.
254//
255// Return the number of numbers in the array.
256
257int readNumbers() {
258
259 char filename[FILENAME_MAX];
260 sprintf(filename, "%s/clauses.numbers", collection);
261 if (verbosity) {
262 cout << "Reading numbers file: " << filename << endl;
263 }
264
265 // Open the numbers file
266 ifstream inFile1(filename, ios::in);
267 if (!inFile1) {
268 cerr << "File " << filename << " could not be opened\n";
269 exit(1);
270 }
271
272 // Count the number of symbols
273 inputLength = 0;
274 symbol word;
275 while (inFile1 >> word) {
276 inputLength++;
277 }
278 inFile1.close();
279
280 // Allocate the symbols array
281 if (verbosity > 1) {
282 cout << "Allocating symbol arrays for " << inputLength << " symbols" << endl;
283 }
284 symbols = new symbol[inputLength];
285 if (symbols == NULL) {
286 cerr << "Suffix error: not enough memory to hold " << inputLength
287 << " symbols." << endl;
288 exit(2);
289 }
290
291 // Read the numbers file into the numbers array
292 if (verbosity > 2) {
293 cout << "Reading the numbers" << endl;
294 }
295
296 ifstream inFile2(filename, ios::in);
297 if (!inFile2) {
298 cerr << "File " << filename << " could not be opened\n";
299 exit(1);
300 }
301 cellcount next = 0;
302 numberOfDocuments = 0;
303 while (inFile2 >> word) {
304 symbols[next++] = word;
305 if (word == DOCUMENTSTART) {
306 numberOfDocuments++;
307 }
308 }
309 inFile2.close();
310
311 // Make sure the numbers file is intact
312 assert(symbols[0] == COLLECTIONSTART);
313 assert(symbols[next-1] == COLLECTIONEND);
314
315 return inputLength;
316}
317
318
319
320// Get Document Occurrence statistics
321//
322// Given a phrase, what documents does it occur in?
323
324cellcount getDocumentOccurrances(const Phrase &p, cellcount *frequency) {
325
326 // cout << "searching for \""<< p << "\" in documents "
327 // << 0 << "-" << numberOfDocuments - 1 << endl;
328
329 // The number of documents in which this phrase occurs
330 cellcount df = 0;
331
332 // Initialise the document frequency array
333 // for (cellindex i = 0; i < numberOfDocuments; i++) {
334 // frequency[i] = 0;
335 //}
336 memset(frequency, 0, sizeof(cellcount)*numberOfDocuments);
337
338 // variables used to facilitate the search
339 cellindex begin;
340 cellindex end;
341 cellindex d;
342 symbol *target;
343 bool found;
344
345 // search for the document in which each occurrence of the phrase is found
346 for (cellcount j = p.firstSuffixIndex; j <= p.lastSuffixIndex; j++) {
347
348 // cout << "looking for phrase at suffixArray[" << j << "]\n";
349
350 target = suffixArray[j];
351 begin = 0;
352 end = numberOfDocuments - 1;
353 found = false;
354
355 // Search for the occurrence of a document delimiter that target
356 // occurs immediately after.
357 // We do this by performing a binary chop search on documentArray.
358 while (!found) {
359
360 // cout << "searching for " << (cellindex) target << " in "
361 // << begin << " - " << end << endl;
362
363 assert (begin <= end);
364
365 // If the beginning and end of the interval are the same,
366 // then we've found the correct document
367 if (begin == end) {
368 if (frequency[begin] == 0) {
369 df++;
370 }
371 frequency[begin]++;
372 found = true;
373 }
374
375 // Otherwise, examine a new document midway through the begin-end
376 // interval and see if it is the one.
377 else {
378 d = (begin + end) / 2;
379 if (target > documentArray[d]) {
380 // If the target address is greater than this, but this is the last document,
381 // then this must be the one we want. Or, if target is greater than
382 // this one but less than the next, this must be the one we want.
383 if ((d == numberOfDocuments - 1) || (target < documentArray[d+1])) {
384 if (frequency[d] == 0) {
385 df++;
386 }
387 frequency[d]++;
388 found = true;
389 } else {
390 // otherwise we know to search later in the document set
391 begin = d + 1;
392 }
393 } else {
394 // search earlier in the document set
395 end = d - 1;
396 }
397 }
398 }
399 }
400 return df;
401}
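// Illustrative sketch, excluded from the build: the binary chop above finds
// the last entry of documentArray whose address is <= the target suffix
// pointer, i.e. the document the occurrence falls inside. The same search
// expressed with std::upper_bound over document start offsets; findDocument
// and docStarts are hypothetical names.
#if 0
#include <algorithm>
#include <vector>

// docStarts must be sorted ascending, with docStarts[0] <= pos.
static int findDocument(const std::vector<unsigned long> &docStarts,
                        unsigned long pos) {
  // upper_bound returns the first start strictly greater than pos; the
  // document we want is the one immediately before it.
  std::vector<unsigned long>::const_iterator it =
    std::upper_bound(docStarts.begin(), docStarts.end(), pos);
  return (int) (it - docStarts.begin()) - 1;
}
#endif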
402
403
404
405
406
407
408// phraseExpansionMemory : Which phrases have we expanded?
409//
410// A set of utilities for keeping track of which phrases we have expanded.
411// We don't want to expand a phrase more than once, after all.
412//
413// This REALLY ought to be in its own class, but it works so that's okay.
414//
415// Phrases are identified by their firstSuffixIndex and length.
416//
417// Functions provided are:
418// void initialisePhraseMemory()
419// void rememberThisPhrase(index, length)
420// bool isPhraseStored(index, length)
421// void deletePhraseMemory()
422//
423// Internally, we will have two separate cases:
424//
425// Phrases of length 1-8:
426// unsigned char phraseMemory[inputLength]
427// is an array where each cell "remembers" the corresponding index in the
428// suffixArray, and each of the 8 bits of the cell correspond to the phrases
429// of length 1, 2... 8.
430// Eventually, we will make this disk-based (i.e. store the array in a file).
431//
432// Phrases of length 9+:
433// file hashTableFile
434// file listOfEntries
435// The first file is a hash table; each phrase maps to one of its cells, which
436// contains either 0 (empty, no occurrence) or a number which is an entry number
437// in the second file. This file contains a "list" of entries. Each consists of
438// three numbers: the suffixArray index of the phrase, the length of the phrase,
439// and the entry number of the next phrase with the same hash.
440//
441
442
443unsigned char *phraseMemory;
444
445void initialiseLongPhraseMemory();
446void rememberThisLongPhrase(cellindex index, cellcount length);
447bool isLongPhraseStored(cellindex index, cellcount length);
448void deleteLongPhraseMemory();
449
450
451void initialisePhraseMemory() {
452
453 phraseMemory = new unsigned char[inputLength];
454
455 // to begin with, everything is empty
456 // for (cellcount i = 0; i < inputLength; i++) {
457 // phraseMemory[i] = 0;
458 //}
459 memset(phraseMemory, 0, sizeof(char)*inputLength);
460
461 // initialise the hashTable of long phrases
462 initialiseLongPhraseMemory();
463
464}
465
466void rememberThisPhrase(cellindex index, cellcount length) {
467
468 // if the phrase is very long, use the file-based system
469 if (length > 8) {
470 rememberThisLongPhrase(index, length);
471 return;
472 }
473
474 // create a char with just the bit corresponding to length set
475 unsigned char newbit = 1;
476 for (cellcount i = 1; i < length; i++) {
477 newbit <<= 1;
478 }
479
480 // set that bit in the memory array at position index
481 phraseMemory[index] |= newbit;
482}
483
484
485bool isPhraseStored(cellindex index, cellcount length) {
486
487 // if the phrase is very long, use the file-based system
488 if (length > 8) {
489 return isLongPhraseStored(index, length);
490 }
491
492 // create a char with just the bit corresponding to length set
493 unsigned char newbit = 1;
494 for (cellcount i = 1; i < length; i++) {
495 newbit <<= 1;
496 }
497
498 // return true if that bit is set in the memory array at position index
499 return (phraseMemory[index] & newbit);
500}
501
502void deletePhraseMemory() {
503 delete [] phraseMemory;
504 deleteLongPhraseMemory();
505}
506
507
508
509// Files etc used to store "long" equivalents of the above
510
511fstream hashTableFile;
512char hashTableFileName[FILENAME_MAX];
513fstream listOfEntries;
514char listOfEntriesName[FILENAME_MAX];
515cellindex nextEntryNumber;
516
517const cellcount bigPrime = 7919;
518
519
520void initialiseLongPhraseMemory() {
521
522 cellindex example = 0;
523
524 sprintf(hashTableFileName, "%s/hashTable", collection);
525 sprintf(listOfEntriesName, "%s/hashLists", collection);
526
527
528 // create the new hashtable
529 if (verbosity > 1) {
530 cout << "Initialising hashTable: " << hashTableFileName << endl;
531 }
532 hashTableFile.open(hashTableFileName, ios::in | ios::out);
533 for (cellcount i = 0; i < bigPrime; i++) {
534 hashTableFile.write((char *) &example, sizeof(example));
535 }
536
537 // create the list of phrases
538 if (verbosity > 1) {
539 cout << "Initialising list of hashtable entries: " << listOfEntriesName << endl;
540 }
541 listOfEntries.open(listOfEntriesName, ios::in | ios::out);
542 listOfEntries.write((char *) &example, sizeof(example));
543 listOfEntries.write((char *) &example, sizeof(example));
544 listOfEntries.write((char *) &example, sizeof(example));
545 nextEntryNumber = 1;
546}
547
548
549void rememberThisLongPhrase(cellindex index, cellcount length) {
550
551 // cout << "rememberThisLongPhrase(" << index << ", " << length << ")\n";
552
553 cellindex hashOffset = ((index + length) % bigPrime) * sizeof(cellindex);
554 cellindex pointer;
555 cellindex zero = 0;
556 cellindex readp = 0;
557 cellindex readi = 0;
558 cellindex readl = 0;
559
560 hashTableFile.seekg(hashOffset);
561 hashTableFile.read((char *) &pointer, sizeof(cellindex));
562
563 if (pointer == 0) {
564 // There is no entry at all in the hash table for this entry
565 // so create one
566
567 pointer = nextEntryNumber++;
568 hashTableFile.seekg(hashOffset);
569 hashTableFile.write((char *) &pointer, sizeof(cellindex));
570
571 listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
572 listOfEntries.write((char *) &zero, sizeof(cellindex));
573 listOfEntries.write((char *) &index, sizeof(cellindex));
574 listOfEntries.write((char *) &length, sizeof(cellindex));
575
576 } else {
577 // There is a list starting at this hash value, so the phrase may
578 // be already remembered, or it might need to be appended
579
580 while (pointer != 0) {
581 // Read the entry pointed to by pointer
582 listOfEntries.seekg(pointer * sizeof(cellindex) * 3);
583 listOfEntries.read((char *) &readp, sizeof(cellindex));
584 listOfEntries.read((char *) &readi, sizeof(cellindex));
585 listOfEntries.read((char *) &readl, sizeof(cellindex));
586
587 // cout << "read " << pointer << ", " << readp << ", " << readi << ", " << readl << endl;
588
589 if ((readi == index) && (readl == length)) {
590 // we've found that we've already stored it
591 return;
592 } else if (readp == 0) {
593 // we've reached the end of the list. Add a new entry.
594 listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
595 listOfEntries.write((char *) &nextEntryNumber, sizeof(cellindex));
596 pointer = nextEntryNumber++;
597
598 listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
599 listOfEntries.write((char *) &zero, sizeof(cellindex));
600 listOfEntries.write((char *) &index, sizeof(cellindex));
601 listOfEntries.write((char *) &length, sizeof(cellindex));
602 return;
603 } else {
604 // go on to the next node
605 pointer = readp;
606 }
607 }
608 }
609
610
611}
612
613
614bool isLongPhraseStored(cellindex index, cellcount length) {
615
616 // cout << "isLongPhraseExpanded(" << index << ", " << length << ")\n";
617
618 cellindex hashOffset = ((index + length) % bigPrime) * sizeof(cellindex);
619 cellindex pointer;
620 cellindex readp = 0;
621 cellindex readi = 0;
622 cellindex readl = 0;
623
624 // Find the phrase in the hashFile
625 hashTableFile.seekg(hashOffset);
626 hashTableFile.read((char *) &pointer, sizeof(cellindex));
627
628 if (pointer == 0) {
629 // There is no entry at all in the hash table for this entry
630 // so nothing is stored
631 return false;
632
633 } else {
634 // There is a list starting at this hash value, so the phrase may
635 // be already remembered in that list
636 while (pointer != 0) {
637 // Read the entry pointed to by pointer
638 listOfEntries.seekg(pointer * sizeof(cellindex) * 3);
639 listOfEntries.read((char *) &readp, sizeof(cellindex));
640 listOfEntries.read((char *) &readi, sizeof(cellindex));
641 listOfEntries.read((char *) &readl, sizeof(cellindex));
642
643 if ((readi == index) && (readl == length)) {
644 // we've found the phrase stored here
645 return true;
646 } else {
647 // go on to the next node
648 pointer = readp;
649 }
650 }
651 }
652 return false;
653}
654
655
656void deleteLongPhraseMemory() {
657 // remove the hash & other files
658
659 hashTableFile.close();
660 listOfEntries.close();
661 remove(hashTableFileName);
662 remove(listOfEntriesName);
663
664}
665
666
667// Read the collection statistics file
668//
669void readStatistics() {
670
671 // open the statistics file
672 char filename[FILENAME_MAX];
673 sprintf(filename, "%s/clauses.stats", collection);
674
675 // Open the file
676 ifstream inFile(filename, ios::in);
677 if (!inFile) {
678 cerr << "File " << filename << " could not be opened\n";
679 exit(1);
680 }
681
682 // Read the numbers file into the numbers array
683 char key[1000];
684 symbol value;
685 while (inFile >> key >> value){
686 if (strcmp(key, "first_stopword") == 0) {
687 firstStopSymbol = value;
688 } else if (strcmp(key, "last_stopword") == 0) {
689 lastStopSymbol = value;
690 } else if (strcmp(key, "first_contentword") == 0) {
691 firstContentSymbol = value;
692 } else if (strcmp(key, "last_contentword") == 0) {
693 lastContentSymbol = value;
694 }
695 }
696 inFile.close();
697
698 // Make sure we have the information we need
699 if (!(firstStopSymbol && lastStopSymbol && firstContentSymbol && lastContentSymbol)) {
700 cerr << "Statistics file incomplete" << endl;
701 exit(1);
702 }
703}
704
705cellcount getContentCount(symbol firstContent) {
706
707 cellcount content=0;
708 for (cellcount i=0; i<inputLength; i++) {
709 if (symbols[i]>=firstContent) content++;
710 }
711
712 return content;
713}
714
715
716int main (int argc, char * argv[]) {
717
718 // Command-line arguments
719 // argv[1] is the phindex directory
720 // argv[2] is the mode, where 1 is stopword mode
721 // argv[3] is the verbosity (optional)
722 if (argc < 3) {
723 cerr << "Usage: " << argv[0] << " phind-directory mode [verbosity]" << endl;
724 exit(1);
725 }
726
727 // collection directory
728 strcpy(collection, argv[1]);
729
730 // mode parameter
731 phraseMode = atoi(argv[2]);
732 assert((phraseMode == STOPWORDS) || (phraseMode == ANYPHRASE));
733
734 // optional verbosity parameter
735 if (argc == 4) {
736 verbosity = atoi(argv[3]);
737 assert (verbosity >= 0);
738 }
739
740 if (verbosity) {
741 cout << "suffix: the phrase extraction program" << endl;
742 }
743 if (verbosity > 1) {
744 if (phraseMode == STOPWORDS) {
745 cout << "Stopwords mode: no phrase may begin or end with a stopword" << endl;
746 } else {
747 cout << "AllPhrase mode: extract every phrase that occurs more than once" << endl;
748 }
749 }
750
751 // Read the statistics file
752 readStatistics();
753
754 // Read the numbers file
755 readNumbers();
756
757 if (numberOfDocuments == 0) {
758 cerr << "There are no documents in this collection!" << endl;
759 exit(1);
760 }
761
762 symbol firstContent;
763 if (phraseMode==STOPWORDS) firstContent=firstContentSymbol;
764 else firstContent = firstStopSymbol;
765
766 // Allocate memory for the suffix & prefix arrays
767 cellcount contentLength = 0;
768 contentLength = getContentCount(firstContent);
769 suffixArray = new symbol *[contentLength];
770 prefixArray = new symbol *[contentLength];
771 if (prefixArray == NULL) {
772 cerr << "Suffix: not enough memory to hold " << inputLength << " symbols." << endl;
773 exit(2);
774 }
775 allocateSuffixCheck(contentLength);
776
777 // Initialise prefix and suffix arrays, only use the needed suffixes
778 for (cellcount j = 0, here = 0; j < inputLength; j++) {
779 if (symbols[j]>=firstContent) {
780 suffixArray[here] = &symbols[j];
781 prefixArray[here] = &symbols[j];
782 here++;
783 }
784 }
785 qsort(suffixArray, contentLength, sizeof(symbol *), suffixCompare);
786 qsort(prefixArray, contentLength, sizeof(symbol *), prefixCompare);
787
788 // Create the document arrays
789 if (verbosity > 1) {
790 cout << "Allocating document arrays for " << numberOfDocuments << " documents" << endl;
791 }
792
793 // The document frequency array is used to count the number of times
794 // each phrase occurs in each document. The number of documents in
795 // which a phrase occurs is stored in df.
796 frequency *documentFrequency = new frequency[numberOfDocuments];
797 frequency df;
798
799 // documentArray will be searched in order to discover which document
800 // each phrase occurs in.
801 documentArray = new symbol *[numberOfDocuments];
802 if (documentArray == NULL) {
803 cerr << "Suffix: out of memory allocating document arrays." << endl;
804 exit(2);
805 }
806
807 // just scan through the input text to find the doc starts
808 cellindex d = 0;
809 for (cellindex i=0; i<inputLength; i++) {
810 if (symbols[i] == DOCUMENTSTART) {
811 documentArray[d] = &symbols[i];
812 d++;
813 }
814 }
815
816 // the phrases stuff is expecting inputLength to be the length of the
817 // suffix array, so change it.
818 inputLength = contentLength;
819
820 // Extract phrases
821 //
822 // We will make several passes over the data, in each case considering
823 // a set of input phrases and generating a set of output phrases, which
824 // we will expand in later passes.
825 //
826 // The input phrases in the first pass will be the vocabulary.
827 // In later passes, the input phrases will be the output phrases of the
828 // previous pass.
829 //
830 // In each pass we will consider each input phrase in turn. If we
831 // have seen it before, we will ignore it. Otherwise, we will expand
832 // it and add its expansions to the set of output phrases.
833
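// Illustrative sketch, excluded from the build, of the pass structure
// described above: each pass consumes the (start, length) pairs written by the
// previous pass and writes the new expansions it finds, until a pass produces
// nothing. Item, expandOnce and runPasses are hypothetical stand-ins for the
// outPhrase files and getExpansions().
#if 0
#include <vector>

struct Item { unsigned start; unsigned length; };

static std::vector<Item> expandOnce(const Item &it) {
  std::vector<Item> out;
  if (it.length <= 8) {               // only short phrases are expanded further
    Item longer = { it.start, it.length + 1 };
    out.push_back(longer);            // stand-in for the real expansions
  }
  return out;
}

static void runPasses(std::vector<Item> input) {
  while (!input.empty()) {            // stop when a pass yields no new phrases
    std::vector<Item> output;
    for (size_t i = 0; i < input.size(); ++i) {
      std::vector<Item> expansions = expandOnce(input[i]);
      output.insert(output.end(), expansions.begin(), expansions.end());
    }
    input = output;                   // this pass's output feeds the next pass
  }
}
#endif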
834 cout <<"\ngenerating the phrase hierarchy\n\n";
835
836 // Store the phrase data in the phrases file
837 char phraseDataName[FILENAME_MAX];
838 sprintf(phraseDataName, "%s/phrases", collection);
839 ofstream phraseData(phraseDataName, ios::out);
840 if (!phraseData) {
841 cout << "File " << phraseDataName << " could not be opened\n";
842 exit(1);
843 }
844
845 // Count the number of phrases output
846 unsigned long int phraseCounter = 0;
847
848 // Set up the phrase expansion memory.
849 // We need this so that we don't expand a phrase more than once
850 initialisePhraseMemory();
851
852 // The current pass number
853 int phrasePass = 1;
854
855
856 // PASS NUMBER 1
857 if (verbosity > 1) {
858 cout << "Starting pass " << phrasePass << endl;
859 }
860
861 ofstream outPhrase;
862 char outPhraseName[FILENAME_MAX];
863 unsigned long int outPhraseCounter = 0;
864
865 // On the first pass, simply work through the vocabulary
866 sprintf(outPhraseName, "%s/outPhrase.1", collection);
867 outPhrase.open(outPhraseName, ios::out);
868 if (!outPhrase) {
869 cerr << "File " << outPhraseName << " could not be opened\n";
870 exit(1);
871 }
872
873 // Iterate over the different symbols by working through the suffix array
874 vector<Phrase> result;
875 cellindex ij = 0;
876 char *tmpString;
877
878 Phrase p;
879 while (ij < inputLength) {
880
881 // make a new phrase of length 1
882 p = Phrase(suffixArray[ij], 1, SUFFIX);
883 p.findFirstAndLastSuffix(ij, inputLength-1);
884
885 // We ignore this symbol if it occurs only once, if it is a delimiter,
886 // or if we are in stopwords mode and it is a stopword
887 // - in this new version, only need to check freq
888 // We could imagine a new mode/command-line option, which is like
889 // STOPWORDS but without this restriction. This would let you browse
890 // from "the" to "the AGRIS" for example, but not from "AGRIS" to
891 // "the AGRIS" (where the is a stopword and AGRIS a content word).
892 // The system used to work like this; it is easy to implement, but
893 // it explodes the size of the indexes. So: would it be useful?
894 if (p.suffixFrequency > 1) {
895 // Get minimal expansions of the phrase
896 getExpansions(p, result);
897
898 if (!result.empty()) {
899
900 // Remember that we have expanded this phrase
901 rememberThisPhrase(ij, 1);
902
903 // write the phrase text
904 phraseData << ij << "-1:" << p << ":" << p.suffixFrequency << ":"
905 << result.size() << ":";
906
907 // write the results
908 for (cellcount k = 0; k < result.size(); k++) {
909 if (k) {
910 phraseData << ",";
911 }
912 phraseData << result[k].firstSuffixIndex << "-" << result[k].length;
913 outPhrase << result[k].firstSuffixIndex << " " << result[k].length << endl;
914 outPhraseCounter++;
915 }
916 result.clear();
917
918 // Write the documents in which this phrase occurs
919 df = getDocumentOccurrances(p, documentFrequency);
920 phraseData << ":" << df << ":";
921
922 // write the documents
923 for (cellcount m = 0, first = 1; m < numberOfDocuments; m++) {
924 if (documentFrequency[m]) {
925 if (first) {
926 first = 0;
927 } else {
928 phraseData << ";";
929 }
930 // Output the document number. Note that here we've numbered the
931 // N documents from 0 to N-1, but later they'll be 1-N. Thus we
932 // add 1 to the document id when we output it.
933 phraseData << "d" << (m+1);
934 // Next, output the frequency with which the document occurs, but
935 // only if it is > 1.
936 if (documentFrequency[m] > 1) {
937 phraseData << "," << documentFrequency[m];
938 }
939 }
940 }
941
942 phraseData << endl;
943 phraseCounter++;
944
945 // feedback
946 if (verbosity) {
947 if (phraseCounter % 1000 == 0) {
948 cout << "phrase " << phraseCounter << ": "
949 << "cell " << p.firstSuffixIndex << " - " << p << endl;
950 }
951 }
952 }
953 }
954 ij = p.lastSuffixIndex + 1;
955 }
956 outPhrase.close();
957
958 // REMAINING PASSES
959 // The previous outPhrase file forms the input to each new pass
960 cellcount start, length;
961 while (outPhraseCounter > 0) {
962
963 // Start a new pass
964 phrasePass++;
965 if (verbosity) {
966 cout << "Starting pass " << phrasePass << endl;
967 }
968
969 // Open the input file
970 char inPhraseName[FILENAME_MAX];
971 sprintf(inPhraseName, "%s/outPhrase.%d", collection, phrasePass - 1);
972 ifstream inPhrase (inPhraseName, ios::in);
973 if (!inPhrase) {
974 cerr << "File " << inPhraseName << " could not be opened\n";
975 exit(1);
976 }
977
978 // Open the output file
979 sprintf(outPhraseName, "%s/outPhrase.%d", collection, phrasePass);
980 outPhrase.open(outPhraseName, ios::out);
981 if (!outPhrase) {
982 cerr << "File " << outPhraseName << " could not be opened\n";
983 exit(1);
984 }
985 outPhraseCounter = 0;
986
987 // Process each phrase
988 while(inPhrase >> start >> length) {
989
990 // Ignore the phrase if we have expanded it before
991 if (isPhraseStored(start, length))
992 continue;
993
994 // Remember that we have examined this phrase
995 rememberThisPhrase(start, length);
996
997 // Find the phrase in the suffix array
998 p = Phrase(suffixArray[start], length, SUFFIX);
999 p.findFirstAndLastSuffix(start, inputLength-1);
1000
1001 // Ignore the phrase if it only occurs once
1002 if (p.suffixFrequency < 2)
1003 continue;
1004
1005 // Write the phrase text;
1006 phraseData << start << "-" << length << ":" << p << ":" << p.suffixFrequency << ":";
1007
1008 // Expand the phrase, if it is no more than 8 words long
1009 if (length <= 8) {
1010
1011 // Get the minimal expansions for this phrase
1012 getExpansions(p, result);
1013
1014 // write the results
1015 phraseData << result.size() << ":";
1016
1017 for (cellcount i = 0; i < result.size(); i++) {
1018 if (i) {
1019 phraseData << ",";
1020 }
1021 phraseData << result[i].firstSuffixIndex << "-" << result[i].length;
1022 outPhrase << result[i].firstSuffixIndex << " " << result[i].length << endl;
1023 outPhraseCounter++;
1024 }
1025 result.clear();
1026
1027 } else {
1028 // phrase is too long to expand further
1029 phraseData << "0:";
1030 }
1031
1032 // Write the documents in which this phrase occurs
1033 df = getDocumentOccurrances(p, documentFrequency);
1034 phraseData << ":" << df << ":";
1035
1036 // write the documents
1037 for (cellcount i = 0, first = 1; i < numberOfDocuments; i++) {
1038 if (documentFrequency[i]) {
1039 if (first) {
1040 first = 0;
1041 } else {
1042 phraseData << ";";
1043 }
1044 // Output the document number. Note that here we've numbered the
1045 // N documents from 0 to N-1, but later they'll be 1-N. Thus we
1046 // add 1 to the document id when we output it.
1047 phraseData << "d" << (i+1);
1048 // Next, output the frequency with which the document occurs, but
1049 // only if it is > 1.
1050 if (documentFrequency[i] > 1) {
1051 phraseData << "," << documentFrequency[i];
1052 }
1053 }
1054 }
1055
1056 phraseData << endl;
1057 phraseCounter++;
1058
1059 // feedback
1060 if (verbosity) {
1061 if (phraseCounter % 1000 == 0) {
1062 cout << "phrase " << phraseCounter << ": "<< "start " << start
1063 << ", length " << length << " - " << p << endl;
1064 }
1065 }
1066
1067 }
1068
1069 inPhrase.close();
1070 outPhrase.close();
1071 }
1072
1073 phraseData.close();
1074 deletePhraseMemory();
1075
1076 delete [] documentFrequency;
1077 delete [] symbols;
1078 delete [] suffixArray;
1079 delete [] prefixArray;
1080 delete [] suffixCheck;
1081 delete [] documentArray;
1082
1083
1084
1085 cout << endl << "Done: " << phraseCounter << " phrases in " << phraseDataName << endl;
1086 return 0;
1087}
1088
1089
1090
1091
1092