source: trunk/gsdl/src/phind/generate/suffix.cpp@ 2801

Last change on this file since 2801 was 2801, checked in by kjm18, 23 years ago

new version of suffix, based on suffix2 (Gordon and Craig's simpler version)
with Kath's improvements

  • Property svn:keywords set to Author Date Id Revision
File size: 29.8 KB
1/**********************************************************************
2 *
3 * Suffix.cpp -- Extract the repeated phrases in the input with suffix
4 * and prefix arrays (cgn & gwp's simpler algorithm,
5 * and kjm's improvements).
6 *
7 * Copyright 2000 Gordon W. Paynter
8 * Copyright 2000 The New Zealand Digital Library Project
9 *
10 * A component of the Greenstone digital library software
11 * from the New Zealand Digital Library Project at the
12 * University of Waikato, New Zealand.
13 *
14 * This program is free software; you can redistribute it and/or modify
15 * it under the terms of the GNU General Public License as published by
16 * the Free Software Foundation; either version 2 of the License, or
17 * (at your option) any later version.
18 *
19 * This program is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 * GNU General Public License for more details.
23 *
24 * You should have received a copy of the GNU General Public License
25 * along with this program; if not, write to the Free Software
26 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27 *
28 *********************************************************************/
29
30#include <assert.h>
31#include <math.h>
32#include <stdio.h>
33#include <stdlib.h>
34#include <string.h>
35
36#if defined(GSDL_USE_IOS_H)
37# include <fstream.h>
38# include <iostream.h>
39#else
40# include <fstream>
41# include <iostream>
42#endif
43
44#if defined(GSDL_USE_STL_H)
45# if defined(GSDL_USE_ALGO_H)
46# include <algo.h>
47# else
48# include <algorithm.h>
49# endif
50# include <vector.h>
51#else
52# include <algorithm>
53# include <vector>
54#endif
55
56#include "suffix.h"
57#include "phrase.h"
58
59// Global variables declared in suffix.h
60cellcount inputLength;
61
62symbol *symbols;
63symbol **suffixArray;
64symbol **prefixArray;
65check *suffixCheck;
66
67
68// How many documents are in this collection?
69cellcount numberOfDocuments;
70symbol **documentArray;
71
72
73// Do we accept any phrase, or do we eliminate those that begin or end with stopwords?
74int phraseMode = ANYPHRASE; //STOPWORDS;
75
76
77// The filestem of the collection's phindex directory
78char collection[FILENAME_MAX];
79
80
81// The ranges of the stopword and content-word symbols for the collection
82symbol firstStopSymbol = 0;
83symbol lastStopSymbol = 0;
84symbol firstContentSymbol = 0;
85symbol lastContentSymbol = 0;
86
87
88// Some useful comparison functions, defined below.
89int suffixCompare(const void *, const void *);
90int prefixCompare(const void *, const void *);
91int pointerCompare(const void *, const void *);
92
93
94// Functions for implementing "phrase memory". These let us "remember"
95// each phrase that we've expanded without using too much memory.
96void initialisePhraseMemory();
97void rememberThisPhrase(cellindex index, cellcount length);
98bool isPhraseStored(cellindex index, cellcount length);
99void deletePhraseMemory();
100
101
102// how much output do we want?
103int verbosity = 1;
104
105
106// Get a phrase's expansions
107//
108// Get the set of "minimal", "maximal", non-unique expansions of a
109// phrase p, using the simpler algorithm that Craig and Gordon came up
110// with at Google.
111//
112// The expansions are returned in the results vector.
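// For example (an illustration only, not taken from the original comments):
// if p is the phrase "digital library", its candidate expansions are the
// one-symbol-longer phrases, such as "digital library software" or
// "zealand digital library", that still occur more than once in the input.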
113
114void getExpansions(Phrase &p, vector<Phrase> &results) {
115
116 // 1. Get the initial candidates
117 vector<Phrase> candidates;
118 p.initialSuffixCandidates(candidates);
119 int suffcands = candidates.size();
120 p.initialPrefixCandidates(candidates);
121
122 if (candidates.size() == 0)
123 return;
124
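 // 2. Expand the candidates: the suffix candidates (the first suffcands
 // entries) are extended at the prefix end while the extension is unique,
 // the remaining prefix candidates at the suffix end.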
125 vector<Phrase>::iterator i;
126 for (i = candidates.begin(); i != candidates.end() && suffcands > 0; ++i, --suffcands) {
127 i->expandWhileUniquePrefixExtension();
128 i->ensureSuffixFound();
129 }
130 for (; i != candidates.end(); ++i) {
131 i->expandWhileUniqueSuffixExtension();
132 }
133
134 // 3. Ensure minimality: ignore phrases whose subphrases are also found
135 results.clear();
136
137 // Initialise the candidates, check array, and various variables.
138 sort(candidates.begin(), candidates.end(), isShorter);
139 for (cellcount j = 0; j < inputLength; j++)
140 suffixCheck[j] = 0;
141 unsigned minimum_length = candidates.begin()->length;
142
143 // Try to add each candidate to the results set, ignoring the non-minimal
144 for (vector<Phrase>::iterator candidate = candidates.begin();
145 candidate != candidates.end(); candidate++) {
146
147 // Make a copy of candidate to mutilate while performing sub-phrase checks
148 Phrase temp_phrase(*candidate);
149 bool shorter_found = false;
150
151 // Check for shorter and shorter versions of the temporary phrase
152 while (temp_phrase.length >= minimum_length && !shorter_found) {
153 temp_phrase.ensureSuffixFound();
154 if (suffixCheck[temp_phrase.firstSuffixIndex] == 0)
155 temp_phrase.shortenByOneAtPrefix();
156 else
157 shorter_found = true;
158
159 // Possible efficiency gain here: we can finish early if the prefixes of
160 // candidate and temp_phrase are the same for candidate->length symbols.
161 }
162
163 if (!shorter_found) {
164 results.push_back(*candidate);
165 candidate->ensureSuffixFound();
166 for (cellcount k = candidate->firstSuffixIndex; k <= candidate->lastSuffixIndex; ++k)
167 suffixCheck[k] = candidate->length;
168 }
169 }
170}
171
172
173// suffixCompare
174//
175// Compare two pointers into a suffix array. We use this in the
176 // qsort function, so the inputs are pointers to pointers.
177 //
178 // Return -1 if (a < b); otherwise (a > b), so return +1.
179
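// (The scan below cannot run off the end of the input: the final
// COLLECTIONEND symbol occurs only once, so two distinct suffixes must
// differ at or before it.)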
180int suffixCompare(const void *cpa, const void *cpb) {
181
182 // Cast then dereference pointers to suffix array elements
183 symbol *pa = (symbol *) cpa;
184 symbol *pb = (symbol *) cpb;
185 pa = (symbol *) *pa;
186 pb = (symbol *) *pb;
187
188 // If the two elements are the same, examine the next one
189 while (*pa == *pb) {
190 pa++;
191 pb++;
192 }
193
194 // Make the comparison and return
195 if ( *pa < *pb) {
196 return -1;
197 } else {
198 return +1;
199 }
200}
201
202
203// prefixCompare
204//
205// Compare two pointers into a prefix array. We use this in the
206 // qsort function, so the inputs are pointers to pointers.
207 //
208 // Return -1 if (a > b); otherwise (a < b), so return +1.
209
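// (The backwards scan below cannot run off the start of the input: the
// initial COLLECTIONSTART symbol occurs only once, so two distinct prefixes
// must differ before the scan passes it.)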
210int prefixCompare(const void *cpa, const void *cpb) {
211
212 // Cast then dereference pointers to prefix array elements
213 symbol *pa = (symbol *) cpa;
214 symbol *pb = (symbol *) cpb;
215 pa = (symbol *) *pa;
216 pb = (symbol *) *pb;
217
218 // If the two elements are the same, examine the next one
219 while (*pa == *pb) {
220 pa--;
221 pb--;
222 }
223
224 // Make the comparison and return
225 if ( *pa > *pb) {
226 return -1;
227 } else {
228 return +1;
229 }
230}
231
232// pointerCompare
233//
234// Compare two pointers based on the memory location they point to.
235
236int pointerCompare( const void *pa, const void *pb ) {
237
238 symbol **a = (symbol **) pa;
239 symbol **b = (symbol **) pb;
240
241 if (*a < *b) {
242 return -1;
243 } else if (*a > *b) {
244 return 1;
245 } else {
246 return 0;
247 }
248}
249
250
251// Read the clauses.numbers file into the "symbols" array.
252//
253// Each number in the file is a symbol number; it is essential that
254// the first symbol (and no others) be COLLECTIONSTART and the last
255// symbol (and no others) be COLLECTIONEND.
256//
257// Return the number of numbers in the array.
258
259int readNumbers() {
260
261 char filename[FILENAME_MAX];
262 sprintf(filename, "%s/clauses.numbers", collection);
263 if (verbosity) {
264 cout << "Reading numbers file: " << filename << endl;
265 }
266
267 // Open the numbers file
268 ifstream inFile1(filename, ios::in);
269 if (!inFile1) {
270 cerr << "File " << filename << " could not be opened\n";
271 exit(1);
272 }
273
274 // Count the number of symbols
275 inputLength = 0;
276 symbol word;
277 while (inFile1 >> word) {
278 inputLength++;
279 }
280 inFile1.close();
281
282 // Allocate the symbols array
283 if (verbosity > 1) {
284 cout << "Allocating symbol arrays for " << inputLength << " symbols" << endl;
285 }
286 symbols = new symbol[inputLength];
287 if (symbols == NULL) {
288 cerr << "Suffix error: not enough memory to hold " << inputLength
289 << " symbols." << endl;
290 exit(2);
291 }
292
293 // Read the numbers file into the numbers array
294 if (verbosity > 2) {
295 cout << "Reading the numbers" << endl;
296 }
297
298 ifstream inFile2(filename, ios::in);
299 if (!inFile2) {
300 cerr << "File " << filename << " could not be opened\n";
301 exit(1);
302 }
303 cellcount next = 0;
304 numberOfDocuments = 0;
305 while (inFile2 >> word) {
306 symbols[next++] = word;
307 if (word == DOCUMENTSTART) {
308 numberOfDocuments++;
309 }
310 }
311 inFile2.close();
312
313 // Make sure the numbers file is intact
314 assert(symbols[0] == COLLECTIONSTART);
315 assert(symbols[next-1] == COLLECTIONEND);
316
317 return inputLength;
318}
319
320
321
322// Get document occurrence statistics
323//
324// Given a phrase, what documents does it occur in?
325
326cellcount getDocumentOccurrances(const Phrase &p, cellcount *frequency) {
327
328 // cout << "searching for \""<< p << "\" in documents "
329 // << 0 << "-" << numberOfDocuments - 1 << endl;
330
331 // The number of documents in which this phrase occurs
332 cellcount df = 0;
333
334 // Initialise the document frequency array
335 for (cellindex i = 0; i < numberOfDocuments; i++) {
336 frequency[i] = 0;
337 }
338
339 // variables used to facilitate the search
340 cellindex begin;
341 cellindex end;
342 cellindex d;
343 symbol *target;
344 bool found;
345
346 // search for the document in which each occurrence of the phrase is found
347 for (cellcount j = p.firstSuffixIndex; j <= p.lastSuffixIndex; j++) {
348
349 // cout << "looking for phrase at suffixArray[" << j << "]\n";
350
351 target = suffixArray[j];
352 begin = 0;
353 end = numberOfDocuments - 1;
354 found = false;
355
356 // Search for the occurrence of a document delimiter that target
357 // occurs immediately after.
358 // We do this by performing a binary chop search on documentArray.
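 // (documentArray holds the addresses of the DOCUMENTSTART symbols in
 // increasing order, so comparing target against these delimiter addresses
 // identifies the document that contains it.)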
359 while (!found) {
360
361 // cout << "searching for " << (cellindex) target << " in "
362 // << begin << " - " << end << endl;
363
364 assert (begin <= end);
365
366 // If the beginning and end of the interval are the same,
367 // then we've found the correct document
368 if (begin == end) {
369 if (frequency[begin] == 0) {
370 df++;
371 }
372 frequency[begin]++;
373 found = true;
374 }
375
376 // Otherwise, examine a new document midway through the begin-end
377 // interval and see if it is the one.
378 else {
379 d = (begin + end) / 2;
380 if (target > documentArray[d]) {
381 // If the target address is greater than this, but this is the last document,
382 // then this must be the one we want. Or, if target is greater than
383 // this one but less than the next, this must be the one we want.
384 if ((d == numberOfDocuments - 1) || (target < documentArray[d+1])) {
385 if (frequency[d] == 0) {
386 df++;
387 }
388 frequency[d]++;
389 found = true;
390 } else {
391 // otherwise we know to search later in the document set
392 begin = d + 1;
393 }
394 } else {
395 // search earlier in the document set
396 end = d - 1;
397 }
398 }
399 }
400 }
401 return df;
402}
403
404
405
406
407
408
409// phraseExpansionMemory : Which phrases have we expanded?
410//
411// A set of utilities for keeping track of which phrases we have expanded.
412// We don't want to expand a phrase more than once, after all.
413//
414// This REALLY ought to be in its own class, but it works so that's okay.
415//
416// Phrases are identified by their firstSuffixIndex and length.
417//
418// Functions provided are:
419// void initialisePhraseMemory()
420// void rememberThisPhrase(index, length)
421// bool isPhraseStored(index, length)
422// void deletePhraseMemory()
423//
424// Internally, we will have two separate cases:
425//
426// Phrases of length 1-8:
427// unsigned char phraseMemory[inputLength]
428// is an array where each cell "remembers" the corresponding index in the
429 // suffixArray, and each of the 8 bits of the cell corresponds to the phrases
430// of length 1, 2... 8.
431// Eventually, we will make this disk-based (i.e. store the array in a file).
432//
433// Phrases of length 9+:
434// file hashTableFile
435// file listOfEntries
436// The first file is a hash table; each phrase maps to one of its cells, which
437 // contains either 0 (empty, no occurrence) or a number which is an entry number
438// in the second file. This file contains a "list" of entries. Each consists of
439// three numbers: the suffixArray index of the phrase, the length of the phrase,
440// and the entry number of the next phrase with the same hash.
441//
442
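// A sketch of the on-disk record layout, derived from the code below
// (illustrative only; the original code reads and writes the three
// cellindex fields directly rather than through a struct):
//
//   // one entry in listOfEntries, at byte offset entryNumber * 3 * sizeof(cellindex)
//   struct LongPhraseEntry {
//     cellindex next;    // entry number of the next phrase with the same hash, 0 = end of list
//     cellindex index;   // suffixArray index of the phrase
//     cellindex length;  // length of the phrase in symbols
//   };
//
//   // cell h of hashTableFile, at byte offset h * sizeof(cellindex), holds 0 (empty)
//   // or the entry number of the first LongPhraseEntry whose phrase hashes to
//   //   h = (index + length) % bigPrime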
443
444unsigned char *phraseMemory;
445
446void initialiseLongPhraseMemory();
447void rememberThisLongPhrase(cellindex index, cellcount length);
448bool isLongPhraseStored(cellindex index, cellcount length);
449void deleteLongPhraseMemory();
450
451
452void initialisePhraseMemory() {
453
454 phraseMemory = new unsigned char[inputLength];
455
456 // to begin with, everything is empty
457 for (cellcount i = 0; i < inputLength; i++) {
458 phraseMemory[i] = 0;
459 }
460
461 // initialise the hashTable of long phrases
462 initialiseLongPhraseMemory();
463
464}
465
466void rememberThisPhrase(cellindex index, cellcount length) {
467
468 // if the phrase is very long, use the file-based system
469 if (length > 8) {
470 rememberThisLongPhrase(index, length);
471 return;
472 }
473
474 // create a char with just the bit corresponding to length set
475 unsigned char newbit = 1;
476 for (cellcount i = 1; i < length; i++) {
477 newbit <<= 1;
478 }
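 // (The loop above computes newbit == 1 << (length - 1): bit 0 marks a
 // phrase of length 1 at this index, bit 7 a phrase of length 8.)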
479
480 // set that bit in the memory array at position index
481 phraseMemory[index] |= newbit;
482}
483
484
485bool isPhraseStored(cellindex index, cellcount length) {
486
487 // if the phrase is very long, use the file-based system
488 if (length > 8) {
489 return isLongPhraseStored(index, length);
490 }
491
492 // create a char with just the bit corresponding to length set
493 unsigned char newbit = 1;
494 for (cellcount i = 1; i < length; i++) {
495 newbit <<= 1;
496 }
497
498 // return true if that bit is set in the memory array at position index
499 return (phraseMemory[index] & newbit);
500}
501
502void deletePhraseMemory() {
503 delete [] phraseMemory;
504 deleteLongPhraseMemory();
505}
506
507
508
509// Files etc. used to store "long" equivalents of the above
510
511fstream hashTableFile;
512char hashTableFileName[FILENAME_MAX];
513fstream listOfEntries;
514char listOfEntriesName[FILENAME_MAX];
515cellindex nextEntryNumber;
516
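// The number of cells in the on-disk hash table; presumably chosen prime so
// that the (index + length) % bigPrime hash spreads entries evenly.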
517const cellcount bigPrime = 7919;
518
519
520void initialiseLongPhraseMemory() {
521
522 cellindex example = 0;
523
524 sprintf(hashTableFileName, "%s/hashTable", collection);
525 sprintf(listOfEntriesName, "%s/hashLists", collection);
526
527
528 // create the new hashtable
529 if (verbosity > 1) {
530 cout << "Initialising hashTable: " << hashTableFileName << endl;
531 }
532 hashTableFile.open(hashTableFileName, ios::in | ios::out);
533 for (cellcount i = 0; i < bigPrime; i++) {
534 hashTableFile.write((char *) &example, sizeof(example));
535 }
536
537 // create the list of phrases
538 if (verbosity > 1) {
539 cout << "Initialising list of hashtable entries: " << listOfEntriesName << endl;
540 }
541 listOfEntries.open(listOfEntriesName, ios::in | ios::out);
542 listOfEntries.write((char *) &example, sizeof(example));
543 listOfEntries.write((char *) &example, sizeof(example));
544 listOfEntries.write((char *) &example, sizeof(example));
545 nextEntryNumber = 1;
546}
547
548
549void rememberThisLongPhrase(cellindex index, cellcount length) {
550
551 // cout << "rememberThisLongPhrase(" << index << ", " << length << ")\n";
552
553 cellindex hashOffset = ((index + length) % bigPrime) * sizeof(cellindex);
554 cellindex pointer;
555 cellindex zero = 0;
556 cellindex readp = 0;
557 cellindex readi = 0;
558 cellindex readl = 0;
559
560 hashTableFile.seekg(hashOffset);
561 hashTableFile.read((char *) &pointer, sizeof(cellindex));
562
563 if (pointer == 0) {
564 // There is no entry at all in the hash table for this phrase
565 // so create one
566
567 pointer = nextEntryNumber++;
568 hashTableFile.seekp(hashOffset);
569 hashTableFile.write((char *) &pointer, sizeof(cellindex));
570
571 listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
572 listOfEntries.write((char *) &zero, sizeof(cellindex));
573 listOfEntries.write((char *) &index, sizeof(cellindex));
574 listOfEntries.write((char *) &length, sizeof(cellindex));
575
576 } else {
577 // There is a list starting at this hash value, so the phrase may
578 // be already remembered, or it might need to be appended
579
580 while (pointer != 0) {
581 // Read the entry pointed to by pointer
582 listOfEntries.seekg(pointer * sizeof(cellindex) * 3);
583 listOfEntries.read((char *) &readp, sizeof(cellindex));
584 listOfEntries.read((char *) &readi, sizeof(cellindex));
585 listOfEntries.read((char *) &readl, sizeof(cellindex));
586
587 // cout << "read " << pointer << ", " << readp << ", " << readi << ", " << readl << endl;
588
589 if ((readi == index) && (readl == length)) {
590 // we've found that we've already stored it
591 return;
592 } else if (readp == 0) {
593 // we've reached the end of the list. Add a new entry.
594 listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
595 listOfEntries.write((char *) &nextEntryNumber, sizeof(cellindex));
596 pointer = nextEntryNumber++;
597
598 listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
599 listOfEntries.write((char *) &zero, sizeof(cellindex));
600 listOfEntries.write((char *) &index, sizeof(cellindex));
601 listOfEntries.write((char *) &length, sizeof(cellindex));
602 return;
603 } else {
604 // go on to the next node
605 pointer = readp;
606 }
607 }
608 }
609
610
611}
612
613
614bool isLongPhraseStored(cellindex index, cellcount length) {
615
616 // cout << "isLongPhraseExpanded(" << index << ", " << length << ")\n";
617
618 cellindex hashOffset = ((index + length) % bigPrime) * sizeof(cellindex);
619 cellindex pointer;
620 cellindex readp = 0;
621 cellindex readi = 0;
622 cellindex readl = 0;
623
624 // Find the phrase in the hashFile
625 hashTableFile.seekg(hashOffset);
626 hashTableFile.read((char *) &pointer, sizeof(cellindex));
627
628 if (pointer == 0) {
629 // There is no entry at all in the hash table for this phrase
630 // so nothing is stored
631 return false;
632
633 } else {
634 // There is a list starting at this hash value, so the phrase may
635 // be already remembered in that list
636 while (pointer != 0) {
637 // Read the entry pointed to by pointer
638 listOfEntries.seekg(pointer * sizeof(cellindex) * 3);
639 listOfEntries.read((char *) &readp, sizeof(cellindex));
640 listOfEntries.read((char *) &readi, sizeof(cellindex));
641 listOfEntries.read((char *) &readl, sizeof(cellindex));
642
643 if ((readi == index) && (readl == length)) {
644 // we've found the phrase stored here
645 return true;
646 } else {
647 // go on to the next node
648 pointer = readp;
649 }
650 }
651 }
652 return false;
653}
654
655
656void deleteLongPhraseMemory() {
657 // remove the hash & other files
658
659 hashTableFile.close();
660 listOfEntries.close();
661 remove(hashTableFileName);
662 remove(listOfEntriesName);
663
664}
665
666
667// Read the collection statistics file
668//
669void readStatistics() {
670
671 // open the statistics file
672 char filename[FILENAME_MAX];
673 sprintf(filename, "%s/clauses.stats", collection);
674
675 // Open the file
676 ifstream inFile(filename, ios::in);
677 if (!inFile) {
678 cerr << "File " << filename << " could not be opened\n";
679 exit(1);
680 }
681
682 // Read the numbers file into the numbers array
683 char key[1000];
684 symbol value;
685 while (inFile >> key >> value){
686 if (strcmp(key, "first_stopword") == 0) {
687 firstStopSymbol = value;
688 } else if (strcmp(key, "last_stopword") == 0) {
689 lastStopSymbol = value;
690 } else if (strcmp(key, "first_contentword") == 0) {
691 firstContentSymbol = value;
692 } else if (strcmp(key, "last_contentword") == 0) {
693 lastContentSymbol = value;
694 }
695 }
696 inFile.close();
697
698 // Make sure we have the information we need
699 if (!(firstStopSymbol && lastStopSymbol && firstContentSymbol && lastContentSymbol)) {
700 cerr << "Statistics file incomplete" << endl;
701 exit(1);
702 }
703}
704
705cellcount getContentCount(symbol firstContent) {
706
707 cellcount content=0;
708 for (cellcount i=0; i<inputLength; i++) {
709 if (symbols[i]>=firstContent) content++;
710 }
711
712 return content;
713}
714
715int main (int argc, char * argv[]) {
716
717 // Command-line arguments
718 // argv[1] is the phindex directory
719 // argv[2] is the phrase mode (STOPWORDS or ANYPHRASE)
720 // argv[3] is the verbosity (optional)
721 if (argc < 3) {
722 cerr << "Usage: " << argv[0] << " phind-directory mode [verbosity]" << endl;
723 exit(1);
724 }
725
726 // collection directory
727 strcpy(collection, argv[1]);
728
729 // mode parameter
730 phraseMode = atoi(argv[2]);
731 assert((phraseMode == STOPWORDS) || (phraseMode == ANYPHRASE));
732
733 // optional verbosity parameter
734 if (argc == 4) {
735 verbosity = atoi(argv[3]);
736 assert (verbosity >= 0);
737 }
738
739 if (verbosity) {
740 cout << "suffix: the phrase extraction program" << endl;
741 }
742
743 if (verbosity > 1) {
744 if (phraseMode == STOPWORDS) {
745 cout << "Stopwords mode: no phrase may begin or end with a stopword" << endl;
746 } else {
747 cout << "AllPhrase mode: extract every phrase that occurs more than once" << endl;
748 }
749 }
750
751 // Read the statistics file
752 readStatistics();
753
754 // Read the numbers file
755 readNumbers();
756
757 if (numberOfDocuments == 0) {
758 cerr << "There are no documents in this collection!" << endl;
759 exit(1);
760 }
761
762 symbol firstContent;
763 if (phraseMode==STOPWORDS) firstContent=firstContentSymbol;
764 else firstContent = firstStopSymbol;
765
766 cellcount contentLength = 0;
767 contentLength = getContentCount(firstContent);
768
769 // Create the suffix & prefix arrays
770 suffixArray = new symbol *[contentLength];
771 prefixArray = new symbol *[contentLength];
772
773 cellcount here=0;
774 // Initialise prefix and suffix arrays, only use the needed suffixes
775 for (cellcount j = 0; j < inputLength; j++) {
776 if (symbols[j]>=firstContent) {
777 suffixArray[here] = &symbols[j];
778 prefixArray[here] = &symbols[j];
779 here++;
780 }
781 }
782 qsort(suffixArray, contentLength, sizeof(symbol *), suffixCompare);
783 qsort(prefixArray, contentLength, sizeof(symbol *), prefixCompare);
784
785 suffixCheck = new check[contentLength];
786 if (suffixCheck == NULL) {
787 cerr << "Suffix error: not enough memory to hold " << contentLength << " check cells." << endl;
788 exit(2);
789 }
790 for (cellcount j = 0; j < contentLength; j++)
791 suffixCheck[j] = 0;
792
793 cout <<"\ngenerating the phrase hierarchy\n\n";
794
795 // Create the document arrays
796 if (verbosity > 1) {
797 cout << "Allocating document arrays for " << numberOfDocuments << " documents" << endl;
798 }
799
800 // The document frequency array is used to count the number of times
801 // each phrase occurs in each document. The number of documents in
802 // which a phrase occurs is stored in df.
803 frequency *documentFrequency = new frequency[numberOfDocuments];
804 frequency df;
805
806 // documentArray will be searched in order to discover which document
807 // each phrase occurs in.
808 documentArray = new symbol *[numberOfDocuments];
809
810 // just scan through the input text to find the doc starts
811 cellindex d = 0;
812 for (cellindex i=0; i<inputLength; i++) {
813 if (symbols[i] == DOCUMENTSTART) {
814 documentArray[d] = &symbols[i];
815 d++;
816 }
817 }
818
819 // the phrases stuff is expecting inputLength to be the length of the
820 // suffix array, so change it.
821 inputLength = contentLength;
822
823 // Extract phrases
824 //
825 // We will make several passes over the data, in each case considering
826 // a set of input phrases and generating a set of output phrases, which
827 // we will expand in later passes.
828 //
829 // The input phrases in the first pass will be the vocabulary.
830 // In later passes, the input phrases will be the output phrases of the
831 // previous pass.
832 //
833 // In each pass we will consider each input phrase in turn. If we
834 // have seen it before, we will ignore it. Otherwise, we will expand
835 // it and add its expansions to the set of output phrases.
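 //
 // Sketch of the resulting data flow (summarising the code below): pass 1
 // works through the vocabulary in suffixArray and writes its expansions to
 // outPhrase.1; each later pass n reads outPhrase.(n-1), expands every phrase
 // it has not seen before, and writes the new expansions to outPhrase.n.
 // Each pass also writes one record per retained phrase to the "phrases" file.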
836
837 // Store the phrase data in the phrases file
838 char phraseDataName[FILENAME_MAX];
839 sprintf(phraseDataName, "%s/phrases", collection);
840 ofstream phraseData(phraseDataName, ios::out);
841 if (!phraseData) {
842 cerr << "File " << phraseDataName << " could not be opened\n";
843 exit(1);
844 }
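 // Each record written to the phrases file below has roughly the form
 // (derived from the output statements that follow):
 //
 //   index-length:phrase text:frequency:n:exp1,...,expn:df:dA[,fA];dB[,fB];...
 //
 // where each expansion is "suffixIndex-length", df is the number of
 // documents containing the phrase, and each document entry is "d" plus the
 // document number, followed by ",frequency" only when that frequency
 // exceeds 1.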
845
846 // Count the number of phrases output
847 unsigned long int phraseCounter = 0;
848
849 // Set up the phrase expansion memory.
850 // We need this so that we don't expand a phrase more than once
851 initialisePhraseMemory();
852
853 // The current pass number
854 int phrasePass = 1;
855
856
857 // PASS NUMBER 1
858 if (verbosity > 1) {
859 cout << "Starting pass " << phrasePass << endl;
860 }
861
862 ofstream outPhrase;
863 char outPhraseName[FILENAME_MAX];
864 unsigned long int outPhraseCounter = 0;
865
866 // On the first pass, simply work through the vocabulary
867 sprintf(outPhraseName, "%s/outPhrase.1", collection);
868 outPhrase.open(outPhraseName, ios::out);
869 if (!outPhrase) {
870 cerr << "File " << outPhraseName << " could not be opened\n";
871 exit(1);
872 }
873
874 // Iterate over the different symbols by working through the suffix array
875 vector<Phrase> result;
876 cellindex ij = 0;
877 char *tmpString;
878
879 Phrase p;
880 while (ij < inputLength) {
881
882 // make a new phrase of length 1
883 p = Phrase(suffixArray[ij], 1, SUFFIX);
884 p.findFirstAndLastSuffix(ij, inputLength-1);
885
886 // We ignore this symbol if it occurs only once, if it is a delimiter,
887 // or if we are in stopwords mode and it is a stopword
888 // - in this new version, only need to check freq
889 // We could imagine a new mode/command-line option, which is like
890 // STOPWORDS but without this restriction. This would let you browse
891 // from "the" to "the AGRIS" for example, but not from "AGRIS" to
892 // "the AGRIS" (where the is a stopword and AGRIS a content word).
893 // The system used to work like this; it is easy to implement, but
894 // it explodes the size of the indexes. So: would it be useful?
895 if (p.suffixFrequency > 1) {
896 // Get minimal expansions of the phrase
897 getExpansions(p, result);
898
899 if (!result.empty()) {
900
901 // Remember that we have expanded this phrase
902 rememberThisPhrase(ij, 1);
903
904 // write the phrase text
905 phraseData << ij << "-1:" << p << ":" << p.suffixFrequency << ":"
906 << result.size() << ":";
907
908 // write the results
909 for (cellcount k = 0; k < result.size(); k++) {
910 if (k) {
911 phraseData << ",";
912 }
913 phraseData << result[k].firstSuffixIndex << "-" << result[k].length;
914 outPhrase << result[k].firstSuffixIndex << " " << result[k].length << endl;
915 outPhraseCounter++;
916 }
917 result.clear();
918
919 // Write the documents in which this phrase occurs
920 df = getDocumentOccurrances(p, documentFrequency);
921 phraseData << ":" << df << ":";
922
923 // write the documents
924 for (cellcount m = 0, first = 1; m < numberOfDocuments; m++) {
925 if (documentFrequency[m]) {
926 if (first) {
927 first = 0;
928 } else {
929 phraseData << ";";
930 }
931 // Output the document number. Note that here we've numbered the
932 // N documents from 0 to N-1, but later they'll be 1-N. Thus we
933 // add 1 to the document id when we output it.
934 phraseData << "d" << (m+1);
935 // Next, output the frequency with which the document occurs, but
936 // only if it is > 1.
937 if (documentFrequency[m] > 1) {
938 phraseData << "," << documentFrequency[m];
939 }
940 }
941 }
942
943 phraseData << endl;
944 phraseCounter++;
945
946 // feedback
947 if (verbosity) {
948 if (phraseCounter % 1000 == 0) {
949 cout << "phrase " << phraseCounter << ": "
950 << "cell " << p.firstSuffixIndex << " - " << p << endl;
951 }
952 }
953 }
954 }
955 ij = p.lastSuffixIndex + 1;
956 }
957 outPhrase.close();
958
959 // REMAINING PASSES
960 // The previous outPhrase file forms the input to each new pass
961 cellcount start, length;
962 while (outPhraseCounter > 0) {
963
964 // Start a new pass
965 phrasePass++;
966 if (verbosity) {
967 cout << "Starting pass " << phrasePass << endl;
968 }
969
970 // Open the input file
971 char inPhraseName[FILENAME_MAX];
972 sprintf(inPhraseName, "%s/outPhrase.%d", collection, phrasePass - 1);
973 ifstream inPhrase (inPhraseName, ios::in);
974 if (!inPhrase) {
975 cerr << "File " << inPhraseName << " could not be opened\n";
976 exit(1);
977 }
978
979 // Open the output file
980 sprintf(outPhraseName, "%s/outPhrase.%d", collection, phrasePass);
981 outPhrase.open(outPhraseName, ios::out);
982 if (!outPhrase) {
983 cerr << "File " << outPhraseName << " could not be opened\n";
984 exit(1);
985 }
986 outPhraseCounter = 0;
987
988 // Process each phrase
989 while(inPhrase >> start >> length) {
990
991 // Ignore the phrase if we have expanded it before
992 if (isPhraseStored(start, length))
993 continue;
994
995 // Remember that we have examined this phrase
996 rememberThisPhrase(start, length);
997
998 // Find the phrase in the suffix array
999 p = Phrase(suffixArray[start], length, SUFFIX);
1000 p.findFirstAndLastSuffix(start, inputLength-1);
1001
1002 // Ignore the phrase if it only occurs once
1003 if (p.suffixFrequency < 2)
1004 continue;
1005
1006 // Write the phrase text;
1007 phraseData << start << "-" << length << ":" << p << ":" << p.suffixFrequency << ":";
1008
1009 // Expand the phrase, if it is eight or fewer words long
1010 if (length <= 8) {
1011
1012 // Get the minimal expansions for this phrase
1013 getExpansions(p, result);
1014
1015 // write the results
1016 phraseData << result.size() << ":";
1017
1018 for (cellcount i = 0; i < result.size(); i++) {
1019 if (i) {
1020 phraseData << ",";
1021 }
1022 phraseData << result[i].firstSuffixIndex << "-" << result[i].length;
1023 outPhrase << result[i].firstSuffixIndex << " " << result[i].length << endl;
1024 outPhraseCounter++;
1025 }
1026 result.clear();
1027
1028 } else {
1029 // phrase is too long to expand further
1030 phraseData << "0:";
1031 }
1032
1033 // Write the documents in which this phrase occurs
1034 df = getDocumentOccurrances(p, documentFrequency);
1035 phraseData << ":" << df << ":";
1036
1037 // write the documents
1038 for (cellcount i = 0, first = 1; i < numberOfDocuments; i++) {
1039 if (documentFrequency[i]) {
1040 if (first) {
1041 first = 0;
1042 } else {
1043 phraseData << ";";
1044 }
1045 // Output the document number. Note that here we've numbered the
1046 // N documents from 0 to N-1, but later they'll be 1-N. Thus we
1047 // add 1 to the document id when we output it.
1048 phraseData << "d" << (i+1);
1049 // Next, output the frequency with which the document occurs, but
1050 // only if it is > 1.
1051 if (documentFrequency[i] > 1) {
1052 phraseData << "," << documentFrequency[i];
1053 }
1054 }
1055 }
1056
1057 phraseData << endl;
1058 phraseCounter++;
1059
1060 // feedback
1061 if (verbosity) {
1062 if (phraseCounter % 1000 == 0) {
1063 cout << "phrase " << phraseCounter << ": "<< "start " << start
1064 << ", length " << length << " - " << p << endl;
1065 }
1066 }
1067
1068 }
1069
1070 inPhrase.close();
1071 outPhrase.close();
1072 }
1073
1074 phraseData.close();
1075 deletePhraseMemory();
1076
1077 delete [] documentFrequency;
1078 delete [] symbols;
1079 delete [] suffixArray;
1080 delete [] prefixArray;
1081 delete [] suffixCheck;
1082 delete [] documentArray;
1083
1084
1085
1086 cout << endl << "Done: " << phraseCounter << " phrases in " << phraseDataName << endl;
1087 return 0;
1088}
1089
1090
1091
1092
1093