source: trunk/gsdl/src/phind/generate/suffix.cpp@ 2806

Last change on this file since 2806 was 2806, checked in by kjm18, 22 years ago

changed initialisation loops to memset

  • Property svn:keywords set to Author Date Id Revision
File size: 30.1 KB
1/**********************************************************************
2 *
3 * Suffix.cpp -- Extract the repeated phrases in the input with suffix
4 * and prefix arrays (cgn & gwp's simpler algorithm,
5 * and kjm's improvements).
6 *
7 * Copyright 2000 Gordon W. Paynter
8 * Copyright 2000 The New Zealand Digital Library Project
9 *
10 * A component of the Greenstone digital library software
11 * from the New Zealand Digital Library Project at the
12 * University of Waikato, New Zealand.
13 *
14 * This program is free software; you can redistribute it and/or modify
15 * it under the terms of the GNU General Public License as published by
16 * the Free Software Foundation; either version 2 of the License, or
17 * (at your option) any later version.
18 *
19 * This program is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 * GNU General Public License for more details.
23 *
24 * You should have received a copy of the GNU General Public License
25 * along with this program; if not, write to the Free Software
26 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27 *
28 *********************************************************************/
29
30#include <assert.h>
31#include <math.h>
32#include <stdio.h>
33#include <stdlib.h>
34#include <string.h>
35
36#if defined(GSDL_USE_IOS_H)
37# include <fstream.h>
38# include <iostream.h>
39#else
40# include <fstream>
41# include <iostream>
42#endif
43
44#if defined(GSDL_USE_STL_H)
45# if defined(GSDL_USE_ALGO_H)
46# include <algo.h>
47# else
48# include <algorithm.h>
49# endif
50# include <vector.h>
51#else
52# include <algorithm>
53# include <vector>
54#endif
55
56#include "suffix.h"
57#include "phrase.h"
58
59// Global variables declared in suffix.h
60cellcount inputLength;
61
62symbol *symbols;
63symbol **suffixArray;
64symbol **prefixArray;
65check *suffixCheck;
66
67
68// How many documents are in this collection?
69cellcount numberOfDocuments;
70symbol **documentArray;
71
72
73// Do we accept any phrase, or do we eliminate those ending with stopwords ?
74int phraseMode = ANYPHRASE; //STOPWORDS;
75
76
77// The filestem of the collection's phindex directory
78char collection[FILENAME_MAX];
79
80
81// The ranges of the stopword and content-word symbols for the collection
82symbol firstStopSymbol = 0;
83symbol lastStopSymbol = 0;
84symbol firstContentSymbol = 0;
85symbol lastContentSymbol = 0;
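// (The code below relies on the symbol numbering convention that delimiter
// symbols are numbered below firstStopSymbol and stopword symbols below
// firstContentSymbol; see getContentCount() and the choice of firstContent
// in main().)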
86
87
88// Some useful comparison functions, defined below.
89int suffixCompare(const void *, const void *);
90int prefixCompare(const void *, const void *);
91int pointerCompare(const void *, const void *);
92
93
94// Functions for implementing "phrase memory". These let us "remember"
95// each phrase that we've expanded without using too much memory.
96void initialisePhraseMemory();
97void rememberThisPhrase(cellindex index, cellcount length);
98bool isPhraseStored(cellindex index, cellcount length);
99void deletePhraseMemory();
100
101
102// how much output do we want?
103int verbosity = 1;
104
105
106// Get a phrase's expansions
107//
108// Get the set of "minimal", "maximal", non-unique expansions of a
109// phrase p, using the simpler algorithm that Craig and Gordon came up
110// with at Google.
111//
112// Returns a vector of Expansions.
113
114void getExpansions(Phrase &p, vector<Phrase> &results) {
115
116 // 1. Get the initial candidates
117 vector<Phrase> candidates;
118 p.initialSuffixCandidates(candidates);
119 int suffcands = candidates.size();
120 p.initialPrefixCandidates(candidates);
121
122 if (candidates.size() == 0)
123 return;
124
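  // 2. Expand the candidates: the first suffcands entries are suffix
  // candidates, which are expanded at the prefix end; the remaining entries
  // are prefix candidates, which are expanded at the suffix end.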
125 vector<Phrase>::iterator i;
126 for (i = candidates.begin(); i != candidates.end() && suffcands > 0; ++i, --suffcands) {
127 i->expandWhileUniquePrefixExtension();
128 i->ensureSuffixFound();
129 }
130 for ( ; i != candidates.end(); ++i) {
131 i->expandWhileUniqueSuffixExtension();
132 }
133
134 // 3. Ensure minimality: ignore phrases whose subphrases are also found
135 results.clear();
136
137 // Initialise the candidates, check array, and various variables.
138 sort(candidates.begin(), candidates.end(), isShorter);
139 memset(suffixCheck, 0, sizeof(check)*inputLength);
140 //for (cellcount j = 0; j < inputLength; j++)
141 // suffixCheck[j] = 0;
142 unsigned minimum_length = candidates.begin()->length;
143
144 // Try to add each candidate to the results set, ignoring the non-minimal
145 for (vector<Phrase>::iterator candidate = candidates.begin();
146 candidate != candidates.end(); candidate++) {
147
148 // Make a copy of candidate to mutilate while performing sub-phrase checks
149 Phrase temp_phrase(*candidate);
150 bool shorter_found = false;
151
152 // Check for shorter and shorter versions of the temporary phrase
153 while (temp_phrase.length >= minimum_length && !shorter_found) {
154 temp_phrase.ensureSuffixFound();
155 if (suffixCheck[temp_phrase.firstSuffixIndex] == 0)
156 temp_phrase.shortenByOneAtPrefix();
157 else
158 shorter_found = true;
159
160 // Possible efficiency here: we can finish if the prefix of c
161 // and temp_phrase are the same for candidate->length symbols.
162 }
163
164 if (!shorter_found) {
165 results.push_back(*candidate);
166 candidate->ensureSuffixFound();
167 for (cellcount k = candidate->firstSuffixIndex; k <= candidate->lastSuffixIndex; ++k)
168 suffixCheck[k] = candidate->length;
169 }
170 }
171}
172
173
174// suffixCompare
175//
176// Compare two pointers into a suffix array. We use this in the
177// qsort function, so the inputs are pointers to pointers.
178//
179// Return -1 if (a < b); otherwise (a > b), so return +1.
180
181int suffixCompare(const void *cpa, const void *cpb) {
182
183 // Cast then dereference pointers to suffix array elements
184 symbol *pa = *(symbol **) cpa;
185 symbol *pb = *(symbol **) cpb;
188
189 // If the two elements are the same, examine the next one
190 while (*pa == *pb) {
191 pa++;
192 pb++;
193 }
194
195 // Make the comparison and return
196 if ( *pa < *pb) {
197 return -1;
198 } else {
199 return +1;
200 }
201}
202
203
204// prefixCompare
205//
206// Compare two pointers into a prefix array. We use this in the
207// qsort function, so the inputs are pointers to pointers.
208//
209// Return -1 if (a > b); otherwise (a < b), so return +1.
210
211int prefixCompare(const void *cpa, const void *cpb) {
212
213 // Cast then dereference pointers to prefix array elements
214 symbol *pa = *(symbol **) cpa;
215 symbol *pb = *(symbol **) cpb;
218
219 // If the two elements are the same, examine the next one
220 while (*pa == *pb) {
221 pa--;
222 pb--;
223 }
224
225 // Make the comparison and return
226 if ( *pa > *pb) {
227 return -1;
228 } else {
229 return +1;
230 }
231}
232
233// pointerCompare
234//
235// Compare two pointers based on the memory location they point to.
236
237int pointerCompare( const void *pa, const void *pb ) {
238
239 symbol **a = (symbol **) pa;
240 symbol **b = (symbol **) pb;
241
242 if (*a < *b) {
243 return -1;
244 } else if (*a > *b) {
245 return 1;
246 } else {
247 return 0;
248 }
249}
250
251
252// Read the clauses.numbers file into the "symbols" array.
253//
254// Each number in the file is a symbol number; it is essential that
255// the first symbol (and no others) be COLLECTIONSTART and the last
256// symbol (and no others) be COLLECTIONEND.
257//
258// Return the number of numbers in the array.
259
260int readNumbers() {
261
262 char filename[FILENAME_MAX];
263 sprintf(filename, "%s/clauses.numbers", collection);
264 if (verbosity) {
265 cout << "Reading numbers file: " << filename << endl;
266 }
267
268 // Open the numbers file
269 ifstream inFile1(filename, ios::in);
270 if (!inFile1) {
271 cerr << "File " << filename << " could not be opened\n";
272 exit(1);
273 }
274
275 // Count the number of symbols
276 inputLength = 0;
277 symbol word;
278 while (inFile1 >> word) {
279 inputLength++;
280 }
281 inFile1.close();
282
283 // Allocate the symbols array
284 if (verbosity > 1) {
285 cout << "Allocating symbol arrays for " << inputLength << " symbols" << endl;
286 }
287 symbols = new symbol[inputLength];
288 if (symbols == NULL) {
289 cerr << "Suffix error: not enough memory to hold " << inputLength
290 << " symbols." << endl;
291 exit(2);
292 }
293
294 // Read the numbers file into the numbers array
295 if (verbosity > 2) {
296 cout << "Reading the numbers" << endl;
297 }
298
299 ifstream inFile2(filename, ios::in);
300 if (!inFile2) {
301 cerr << "File " << filename << " could not be opened\n";
302 exit(1);
303 }
304 cellcount next = 0;
305 numberOfDocuments = 0;
306 while (inFile2 >> word) {
307 symbols[next++] = word;
308 if (word == DOCUMENTSTART) {
309 numberOfDocuments++;
310 }
311 }
312 inFile2.close();
313
314 // Make sure the numbers file is intact
315 assert(symbols[0] == COLLECTIONSTART);
316 assert(symbols[next-1] == COLLECTIONEND);
317
318 return inputLength;
319}
320
321
322
323// Get Document Occurrence statistics
324//
325// Given a phrase, what documents does it occur in?
326
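// Each occurrence of the phrase is a pointer into the symbols[] buffer, and
// documentArray[] holds the addresses of the DOCUMENTSTART symbols in
// increasing order, so the containing document is found by binary-searching
// for the last document start that lies before the occurrence.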
327cellcount getDocumentOccurrances(const Phrase &p, cellcount *frequency) {
328
329 // cout << "searching for \""<< p << "\" in documents "
330 // << 0 << "-" << numberOfDocuments - 1 << endl;
331
332 // The number of documents in which this phrase occurs
333 cellcount df = 0;
334
335 // Initialise the document frequency array
336 // for (cellindex i = 0; i < numberOfDocuments; i++) {
337 // frequency[i] = 0;
338 //}
339 memset(frequency, 0, sizeof(cellcount)*numberOfDocuments);
340
341 // variables used to facilitate the search
342 cellindex begin;
343 cellindex end;
344 cellindex d;
345 symbol *target;
346 bool found;
347
348 // search for the document in which each occurrence of the phrase is found
349 for (cellcount j = p.firstSuffixIndex; j <= p.lastSuffixIndex; j++) {
350
351 // cout << "looking for phrase at suffixArray[" << j << "]\n";
352
353 target = suffixArray[j];
354 begin = 0;
355 end = numberOfDocuments - 1;
356 found = false;
357
358 // Search for the occurrence of a document delimiter that target
359 // occurs immediately after.
360 // We do this by performing a binary chop search on documentArray.
361 while (!found) {
362
363 // cout << "searching for " << (cellindex) target << " in "
364 // << begin << " - " << end << endl;
365
366 assert (begin <= end);
367
368 // If the beginning and end of the interval are the same,
369 // then we've found the correct document
370 if (begin == end) {
371 if (frequency[begin] == 0) {
372 df++;
373 }
374 frequency[begin]++;
375 found = true;
376 }
377
378 // Otherwise, examine a new document midway through the begin-end
379 // interval and see if it is the one.
380 else {
381 d = (begin + end) / 2;
382 if (target > documentArray[d]) {
383 // If the target address is greater than this, but this is the last document,
384 // then this must be the one we want. Or, if target is greater than
385 // this one but less than the next, this must be the one we want.
386 if ((d == numberOfDocuments - 1) || (target < documentArray[d+1])) {
387 if (frequency[d] == 0) {
388 df++;
389 }
390 frequency[d]++;
391 found = true;
392 } else {
393 // otherwise we know to search later in the document set
394 begin = d + 1;
395 }
396 } else {
397 // search earlier in the document set
398 end = d - 1;
399 }
400 }
401 }
402 }
403 return df;
404}
405
406
407
408
409
410
411// phraseExpansionMemory : Which phrases have we expanded?
412//
413// A set of utilities for keeping track of which phrases we have expanded.
414// We don't want to expand a phrase more than once, after all.
415//
416// This REALLY ought to be in its own class, but it works so that's okay.
417//
418// Phrases are identified by their firstSuffixPosition and length.
419//
420// Functions provided are:
421// void initialisePhraseMemory()
422// void rememberThisPhrase(index, length)
423// bool isPhraseStored(index, length)
424// void deletePhraseMemory()
425//
426// Internally, we will have two separate cases:
427//
428// Phrases of length 1-8:
429// unsigned char phraseMemory[inputLength]
430// is an array where each cell "remembers" the corresponding index in the
431// suffixArray, and each of the 8 bits of the cell correspond to the phrases
432// of length 1, 2... 8.
433// Eventually, we will make this disk-based (i.e. store the array in a file).
434//
435// Phrases of length 9+:
436// file hashTableFile
437// file listOfEntries
438// The first file is a hash table; each phrase maps to one of its cells, which
439// contains either 0 (empty, no occurrence) or a number which is an entry number
440// in the second file. This file contains a "list" of entries. Each consists of
441// three numbers: the suffixArray index of the phrase, the length of the phrase,
442// and the entry number of the next phrase with the same hash.
443//
444
445
446unsigned char *phraseMemory;
447
448void initialiseLongPhraseMemory();
449void rememberThisLongPhrase(cellindex index, cellcount length);
450bool isLongPhraseStored(cellindex index, cellcount length);
451void deleteLongPhraseMemory();
452
453
454void initialisePhraseMemory() {
455
456 phraseMemory = new unsigned char[inputLength];
457
458 // to begin with, everything is empty
459 // for (cellcount i = 0; i < inputLength; i++) {
460 // phraseMemory[i] = 0;
461 //}
462 memset(phraseMemory, 0, sizeof(char)*inputLength);
463
464 // initialise the hashTable of long phrases
465 initialiseLongPhraseMemory();
466
467}
468
469void rememberThisPhrase(cellindex index, cellcount length) {
470
471 // if the phrase is very long, use the file-based system
472 if (length > 8) {
473 rememberThisLongPhrase(index, length);
474 return;
475 }
476
477 // create a char with just the bit corresponding to length set
478 unsigned char newbit = 1;
479 for (cellcount i = 1; i < length; i++) {
480 newbit <<= 1;
481 }
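  // (equivalently: newbit = (unsigned char) (1 << (length - 1)), since length <= 8 here)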
482
483 // set that bit in the memory array at position index
484 phraseMemory[index] |= newbit;
485}
486
487
488bool isPhraseStored(cellindex index, cellcount length) {
489
490 // if the phrase is very long, use the file-based system
491 if (length > 8) {
492 return isLongPhraseStored(index, length);
493 }
494
495 // create a char with just the bit corresponding to length set
496 unsigned char newbit = 1;
497 for (cellcount i = 1; i < length; i++) {
498 newbit <<= 1;
499 }
500
501 // return true if that bit is set in the memory array at position index
502 return (phraseMemory[index] & newbit);
503}
504
505void deletePhraseMemory() {
506 delete [] phraseMemory;
507 deleteLongPhraseMemory();
508}
509
510
511
512// Files etc used to store "long" equivalents of the above
513
514fstream hashTableFile;
515char hashTableFileName[FILENAME_MAX];
516fstream listOfEntries;
517char listOfEntriesName[FILENAME_MAX];
518cellindex nextEntryNumber;
519
520const cellcount bigPrime = 7919;
521
522
523void initialiseLongPhraseMemory() {
524
525 cellindex example = 0;
526
527 sprintf(hashTableFileName, "%s/hashTable", collection);
528 sprintf(listOfEntriesName, "%s/hashLists", collection);
529
530
531 // create the new hashtable
532 if (verbosity > 1) {
533 cout << "Initialising hashTable: " << hashTableFileName << endl;
534 }
535 hashTableFile.open(hashTableFileName, ios::in | ios::out);
536 for (cellcount i = 0; i < bigPrime; i++) {
537 hashTableFile.write((char *) &example, sizeof(example));
538 }
539
540 // create the list of phrases
541 if (verbosity > 1) {
542 cout << "Initialising list of hashtable entries: " << listOfEntriesName << endl;
543 }
544 listOfEntries.open(listOfEntriesName, ios::in | ios::out);
545 listOfEntries.write((char *) &example, sizeof(example));
546 listOfEntries.write((char *) &example, sizeof(example));
547 listOfEntries.write((char *) &example, sizeof(example));
548 nextEntryNumber = 1;
549}
550
551
552void rememberThisLongPhrase(cellindex index, cellcount length) {
553
554 // cout << "rememberThisLongPhrase(" << index << ", " << length << ")\n";
555
556 cellindex hashOffset = ((index + length) % bigPrime) * sizeof(cellindex);
557 cellindex pointer;
558 cellindex zero = 0;
559 cellindex readp = 0;
560 cellindex readi = 0;
561 cellindex readl = 0;
562
563 hashTableFile.seekg(hashOffset);
564 hashTableFile.read((char *) &pointer, sizeof(cellindex));
565
566 if (pointer == 0) {
567 // There is no entry at all in the hash table for this entry
568 // so create one
569
570 pointer = nextEntryNumber++;
571 hashTableFile.seekg(hashOffset);
572 hashTableFile.write((char *) &pointer, sizeof(cellindex));
573
574 listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
575 listOfEntries.write((char *) &zero, sizeof(cellindex));
576 listOfEntries.write((char *) &index, sizeof(cellindex));
577 listOfEntries.write((char *) &length, sizeof(cellindex));
578
579 } else {
580 // There is a list starting at this hash value, so the phrase may
581 // be already remembered, or it might need to be appended
582
583 while (pointer != 0) {
584 // Read the entry pointed to by pointer
585 listOfEntries.seekg(pointer * sizeof(cellindex) * 3);
586 listOfEntries.read((char *) &readp, sizeof(cellindex));
587 listOfEntries.read((char *) &readi, sizeof(cellindex));
588 listOfEntries.read((char *) &readl, sizeof(cellindex));
589
590 // cout << "read " << pointer << ", " << readp << ", " << readi << ", " << readl << endl;
591
592 if ((readi == index) && (readl == length)) {
593 // we've found that we've already stored it
594 return;
595 } else if (readp == 0) {
596 // we've reached the end of the list. Add a new entry.
597 listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
598 listOfEntries.write((char *) &nextEntryNumber, sizeof(cellindex));
599 pointer = nextEntryNumber++;
600
601 listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
602 listOfEntries.write((char *) &zero, sizeof(cellindex));
603 listOfEntries.write((char *) &index, sizeof(cellindex));
604 listOfEntries.write((char *) &length, sizeof(cellindex));
605 return;
606 } else {
607 // go on to the next node
608 pointer = readp;
609 }
610 }
611 }
612
613
614}
615
616
617bool isLongPhraseStored(cellindex index, cellcount length) {
618
619 // cout << "isLongPhraseExpanded(" << index << ", " << length << ")\n";
620
621 cellindex hashOffset = ((index + length) % bigPrime) * sizeof(cellindex);
622 cellindex pointer;
623 cellindex readp = 0;
624 cellindex readi = 0;
625 cellindex readl = 0;
626
627 // Find the phrase in the hashFile
628 hashTableFile.seekg(hashOffset);
629 hashTableFile.read((char *) &pointer, sizeof(cellindex));
630
631 if (pointer == 0) {
632 // There is no entry at all in the hash table for this entry
633 // so nothing is stored
634 return false;
635
636 } else {
637 // There is a list starting at this hash value, so the phrase may
638 // be already remembered in that list
639 while (pointer != 0) {
640 // Read the entry pointed to by pointer
641 listOfEntries.seekg(pointer * sizeof(cellindex) * 3);
642 listOfEntries.read((char *) &readp, sizeof(cellindex));
643 listOfEntries.read((char *) &readi, sizeof(cellindex));
644 listOfEntries.read((char *) &readl, sizeof(cellindex));
645
646 if ((readi == index) && (readl == length)) {
647 // we've found the phrase stored here
648 return true;
649 } else {
650 // go on to the next node
651 pointer = readp;
652 }
653 }
654 }
655 return false;
656}
657
658
659void deleteLongPhraseMemory() {
660 // remove the hash & other files
661
662 hashTableFile.close();
663 listOfEntries.close();
664 remove(hashTableFileName);
665 remove(listOfEntriesName);
666
667}
668
669
670// Read the collection statistics file
671//
672void readStatistics() {
673
674 // open the statistics file
675 char filename[FILENAME_MAX];
676 sprintf(filename, "%s/clauses.stats", collection);
677
678 // Open the file
679 ifstream inFile(filename, ios::in);
680 if (!inFile) {
681 cerr << "File " << filename << " could not be opened\n";
682 exit(1);
683 }
684
685 // Read the key-value pairs from the statistics file
686 char key[1000];
687 symbol value;
688 while (inFile >> key >> value){
689 if (strcmp(key, "first_stopword") == 0) {
690 firstStopSymbol = value;
691 } else if (strcmp(key, "last_stopword") == 0) {
692 lastStopSymbol = value;
693 } else if (strcmp(key, "first_contentword") == 0) {
694 firstContentSymbol = value;
695 } else if (strcmp(key, "last_contentword") == 0) {
696 lastContentSymbol = value;
697 }
698 }
699 inFile.close();
700
701 // Make sure we have the information we need
702 if (!(firstStopSymbol && lastStopSymbol && firstContentSymbol && lastContentSymbol)) {
703 cerr << "Statistics file incomplete" << endl;
704 exit(1);
705 }
706}
707
708cellcount getContentCount(symbol firstContent) {
709
710 cellcount content=0;
711 for (cellcount i=0; i<inputLength; i++) {
712 if (symbols[i]>=firstContent) content++;
713 }
714
715 return content;
716}
717
718int main (int argc, char * argv[]) {
719
720 // Command-line arguments
721 // argv[1] is the phindex directory
722 // argv[2] is the mode, where 1 is stopword mode
723 // argv[3] is the verbosity (optional)
724 if (argc < 3) {
725 cerr << "Usage: " << argv[0] << " phind-directory mode [verbosity]" << endl;
726 exit(1);
727 }
728
729 // collection directory
730 strcpy(collection, argv[1]);
731
732 // mode parameter
733 phraseMode = atoi(argv[2]);
734 assert((phraseMode == STOPWORDS) || (phraseMode == ANYPHRASE));
735
736 // optional verbosity parameter
737 if (argc == 4) {
738 verbosity = atoi(argv[3]);
739 assert (verbosity >= 0);
740 }
741
742 if (verbosity) {
743 cout << "suffix: the phrase extraction program" << endl;
744 }
745
746 if (verbosity > 1) {
747 if (phraseMode == STOPWORDS) {
748 cout << "Stopwords mode: no phrase may begin or end with a stopword" << endl;
749 } else {
750 cout << "AllPhrase mode: extract every phrase that occurs more than once" << endl;
751 }
752 }
753
754 // Read the statistics file
755 readStatistics();
756
757 // Read the numbers file
758 readNumbers();
759
760 if (numberOfDocuments == 0) {
761 cerr << "There are no documents in this collection!" << endl;
762 exit(1);
763 }
764
765 symbol firstContent;
766 if (phraseMode==STOPWORDS) firstContent=firstContentSymbol;
767 else firstContent = firstStopSymbol;
768
769 cellcount contentLength = 0;
770 contentLength = getContentCount(firstContent);
771
772 // Create the suffix & prefix arrays
773 suffixArray = new symbol *[contentLength];
774 prefixArray = new symbol *[contentLength];
775
776 cellcount here=0;
777 // Initialise prefix and suffix arrays, only use the needed suffixes
778 for (cellcount j = 0; j < inputLength; j++) {
779 if (symbols[j]>=firstContent) {
780 suffixArray[here] = &symbols[j];
781 prefixArray[here] = &symbols[j];
782 here++;
783 }
784 }
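  // suffixCompare orders the entries by the symbols read forwards from each
  // position; prefixCompare orders them by the symbols read backwards.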
785 qsort(suffixArray, contentLength, sizeof(symbol *), suffixCompare);
786 qsort(prefixArray, contentLength, sizeof(symbol *), prefixCompare);
787
788 suffixCheck = new check[contentLength];
789 if (suffixCheck == NULL) {
790 cerr << "Suffix error: not enough memory to hold " << contentLength << " symbols." << endl;
791 exit(2);
792 }
793 //for (cellcount j = 0; j < contentLength; j++)
794 // suffixCheck[j] = 0;
795 memset(suffixCheck, 0, sizeof(check)*contentLength);
796
797 cout <<"\ngenerating the phrase hierarchy\n\n";
798
799 // Create the document arrays
800 if (verbosity > 1) {
801 cout << "Allocating document arrays for " << numberOfDocuments << " documents" << endl;
802 }
803
804 // The document frequency array is used to count the number of times
805 // each phrase occurs in each document. The number of documents in
806 // which a phrase occurs is stored in df.
807 frequency *documentFrequency = new frequency[numberOfDocuments];
808 frequency df;
809
810 // documentArray will be searched in order to discover which document
811 // each phrase occurs in.
812 documentArray = new symbol *[numberOfDocuments];
813
814 // just scan through the input text to find the doc starts
815 cellindex d = 0;
816 for (cellindex i=0; i<inputLength; i++) {
817 if (symbols[i] == DOCUMENTSTART) {
818 documentArray[d] = &symbols[i];
819 d++;
820 }
821 }
822
823 // the phrases stuff is expecting inputLength to be the length of the
824 // suffix array, so change it.
825 inputLength = contentLength;
826
827 // Extract phrases
828 //
829 // We will make several passes over the data, in each case considering
830 // a set of input phrases and generating a set of output phrases, which
831 // we will expand in later passes.
832 //
833 // The input phrases in the first pass will be the vocabulary.
834 // In later passes, the input phrases will be the output phrases of the
835 // previous pass.
836 //
837 // In each pass we will consider each input phrase in turn. If we
838 // have seen it before, we will ignore it. Otherwise, we will expand
839 // it and add its expansions to the set of output phrases.
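//
// Concretely, pass n reads the phrases listed in <collection>/outPhrase.(n-1)
// and appends any new expansions to <collection>/outPhrase.n; the loop below
// ends when a pass generates no new output phrases (outPhraseCounter == 0).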
840
841 // Store the phrase data in the phrases file
842 char phraseDataName[FILENAME_MAX];
843 sprintf(phraseDataName, "%s/phrases", collection);
844 ofstream phraseData(phraseDataName, ios::out);
845 if (!phraseData) {
846 cout << "File " << phraseDataName << " could not be opened\n";
847 exit(1);
848 }
849
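// Each record written to the phrases file has the form
//   <suffix index>-<length>:<phrase text>:<frequency>:<number of expansions>:
//   <expansion list>:<document frequency>:<document list>
// where the expansion list is a comma-separated list of "index-length" pairs
// and the document list is a semicolon-separated list of "d<num>[,freq]"
// entries (a summary of the output statements that follow).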
850 // Count the number of phrases output
851 unsigned long int phraseCounter = 0;
852
853 // Set up the phrase expansion memory.
854 // We need this so that we don't expand a phrase more than once
855 initialisePhraseMemory();
856
857 // The current pass number
858 int phrasePass = 1;
859
860
861 // PASS NUMBER 1
862 if (verbosity > 1) {
863 cout << "Starting pass " << phrasePass << endl;
864 }
865
866 ofstream outPhrase;
867 char outPhraseName[FILENAME_MAX];
868 unsigned long int outPhraseCounter = 0;
869
870 // On the first pass, simply work through the vocabulary
871 sprintf(outPhraseName, "%s/outPhrase.1", collection);
872 outPhrase.open(outPhraseName, ios::out);
873 if (!outPhrase) {
874 cerr << "File " << outPhraseName << " could not be opened\n";
875 exit(1);
876 }
877
878 // Iterate over the different symbols by working through the suffix array
879 vector<Phrase> result;
880 cellindex ij = 0;
881 char *tmpString;
882
883 Phrase p;
884 while (ij < inputLength) {
885
886 // make a new phrase of length 1
887 p = Phrase(suffixArray[ij], 1, SUFFIX);
888 p.findFirstAndLastSuffix(ij, inputLength-1);
889
890 // We ignore this symbol if it occurs only once, if it is a delimiter,
891 // or if we are in stopwords mode and it is a stopword
892 // - in this new version, only need to check freq
893 // We could imagine a new mode/command-line option, which is like
894 // STOPWORDS but without this restriction. This would let you browse
895 // from "the" to "the AGRIS" for example, but not from "AGRIS" to
896 // "the AGRIS" (where the is a stopword and AGRIS a content word).
897 // The system used to work like this; it is easy to implement, but
898 // it explodes the size of the indexes. So: would it be useful?
899 if (p.suffixFrequency > 1) {
900 // Get minimal expansions of the phrase
901 getExpansions(p, result);
902
903 if (!result.empty()) {
904
905 // Remember that we have expanded this phrase
906 rememberThisPhrase(ij, 1);
907
908 // write the phrase text
909 phraseData << ij << "-1:" << p << ":" << p.suffixFrequency << ":"
910 << result.size() << ":";
911
912 // write the results
913 for (cellcount k = 0; k < result.size(); k++) {
914 if (k) {
915 phraseData << ",";
916 }
917 phraseData << result[k].firstSuffixIndex << "-" << result[k].length;
918 outPhrase << result[k].firstSuffixIndex << " " << result[k].length << endl;
919 outPhraseCounter++;
920 }
921 result.clear();
922
923 // Write the documents in which this phrase occurs
924 df = getDocumentOccurrances(p, documentFrequency);
925 phraseData << ":" << df << ":";
926
927 // write the documents
928 for (cellcount m = 0, first = 1; m < numberOfDocuments; m++) {
929 if (documentFrequency[m]) {
930 if (first) {
931 first = 0;
932 } else {
933 phraseData << ";";
934 }
935 // Output the document number. Note that here we've numbered the
936 // N documents from 0 to N-1, but later they'll be 1-N. Thus we
937 // add 1 to the document id when we output it.
938 phraseData << "d" << (m+1);
939 // Next, output the frequency with which the document occurs, but
940 // only if it is > 1.
941 if (documentFrequency[m] > 1) {
942 phraseData << "," << documentFrequency[m];
943 }
944 }
945 }
946
947 phraseData << endl;
948 phraseCounter++;
949
950 // feedback
951 if (verbosity) {
952 if (phraseCounter % 1000 == 0) {
953 cout << "phrase " << phraseCounter << ": "
954 << "cell " << p.firstSuffixIndex << " - " << p << endl;
955 }
956 }
957 }
958 }
959 ij = p.lastSuffixIndex + 1;
960 }
961 outPhrase.close();
962
963 // REMAINING PASSES
964 // The previous outPhrase file forms the input to each new pass
965 cellcount start, length;
966 while (outPhraseCounter > 0) {
967
968 // Start a new pass
969 phrasePass++;
970 if (verbosity) {
971 cout << "Starting pass " << phrasePass << endl;
972 }
973
974 // Open the input file
975 char inPhraseName[FILENAME_MAX];
976 sprintf(inPhraseName, "%s/outPhrase.%d", collection, phrasePass - 1);
977 ifstream inPhrase (inPhraseName, ios::in);
978 if (!inPhrase) {
979 cerr << "File " << inPhraseName << " could not be opened\n";
980 exit(1);
981 }
982
983 // Open the output file
984 sprintf(outPhraseName, "%s/outPhrase.%d", collection, phrasePass);
985 outPhrase.open(outPhraseName, ios::out);
986 if (!outPhrase) {
987 cerr << "File " << outPhraseName << " could not be opened\n";
988 exit(1);
989 }
990 outPhraseCounter = 0;
991
992 // Process each phrase
993 while(inPhrase >> start >> length) {
994
995 // Ignore the phrase if we have expanded it before
996 if (isPhraseStored(start, length))
997 continue;
998
999 // Remember that we have examined this phrase
1000 rememberThisPhrase(start, length);
1001
1002 // Find the phrase in the suffix array
1003 p = Phrase(suffixArray[start], length, SUFFIX);
1004 p.findFirstAndLastSuffix(start, inputLength-1);
1005
1006 // Ignore the phrase if it only occurs once
1007 if (p.suffixFrequency < 2)
1008 continue;
1009
1010 // Write the phrase text;
1011 phraseData << start << "-" << length << ":" << p << ":" << p.suffixFrequency << ":";
1012
1013 // Expand the phrase, if it is at most 8 words long
1014 if (length <= 8) {
1015
1016 // Get the minimal expansions for this phrase
1017 getExpansions(p, result);
1018
1019 // write the results
1020 phraseData << result.size() << ":";
1021
1022 for (cellcount i = 0; i < result.size(); i++) {
1023 if (i) {
1024 phraseData << ",";
1025 }
1026 phraseData << result[i].firstSuffixIndex << "-" << result[i].length;
1027 outPhrase << result[i].firstSuffixIndex << " " << result[i].length << endl;
1028 outPhraseCounter++;
1029 }
1030 result.clear();
1031
1032 } else {
1033 // phrase is too long to expand further
1034 phraseData << "0:";
1035 }
1036
1037 // Write the documents in which this phrase occurs
1038 df = getDocumentOccurrances(p, documentFrequency);
1039 phraseData << ":" << df << ":";
1040
1041 // write the documents
1042 for (cellcount i = 0, first = 1; i < numberOfDocuments; i++) {
1043 if (documentFrequency[i]) {
1044 if (first) {
1045 first = 0;
1046 } else {
1047 phraseData << ";";
1048 }
1049 // Output the document number. Note that here we've numbered the
1050 // N documents from 0 to N-1, but later they'll be 1-N. Thus we
1051 // add 1 to the document id when we output it.
1052 phraseData << "d" << (i+1);
1053 // Next, output the frequency with which the document occurs, but
1054 // only if it is > 1.
1055 if (documentFrequency[i] > 1) {
1056 phraseData << "," << documentFrequency[i];
1057 }
1058 }
1059 }
1060
1061 phraseData << endl;
1062 phraseCounter++;
1063
1064 // feedback
1065 if (verbosity) {
1066 if (phraseCounter % 1000 == 0) {
1067 cout << "phrase " << phraseCounter << ": "<< "start " << start
1068 << ", length " << length << " - " << p << endl;
1069 }
1070 }
1071
1072 }
1073
1074 inPhrase.close();
1075 outPhrase.close();
1076 }
1077
1078 phraseData.close();
1079 deletePhraseMemory();
1080
1081 delete [] documentFrequency;
1082 delete [] symbols;
1083 delete [] suffixArray;
1084 delete [] prefixArray;
1085 delete [] suffixCheck;
1086 delete [] documentArray;
1087
1088
1089
1090 cout << endl << "Done: " << phraseCounter << " phrases in " << phraseDataName << endl;
1091 return 0;
1092}
1093
1094
1095
1096
1097