source: trunk/gsdl/src/phind/generate/suffix.cpp@ 2807

Last change on this file since 2807 was 2807, checked in by kjm18, 20 years ago

changed suffixCheck to bit array.

  • Property svn:keywords set to Author Date Id Revision
File size: 30.7 KB
Line 
1/**********************************************************************
2 *
3 * Suffix.cpp -- Extract the repeated phrases in the input with suffix
4 * and prefix arrays (cgn & gwp's simpler algorithm,
5 * and kjm's improvements).
6 *
7 * Copyright 2000 Gordon W. Paynter
8 * Copyright 2000 The New Zealand Digital Library Project
9 *
10 * A component of the Greenstone digital library software
11 * from the New Zealand Digital Library Project at the
12 * University of Waikato, New Zealand.
13 *
14 * This program is free software; you can redistribute it and/or modify
15 * it under the terms of the GNU General Public License as published by
16 * the Free Software Foundation; either version 2 of the License, or
17 * (at your option) any later version.
18 *
19 * This program is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 * GNU General Public License for more details.
23 *
24 * You should have received a copy of the GNU General Public License
25 * along with this program; if not, write to the Free Software
26 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27 *
28 *********************************************************************/
29
30#include <assert.h>
31#include <math.h>
32#include <stdio.h>
33#include <stdlib.h>
34#include <string.h>
35
36#if defined(GSDL_USE_IOS_H)
37# include <fstream.h>
38# include <iostream.h>
39#else
40# include <fstream>
41# include <iostream>
42#endif
43
44#if defined(GSDL_USE_STL_H)
45# if defined(GSDL_USE_ALGO_H)
46# include <algo.h>
47# else
48# include <algorithm.h>
49# endif
50# include <vector.h>
51#else
52# include <algorithm>
53# include <vector>
54#endif
55
56#include "suffix.h"
57#include "phrase.h"
58
59// Global variables declared in suffix.h
60cellcount inputLength;
61
62symbol *symbols;
63symbol **suffixArray;
64symbol **prefixArray;
65check *suffixCheck;
66
67// the length of the check array
68cellcount checkLength;
69
70// How many documents are in this collection?
71cellcount numberOfDocuments;
72symbol **documentArray;
73
74
75// Do we accept any phrase, or do we eliminate those ending with stopwords ?
76int phraseMode = ANYPHRASE; //STOPWORDS;
77
78
79// The filestem of the collection's phindex directory
80char collection[FILENAME_MAX];
81
82
83// The ranges of the stopword and content-word symbols for the collection
84symbol firstStopSymbol = 0;
85symbol lastStopSymbol = 0;
86symbol firstContentSymbol = 0;
87symbol lastContentSymbol = 0;
88
89
90// Some useful comparison functions, defined below.
91int suffixCompare(const void *, const void *);
92int prefixCompare(const void *, const void *);
93int pointerCompare(const void *, const void *);
94
95// some bit manipulation functions for the check arrays, defined below
96int getSuffixCheck(cellindex suff);
97void setSuffixCheck(cellindex suff);
98
99// Functions for implementing "phrase memory". These let us "remember"
100// each phrase that we've expanded without using too much memory.
101void initialisePhraseMemory();
102void rememberThisPhrase(cellindex index, cellcount length);
103bool isPhraseStored(cellindex index, cellcount length);
104void deletePhraseMemory();
105
106
107// how much output do we want?
108int verbosity = 1;
109
110
111// Get a phrase's expansions
112//
113// Get the set of "minimal", "maximal", non-unique expansions of a
114// phrase p, using the simpler algorithm that Craig and Gordon came up
115// with at Google.
116//
117// Returns a vector of Expansions.
118
119void getExpansions(Phrase &p, vector<Phrase> &results) {
120
121 // 1. Get the initial candidates
122 vector<Phrase> candidates;
123 p.initialSuffixCandidates(candidates);
124 int suffcands = candidates.size();
125 p.initialPrefixCandidates(candidates);
126
127 if (candidates.size() == 0)
128 return;
129
130 vector<Phrase>::iterator i;
131 for (i = candidates.begin(); i != candidates.end(), suffcands>0; ++i, --suffcands) {
132 i->expandWhileUniquePrefixExtension();
133 i->ensureSuffixFound();
134 }
135 for (i; i != candidates.end(); ++i) {
136 i->expandWhileUniqueSuffixExtension();
137 }
138
139 // 3. Ensure minimality: ignore phrases whose subphrases are also found
140 results.clear();
141
142 // Initialise the candidates, check array, and various variables.
143 sort(candidates.begin(), candidates.end(), isShorter);
144 memset(suffixCheck, 0, sizeof(check)*checkLength);
145 unsigned minimum_length = candidates.begin()->length;
146
147 // Try to add each candidate to the results set, ignoring the non-minimal
148 for (vector<Phrase>::iterator candidate = candidates.begin();
149 candidate != candidates.end(); candidate++) {
150
151 // Make a copy of candidate to mutilate while performing sub-phrase checks
152 Phrase temp_phrase(*candidate);
153 bool shorter_found = false;
154
155 // Check for shorter and shorter versions of the tenporary phrase
156 while (temp_phrase.length >= minimum_length && !shorter_found) {
157 temp_phrase.ensureSuffixFound();
158 //if (suffixCheck[temp_phrase.firstSuffixIndex] == 0)
159 if (getSuffixCheck(temp_phrase.firstSuffixIndex)==0)
160 temp_phrase.shortenByOneAtPrefix();
161 else
162 shorter_found = true;
163
164 // Possible efficiency here: we can finish if the prefix of c
165 // and temp_phrase are the same for candidate->length symbols.
166 }
167
168 if (!shorter_found) {
169 results.push_back(*candidate);
170 candidate->ensureSuffixFound();
171 for (cellcount k = candidate->firstSuffixIndex; k <= candidate->lastSuffixIndex; ++k)
172 //suffixCheck[k] = candidate->length;
173 setSuffixCheck(k);
174 }
175 }
176}
177
178
179// suffixCompare
180//
181// Compare two pointers into a suffix array. We use this in the
182// qsort function, so the input are pointers to pointers.
183//
184// Return -1 if (a < b), otherwise (a > b) so return +1,
185
186int suffixCompare(const void *cpa, const void *cpb) {
187
188 // Cast then dereference pointers to suffix array elements
189 symbol *pa = (symbol *) cpa;
190 symbol *pb = (symbol *) cpb;
191 pa = (symbol *) *pa;
192 pb = (symbol *) *pb;
193
194 // If the two elements are the same, examine the next one
195 while (*pa == *pb) {
196 *pa++;
197 *pb++;
198 }
199
200 // Make the copmparison and return
201 if ( *pa < *pb) {
202 return -1;
203 } else {
204 return +1;
205 }
206}
207
208
209// prefixCompare
210//
211// Compare two pointers into a prefix array. We use this in the
212// qsort function, so the input are pointers to pointers.
213//
214// Return -1 if (a > b), otherwise (a < b) so return +1,
215
216int prefixCompare(const void *cpa, const void *cpb) {
217
218 // Cast then dereference pointers to prefix array elements
219 symbol *pa = (symbol *) cpa;
220 symbol *pb = (symbol *) cpb;
221 pa = (symbol *) *pa;
222 pb = (symbol *) *pb;
223
224 // If the two elements are the same, examine the next one
225 while (*pa == *pb) {
226 *pa--;
227 *pb--;
228 }
229
230 // Make the copmparison and return
231 if ( *pa > *pb) {
232 return -1;
233 } else {
234 return +1;
235 }
236}
237
238// simpleCompare
239//
240// Compare two pointers based on the memory location they point to.
241
242int pointerCompare( const void *pa, const void *pb ) {
243
244 symbol **a = (symbol **) pa;
245 symbol **b = (symbol **) pb;
246
247 if (*a < *b) {
248 return -1;
249 } else if (*a > *b) {
250 return 1;
251 } else {
252 return 0;
253 }
254}
255
256int getSuffixCheck(cellindex suff) {
257 cellindex cell = suff >> 3;
258 check remainder = suff & 0x07; // the last 3 bits
259 if (suffixCheck[cell]& (1 << remainder)) {
260 return 1;
261 }
262 return 0;
263}
264void setSuffixCheck(cellindex suff) {
265 cellindex cell = suff >> 3;
266 check remainder = suff & 0x07; // the last 3 bits
267 suffixCheck[cell] |= (1 << remainder);
268
269}
270
271// Read the clauses.numbers file into the "symbols" array.
272//
273// Each number in the file is a symbol number; it is essential that
274// the first symbol (and no others) be COLLECTIONSTART and the last
275// symbol (and no others) be COLLECTIONEND.
276//
277// Return the number of numbers in the array.
278
279int readNumbers() {
280
281 char filename[FILENAME_MAX];
282 sprintf(filename, "%s/clauses.numbers", collection);
283 if (verbosity) {
284 cout << "Reading numbers file: " << filename << endl;
285 }
286
287 // Open the numbers file
288 ifstream inFile1(filename, ios::in);
289 if (!inFile1) {
290 cerr << "File " << filename << " could not be opened\n";
291 exit(1);
292 }
293
294 // Count the number of symbols
295 inputLength = 0;
296 symbol word;
297 while (inFile1 >> word) {
298 inputLength++;
299 }
300 inFile1.close();
301
302 // Allocate the symbbols array
303 if (verbosity > 1) {
304 cout << "Allocating symbol arrays for " << inputLength << " symbols" << endl;
305 }
306 symbols = new symbol[inputLength];
307 if (symbols == NULL) {
308 cerr << "Suffix error: not enough memory to hold " << inputLength
309 << " symbols." << endl;
310 exit(2);
311 }
312
313 // Read the numbers file into the numbers array
314 if (verbosity > 2) {
315 cout << "Reading the numbers" << endl;
316 }
317
318 ifstream inFile2(filename, ios::in);
319 if (!inFile2) {
320 cerr << "File " << filename << " could not be opened\n";
321 exit(1);
322 }
323 cellcount next = 0;
324 numberOfDocuments = 0;
325 while (inFile2 >> word) {
326 symbols[next++] = word;
327 if (word == DOCUMENTSTART) {
328 numberOfDocuments++;
329 }
330 }
331 inFile2.close();
332
333 // Make sure the numbers file is intact
334 assert(symbols[0] == COLLECTIONSTART);
335 assert(symbols[next-1] == COLLECTIONEND);
336
337 return inputLength;
338}
339
340
341
342// Get Document Occurrance statistics
343//
344// Given a phrase, what documents does it occur in?
345
346cellcount getDocumentOccurrances(const Phrase &p, cellcount *frequency) {
347
348 // cout << "searching for \""<< p << "\" in documents "
349 // << 0 << "-" << numberOfDocuments - 1 << endl;
350
351 // The number of documents in which this phrase occurs
352 cellcount df = 0;
353
354 // Initialise the document frequency array
355 // for (cellindex i = 0; i < numberOfDocuments; i++) {
356 // frequency[i] = 0;
357 //}
358 memset(frequency, 0, sizeof(cellcount)*numberOfDocuments);
359
360 // variables used to facilitate the search
361 cellindex begin;
362 cellindex end;
363 cellindex d;
364 symbol *target;
365 bool found;
366
367 // search for the document in which each occurence of the phrase is found
368 for (cellcount j = p.firstSuffixIndex; j <= p.lastSuffixIndex; j++) {
369
370 // cout << "looking for phrase at suffixArray[" << j << "]\n";
371
372 target = suffixArray[j];
373 begin = 0;
374 end = numberOfDocuments - 1;
375 found = false;
376
377 // Search for the occurence of a document delimiter that target
378 // occurs immediately after.
379 // We do this by performing a binary chop search on documentArray.
380 while (!found) {
381
382 // cout << "searching for " << (cellindex) target << " in "
383 // << begin << " - " << end << endl;
384
385 assert (begin <= end);
386
387 // If the beginning and end of the interval are the same,
388 // then we've found the correct document
389 if (begin == end) {
390 if (frequency[begin] == 0) {
391 df++;
392 }
393 frequency[begin]++;
394 found = true;
395 }
396
397 // Otherwise, examine a new document midway through the begin-end
398 // interval and see if it is the one.
399 else {
400 d = (begin + end) / 2;
401 if (target > documentArray[d]) {
402 // If target addrss is greater than this, but thisi sthe last document,
403 // then this must be the one we want. Or, if target is greater than
404 // this one but less then the next, this must be the one we wnat.
405 if ((d == numberOfDocuments - 1) || (target < documentArray[d+1])) {
406 if (frequency[d] == 0) {
407 df++;
408 }
409 frequency[d]++;
410 found = true;
411 } else {
412 // otherwise we know to search later in the document set
413 begin = d + 1;
414 }
415 } else {
416 // search earlier in the document set
417 end = d - 1;
418 }
419 }
420 }
421 }
422 return df;
423}
424
425
426
427
428
429
430// phraseExpansionMemory : Which phrases have we expanded?
431//
432// A set of utilities for keeping track of which phrases we have expanded.
433// We don't want to expand a phrase more than once, after all.
434//
435// This REALLY ought to be in its own class, but it works so that's okay.
436//
437// Phrases are identified by their firstSuffixPosition and length.
438//
439// Functions provided are:
440// void initialisePhraseMemory()
441// void rememberThisPhrase(index, length)
442// bool isPhraseStored(index, length)
443// void deletePhraseMemory()
444//
445// Internally, we will have two separate cases:
446//
447// Phrases of length 1-8:
448// unsigned char phraseMemory[inputLength]
449// is an array where each cell "remembers" the corresponding index in the
450// suffixArray, and each of the 8 bits of the cell correspond to the phrases
451// of length 1, 2... 8.
452// Eventually, we will make this disk-based (i.e. store the array in a file).
453//
454// Phrases of length 9+:
455// file hashTableFile
456// file listOfEntries
457// The first file is a hash table; each phrase maps to one of its cells, which
458// contains either 0 (empty, no occurence) or a number which is an entry number
459// in the second file. This file contains a "list" of entries. Each consists of
460// three numbers: the suffixArray index of the phrase, the length of the phrase,
461// and the entry number of the next phrase with the same hash.
462//
463
464
465unsigned char *phraseMemory;
466
467void initialiseLongPhraseMemory();
468void rememberThisLongPhrase(cellindex index, cellcount length);
469bool isLongPhraseStored(cellindex index, cellcount length);
470void deleteLongPhraseMemory();
471
472
473void initialisePhraseMemory() {
474
475 phraseMemory = new unsigned char[inputLength];
476
477 // to begin with, everything is empty
478 // for (cellcount i = 0; i < inputLength; i++) {
479 // phraseMemory[i] = 0;
480 //}
481 memset(phraseMemory, 0, sizeof(char)*inputLength);
482
483 // intialise the hashTable of long phrases
484 initialiseLongPhraseMemory();
485
486}
487
488void rememberThisPhrase(cellindex index, cellcount length) {
489
490 // if the phrase is very long, use the file-based system
491 if (length > 8) {
492 rememberThisLongPhrase(index, length);
493 return;
494 }
495
496 // create a char with just the bit corresponding to length set
497 unsigned char newbit = 1;
498 for (cellcount i = 1; i < length; i++) {
499 newbit <<= 1;
500 }
501
502 // set that bit in the memory array at position index
503 phraseMemory[index] |= newbit;
504}
505
506
507bool isPhraseStored(cellindex index, cellcount length) {
508
509 // if the phrase is very long, use the file-based system
510 if (length > 8) {
511 return isLongPhraseStored(index, length);
512 }
513
514 // create a char with just the bit corresponding to length set
515 unsigned char newbit = 1;
516 for (cellcount i = 1; i < length; i++) {
517 newbit <<= 1;
518 }
519
520 // retrurn true if that bit is set in memory arrayat position index
521 return (phraseMemory[index] & newbit);
522}
523
524void deletePhraseMemory() {
525 delete phraseMemory;
526 deleteLongPhraseMemory();
527}
528
529
530
531// Files etc used to store "long" equavlents of the above
532
533fstream hashTableFile;
534char hashTableFileName[FILENAME_MAX];
535fstream listOfEntries;
536char listOfEntriesName[FILENAME_MAX];
537cellindex nextEntryNumber;
538
539const cellcount bigPrime = 7919;
540
541
542void initialiseLongPhraseMemory() {
543
544 cellindex example = 0;
545
546 sprintf(hashTableFileName, "%s/hashTable", collection);
547 sprintf(listOfEntriesName, "%s/hashLists", collection);
548
549
550 // create the new hashtable
551 if (verbosity > 1) {
552 cout << "Initialising hashTable: " << hashTableFileName << endl;
553 }
554 hashTableFile.open(hashTableFileName, ios::in | ios::out);
555 for (cellcount i = 0; i < bigPrime; i++) {
556 hashTableFile.write((char *) &example, sizeof(example));
557 }
558
559 // create the list of phrases
560 if (verbosity > 1) {
561 cout << "Initialising list of hashtable entries: " << listOfEntriesName << endl;
562 }
563 listOfEntries.open(listOfEntriesName, ios::in | ios::out);
564 listOfEntries.write((char *) &example, sizeof(example));
565 listOfEntries.write((char *) &example, sizeof(example));
566 listOfEntries.write((char *) &example, sizeof(example));
567 nextEntryNumber = 1;
568}
569
570
571void rememberThisLongPhrase(cellindex index, cellcount length) {
572
573 // cout << "rememberThisLongPhrase(" << index << ", " << length << ")\n";
574
575 cellindex hashOffset = ((index + length) % bigPrime) * sizeof(cellindex);
576 cellindex pointer;
577 cellindex zero = 0;
578 cellindex readp = 0;
579 cellindex readi = 0;
580 cellindex readl = 0;
581
582 hashTableFile.seekg(hashOffset);
583 hashTableFile.read((char *) &pointer, sizeof(cellindex));
584
585 if (pointer == 0) {
586 // There is no entry at all in the hash table for this entry
587 // so create one
588
589 pointer = nextEntryNumber++;
590 hashTableFile.seekg(hashOffset);
591 hashTableFile.write((char *) &pointer, sizeof(cellindex));
592
593 listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
594 listOfEntries.write((char *) &zero, sizeof(cellindex));
595 listOfEntries.write((char *) &index, sizeof(cellindex));
596 listOfEntries.write((char *) &length, sizeof(cellindex));
597
598 } else {
599 // There is a list starting at this hash value, so the phrase may
600 // be already remembered, or it might need to be appended
601
602 while (pointer != 0) {
603 // Read the entry pointed to by pointer
604 listOfEntries.seekg(pointer * sizeof(cellindex) * 3);
605 listOfEntries.read((char *) &readp, sizeof(cellindex));
606 listOfEntries.read((char *) &readi, sizeof(cellindex));
607 listOfEntries.read((char *) &readl, sizeof(cellindex));
608
609 // cout << "read " << pointer << ", " << readp << ", " << readi << ", " << readl << endl;
610
611 if ((readi == index) && (readl = length)) {
612 // we've found that we've already stored it
613 return;
614 } else if (readp == 0) {
615 // we're reached the end of the list. Add a new entry.
616 listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
617 listOfEntries.write((char *) &nextEntryNumber, sizeof(cellindex));
618 pointer = nextEntryNumber++;
619
620 listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
621 listOfEntries.write((char *) &zero, sizeof(cellindex));
622 listOfEntries.write((char *) &index, sizeof(cellindex));
623 listOfEntries.write((char *) &length, sizeof(cellindex));
624 return;
625 } else {
626 // go on to the next node
627 pointer = readp;
628 }
629 }
630 }
631
632
633}
634
635
636bool isLongPhraseStored(cellindex index, cellcount length) {
637
638 // cout << "isLongPhraseExpanded(" << index << ", " << length << ")\n";
639
640 cellindex hashOffset = ((index + length) % bigPrime) * sizeof(cellindex);
641 cellindex pointer;
642 cellindex readp = 0;
643 cellindex readi = 0;
644 cellindex readl = 0;
645
646 // Find the phrase in the hashFile
647 hashTableFile.seekg(hashOffset);
648 hashTableFile.read((char *) &pointer, sizeof(cellindex));
649
650 if (pointer == 0) {
651 // There is no entry at all in the hash table for this entry
652 // so nothing is stored
653 return false;
654
655 } else {
656 // There is a list starting at this hash value, so the phrase may
657 // be already remembered in that list
658 while (pointer != 0) {
659 // Read the entry pointed to by pointer
660 listOfEntries.seekg(pointer * sizeof(cellindex) * 3);
661 listOfEntries.read((char *) &readp, sizeof(cellindex));
662 listOfEntries.read((char *) &readi, sizeof(cellindex));
663 listOfEntries.read((char *) &readl, sizeof(cellindex));
664
665 if ((readi == index) && (readl = length)) {
666 // we've found the phrase stored here
667 return true;
668 } else {
669 // go on to the next node
670 pointer = readp;
671 }
672 }
673 }
674 return false;
675}
676
677
678void deleteLongPhraseMemory() {
679 // remove the hash & other files
680
681 hashTableFile.close();
682 listOfEntries.close();
683 remove(hashTableFileName);
684 remove(listOfEntriesName);
685
686}
687
688
689// Read the collection statistics file
690//
691void readStatistics() {
692
693 // open the statistics file
694 char filename[FILENAME_MAX];
695 sprintf(filename, "%s/clauses.stats", collection);
696
697 // Open the file
698 ifstream inFile(filename, ios::in);
699 if (!inFile) {
700 cerr << "File " << filename << " could not be opened\n";
701 exit(1);
702 }
703
704 // Read the numbers file into the numbers array
705 char key[1000];
706 symbol value;
707 while (inFile >> key >> value){
708 if (strcmp(key, "first_stopword") == 0) {
709 firstStopSymbol = value;
710 } else if (strcmp(key, "last_stopword") == 0) {
711 lastStopSymbol = value;
712 } else if (strcmp(key, "first_contentword") == 0) {
713 firstContentSymbol = value;
714 } else if (strcmp(key, "last_contentword") == 0) {
715 lastContentSymbol = value;
716 }
717 }
718 inFile.close();
719
720 // Make sure we have the information we need
721 if (!(firstStopSymbol && lastStopSymbol && firstContentSymbol && lastContentSymbol)) {
722 cerr << "Statistics file incomplete" << endl;
723 exit(1);
724 }
725}
726
727cellcount getContentCount(symbol firstContent) {
728
729 cellcount content=0;
730 for (cellcount i=0; i<inputLength; i++) {
731 if (symbols[i]>=firstContent) content++;
732 }
733
734 return content;
735}
736
737int main (int argc, char * argv[]) {
738
739 // Command-line arguments
740 // argv[1] is the phindex directory
741 // argv[2] is the maximum array symbol length (optional)
742 // argv[3] is the mode, where 1 is stopword mode (optional)
743 if (argc < 2) {
744 cerr << "Usage: " << argv[0] << " phind-directory mode [verbosity]" << endl;
745 exit(1);
746 }
747
748 // collection directory
749 strcpy(collection, argv[1]);
750
751 // mode parameter
752 phraseMode = atoi(argv[2]);
753 assert((phraseMode == STOPWORDS) || (phraseMode == ANYPHRASE));
754
755 // optional verbosity parameter
756 if (argc == 4) {
757 verbosity = atoi(argv[3]);
758 assert (verbosity >= 0);
759 }
760
761 if (verbosity) {
762 cout << "suffix: the phrase extraction program" << endl;
763 }
764
765 if (verbosity > 1) {
766 if (phraseMode == STOPWORDS) {
767 cout << "Stopwords mode: no phrase may begin or end with a stopword" << endl;
768 } else {
769 cout << "AllPhrase mode: extract every phrase that occurs more than once" << endl;
770 }
771 }
772
773 // Read the statistics file
774 readStatistics();
775
776 // Read the numbers file
777 readNumbers();
778
779 if (numberOfDocuments == 0) {
780 cerr << "There are no documents in this collection!" << endl;
781 exit(1);
782 }
783
784 symbol firstContent;
785 if (phraseMode==STOPWORDS) firstContent=firstContentSymbol;
786 else firstContent = firstStopSymbol;
787
788 cellcount contentLength = 0;
789 contentLength = getContentCount(firstContent);
790
791 // Create the suffix & prefix arrays
792 suffixArray = new symbol *[contentLength];
793 prefixArray = new symbol *[contentLength];
794
795 cellcount here=0;
796 // Initialise prefix and suffix arrays, only use the needed suffixes
797 for (cellcount j = 0; j < inputLength; j++) {
798 if (symbols[j]>=firstContent) {
799 suffixArray[here] = &symbols[j];
800 prefixArray[here] = &symbols[j];
801 here++;
802 }
803 }
804 qsort(suffixArray, contentLength, sizeof(symbol *), suffixCompare);
805 qsort(prefixArray, contentLength, sizeof(symbol *), prefixCompare);
806
807 checkLength = contentLength/8 + 1;
808 suffixCheck = new check[checkLength];
809 if (suffixCheck == NULL) {
810 cerr << "Suffix error: not enough memory to hold " << inputLength << " symbols." << endl;
811 exit(2);
812 }
813 memset(suffixCheck, 0, sizeof(check)*checkLength);
814
815 cout <<"\ngenerating the phrase hierarchy\n\n";
816
817 // Create the document arrays
818 if (verbosity > 1) {
819 cout << "Allocating document arrays for " << numberOfDocuments << " documents" << endl;
820 }
821
822 // The document frequecy array is used to count the number of times
823 // each phrase occurs in each document. The number of documents in
824 // which a phrase occurs is stored in df.
825 frequency *documentFrequency = new frequency[numberOfDocuments];
826 frequency df;
827
828 // documentArray will be searched in order to discover which document
829 // each phrase occurs in.
830 documentArray = new symbol *[numberOfDocuments];
831
832 // just scan through the input text to find the doc starts
833 cellindex d = 0;
834 for (cellindex i=0; i<inputLength; i++) {
835 if (symbols[i] == DOCUMENTSTART) {
836 documentArray[d] = &symbols[i];
837 d++;
838 }
839 }
840
841 // the phrases stuff is expecting inputLength to be the length of the
842 // suffix array, so change it.
843 inputLength = contentLength;
844
845 // Extract phrases
846 //
847 // We will make several passesover the data, in each case considering
848 // a set of input phrases and generating a set of output phrases, which
849 // we will expancd in later passes.
850 //
851 // The input phrases in the first pass will be the vocabulary.
852 // In later passes, the input phrases will be the output phrases of the
853 // previous pass.
854 //
855 // In each pass we will consider each input phrase in turn. If we
856 // have seen it before, we will ignore it. Otherwise, we will expand
857 // it and add its expansions to the set of output phrases.
858
859 // Store the phrase data in the phrases file
860 char phraseDataName[FILENAME_MAX];
861 sprintf(phraseDataName, "%s/phrases", collection);
862 ofstream phraseData(phraseDataName, ios::out);
863 if (!phraseData) {
864 cout << "File " << phraseDataName << " could not be opened\n";
865 exit(1);
866 }
867
868 // Count the number of phrases output
869 unsigned long int phraseCounter = 0;
870
871 // Set up the phrase expansion memory.
872 // We need this so that we don't expand a phrase more than once
873 initialisePhraseMemory();
874
875 // The current pass numebr
876 int phrasePass = 1;
877
878
879 // PASS NUMBER 1
880 if (verbosity > 1) {
881 cout << "Starting pass " << phrasePass << endl;
882 }
883
884 ofstream outPhrase;
885 char outPhraseName[FILENAME_MAX];
886 unsigned long int outPhraseCounter = 0;
887
888 // On the first pass, simply work through the vocabulary
889 sprintf(outPhraseName, "%s/outPhrase.1", collection);
890 outPhrase.open(outPhraseName, ios::out);
891 if (!outPhrase) {
892 cerr << "File " << outPhraseName << " could not be opened\n";
893 exit(1);
894 }
895
896 // Iterate over the different symbols by working through the suffix array
897 vector<Phrase> result;
898 cellindex ij = 0;
899 char *tmpString;
900
901 Phrase p;
902 while (ij < inputLength) {
903
904 // make a new phrase of length 1
905 p = Phrase(suffixArray[ij], 1, SUFFIX);
906 p.findFirstAndLastSuffix(ij, inputLength-1);
907
908 // We ignore this symbol if it occurs only once, if it is a delimiter,
909 // of if we are in stopwords mode and it is a stopword
910 // - in this new version, only need to check freq
911 // We could imagine a new mode/command-line option, which is like
912 // STOPWORDS but without this restrictrion. This would let you browse
913 // from "the" to "the AGRIS" for example, but not from "AGRIS" to
914 // "the AGRIS" (where the is a stopword and AGRIS a content word).
915 // The system used to work like this; it is easy to implement, but
916 // it explodes the size of the indexes. So: would it be useful?
917 if (p.suffixFrequency > 1) {
918 // Get minimal expansions of the phrase
919 getExpansions(p, result);
920
921 if (!result.empty()) {
922
923 // Remember that we have expanded this phrase
924 rememberThisPhrase(ij, 1);
925
926 // write the phrase text
927 phraseData << ij << "-1:" << p << ":" << p.suffixFrequency << ":"
928 << result.size() << ":";
929
930 // write the results
931 for (cellcount k = 0; k < result.size(); k++) {
932 if (k) {
933 phraseData << ",";
934 }
935 phraseData << result[k].firstSuffixIndex << "-" << result[k].length;
936 outPhrase << result[k].firstSuffixIndex << " " << result[k].length << endl;
937 outPhraseCounter++;
938 }
939 result.clear();
940
941 // Write the documents in which this phrase occurs
942 df = getDocumentOccurrances(p, documentFrequency);
943 phraseData << ":" << df << ":";
944
945 // write the documents
946 for (cellcount m = 0, first = 1; m < numberOfDocuments; m++) {
947 if (documentFrequency[m]) {
948 if (first) {
949 first = 0;
950 } else {
951 phraseData << ";";
952 }
953 // Output the document number. Note that here we've numbered the
954 // N documents from 0 to N-1, but later they'll be 1-N. Thus we
955 // add 1 to the document id when we output it.
956 phraseData << "d" << (m+1);
957 // Next, output the frequency with which the document occurs, but
958 // only if it is > 1.
959 if (documentFrequency[m] > 1) {
960 phraseData << "," << documentFrequency[m];
961 }
962 }
963 }
964
965 phraseData << endl;
966 phraseCounter++;
967
968 // feedback
969 if (verbosity) {
970 if (phraseCounter % 1000 == 0) {
971 cout << "phrase " << phraseCounter << ": "
972 << "cell " << p.firstSuffixIndex << " - " << p << endl;
973 }
974 }
975 }
976 }
977 ij = p.lastSuffixIndex + 1;
978 }
979 outPhrase.close();
980
981 // REMAINING PASSES
982 // The previous outPhrase file forms the input to each new pass
983 cellcount start, length;
984 while (outPhraseCounter > 0) {
985
986 // Start a new pass
987 phrasePass++;
988 if (verbosity) {
989 cout << "Starting pass " << phrasePass << endl;
990 }
991
992 // Open the input file
993 char inPhraseName[FILENAME_MAX];
994 sprintf(inPhraseName, "%s/outPhrase.%d", collection, phrasePass - 1);
995 ifstream inPhrase (inPhraseName, ios::in);
996 if (!inPhrase) {
997 cerr << "File " << inPhraseName << " could not be opened\n";
998 exit(1);
999 }
1000
1001 // Open the output file
1002 sprintf(outPhraseName, "%s/outPhrase.%d", collection, phrasePass);
1003 outPhrase.open(outPhraseName, ios::out);
1004 if (!outPhrase) {
1005 cerr << "File " << outPhraseName << " could not be opened\n";
1006 exit(1);
1007 }
1008 outPhraseCounter = 0;
1009
1010 // Process each phrase
1011 while(inPhrase >> start >> length) {
1012
1013 // Ignore the phrase if we have expanded it before
1014 if (isPhraseStored(start, length))
1015 continue;
1016
1017 // Remember that we have examined this phrase
1018 rememberThisPhrase(start, length);
1019
1020 // Find the phrase in the suffixarray
1021 p = Phrase(suffixArray[start], length, SUFFIX);
1022 p.findFirstAndLastSuffix(start, inputLength-1);
1023
1024 // Ignore the phrase if it only occurs once
1025 if (p.suffixFrequency < 2)
1026 continue;
1027
1028 // Write the phrase text;
1029 phraseData << start << "-" << length << ":" << p << ":" << p.suffixFrequency << ":";
1030
1031 // Expand the phrase, if it is fewer than 8 words long
1032 if (length <= 8) {
1033
1034 // Get the minimal expansions for this phrase
1035 getExpansions(p, result);
1036
1037 // write the results
1038 phraseData << result.size() << ":";
1039
1040 for (cellcount i = 0; i < result.size(); i++) {
1041 if (i) {
1042 phraseData << ",";
1043 }
1044 phraseData << result[i].firstSuffixIndex << "-" << result[i].length;
1045 outPhrase << result[i].firstSuffixIndex << " " << result[i].length << endl;
1046 outPhraseCounter++;
1047 }
1048 result.clear();
1049
1050 } else {
1051 // phrase is too long to expand further
1052 phraseData << "0:";
1053 }
1054
1055 // Write the documents in which this phrase occurs
1056 df = getDocumentOccurrances(p, documentFrequency);
1057 phraseData << ":" << df << ":";
1058
1059 // write the documents
1060 for (cellcount i = 0, first = 1; i < numberOfDocuments; i++) {
1061 if (documentFrequency[i]) {
1062 if (first) {
1063 first = 0;
1064 } else {
1065 phraseData << ";";
1066 }
1067 // Output the document number. Note that here we've numbered the
1068 // N documents from 0 to N-1, but later they'll be 1-N. Thus we
1069 // add 1 to the document id when we output it.
1070 phraseData << "d" << (i+1);
1071 // Next, output the frequency with which the document occurs, but
1072 // only if it is > 1.
1073 if (documentFrequency[i] > 1) {
1074 phraseData << "," << documentFrequency[i];
1075 }
1076 }
1077 }
1078
1079 phraseData << endl;
1080 phraseCounter++;
1081
1082 // feedback
1083 if (verbosity) {
1084 if (phraseCounter % 1000 == 0) {
1085 cout << "phrase " << phraseCounter << ": "<< "start " << start
1086 << ", length " << length << " - " << p << endl;
1087 }
1088 }
1089
1090 }
1091
1092 inPhrase.close();
1093 outPhrase.close();
1094 }
1095
1096 phraseData.close();
1097 deletePhraseMemory();
1098
1099 delete [] documentFrequency;
1100 delete [] symbols;
1101 delete [] suffixArray;
1102 delete [] prefixArray;
1103 delete [] suffixCheck;
1104 delete [] documentArray;
1105
1106
1107
1108 cout << endl << "Done: " << phraseCounter << " phrases in " << phraseDataName << endl;
1109 return 0;
1110}
1111
1112
1113
1114
1115
Note: See TracBrowser for help on using the repository browser.