source: main/trunk/greenstone2/build-src/src/phind/generate/suffix.cpp@ 21356

Last change on this file since 21356 was 10335, checked in by kjdon, 19 years ago

++*pa is not the same as *pa++ - reversed these erroneous changes

  • Property svn:keywords set to Author Date Id Revision
File size: 30.3 KB
Line 
1/**********************************************************************
2 *
3 * Suffix.cpp -- Extract the repeated phrases in the input with suffix
4 * and prefix arrays (cgn & gwp's simpler algorithm,
5 * and kjm's improvements).
6 *
7 * Copyright 2000 Gordon W. Paynter
8 * Copyright 2000 The New Zealand Digital Library Project
9 *
10 * A component of the Greenstone digital library software from the
11 * New Zealand Digital Library Project at the University of Waikato,
12 * New Zealand.
13 *
14 * This program is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU General Public License as
16 * published by the Free Software Foundation; either version 2 of
17 * the License, or (at your option) any later version.
18 *
19 * This program is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 * GNU General Public License for more details.
23 *
24 * You should have received a copy of the GNU General Public License
25 * along with this program; if not, write to the Free Software
26 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27 *
28 *********************************************************************/
29
30#include <assert.h>
31#include <math.h>
32#include <stdio.h>
33#include <stdlib.h>
34#include <string.h>
35
36#if defined(GSDL_USE_IOS_H)
37# include <fstream.h>
38# include <iostream.h>
39#else
40# include <fstream>
41# include <iostream>
42#endif
43
44#if defined(GSDL_USE_STL_H)
45# if defined(GSDL_USE_ALGO_H)
46# include <algo.h>
47# else
48# include <algorithm.h>
49# endif
50# include <vector.h>
51#else
52# include <algorithm>
53# include <vector>
54#endif
55
56#include "suffix.h"
57#include "phrase.h"
58#include "check.h"
59
// Global variables declared in suffix.h

// Number of symbols read from clauses.numbers.  NOTE: main() later
// resets this to the length of the suffix array (content symbols only).
cellcount inputLength;

symbol *symbols;       // the whole collection text, one symbol number per cell
symbol **suffixArray;  // pointers into symbols[], qsorted with suffixCompare
symbol **prefixArray;  // pointers into symbols[], qsorted with prefixCompare

// How many documents are in this collection?
cellcount numberOfDocuments;
symbol **documentArray;  // address of each document's DOCUMENTSTART symbol


// Do we accept any phrase, or do we eliminate those ending with stopwords ?
int phraseMode = ANYPHRASE; //STOPWORDS;
// What is the minimum phrase frequency for a phrase to be included in the hierarchy
int minOccurs = 2;

// The filestem of the collection's phindex directory
char collection[FILENAME_MAX];


// The ranges of the stopword and content-word symbols for the collection;
// filled in by readStatistics() from the clauses.stats file.
symbol firstStopSymbol = 0;
symbol lastStopSymbol = 0;
symbol firstContentSymbol = 0;
symbol lastContentSymbol = 0;


// Some useful comparison functions, defined below.
int suffixCompare(const void *, const void *);
int prefixCompare(const void *, const void *);
int pointerCompare(const void *, const void *);


// Functions for implementing "phrase memory". These let us "remember"
// each phrase that we've expanded without using too much memory.
void initialisePhraseMemory();
void rememberThisPhrase(cellindex index, cellcount length);
bool isPhraseStored(cellindex index, cellcount length);
void deletePhraseMemory();


// how much output do we want?
int verbosity = 1;
104
105
106// Get a phrase's expansions
107//
108// Get the set of "minimal", "maximal", non-unique expansions of a
109// phrase p, using the simpler algorithm that Craig and Gordon came up
110// with at Google.
111//
112// Returns a vector of Expansions.
113
114void getExpansions(Phrase &p, vector<Phrase> &results) {
115
116 // 1. Get the initial candidates
117 vector<Phrase> candidates;
118 p.initialSuffixCandidates(candidates);
119 int suffcands = candidates.size();
120 p.initialPrefixCandidates(candidates);
121
122 if (candidates.size() == 0)
123 return;
124
125 vector<Phrase>::iterator i;
126 for (i = candidates.begin(); i != candidates.end(), suffcands>0; ++i, --suffcands) {
127 i->expandWhileUniquePrefixExtension();
128 i->ensureSuffixFound();
129 }
130 for (i; i != candidates.end(); ++i) {
131 i->expandWhileUniqueSuffixExtension();
132 }
133
134 // 3. Ensure minimality: ignore phrases whose subphrases are also found
135 results.clear();
136
137 // Initialise the candidates, check array, and various variables.
138 sort(candidates.begin(), candidates.end(), isShorter);
139 unsigned minimum_length = candidates.begin()->length;
140 clearSuffixCheck();
141
142 // Try to add each candidate to the results set, ignoring the non-minimal
143 for (vector<Phrase>::iterator candidate = candidates.begin();
144 candidate != candidates.end(); ++candidate) {
145
146 // Make a copy of candidate to mutilate while performing sub-phrase checks
147 Phrase temp_phrase(*candidate);
148 bool shorter_found = false;
149
150 // Check for shorter and shorter versions of the temporary phrase
151 while (temp_phrase.length >= minimum_length && !shorter_found) {
152 temp_phrase.ensureSuffixFound();
153 if (getSuffixCheck(temp_phrase.firstSuffixIndex)==0)
154 temp_phrase.shortenByOneAtPrefix();
155 else
156 shorter_found = true;
157
158 // Possible efficiency here: we can finish if the prefix of c
159 // and temp_phrase are the same for candidate->length symbols.
160 }
161
162 // If no shorter phrase is found, use this one
163 if (!shorter_found) {
164 results.push_back(*candidate);
165 candidate->ensureSuffixFound();
166 setSuffixCheck(candidate->firstSuffixIndex, candidate->lastSuffixIndex);
167 }
168 }
169}
170
171
172// suffixCompare
173//
174// Compare two pointers into a suffix array. We use this in the
175// qsort function, so the input are pointers to pointers.
176//
177// Return -1 if (a < b), otherwise (a > b) so return +1,
178
179int suffixCompare(const void *cpa, const void *cpb) {
180
181 // Cast then dereference pointers to suffix array elements
182 symbol *pa = (symbol *) cpa;
183 symbol *pb = (symbol *) cpb;
184 pa = (symbol *) *pa;
185 pb = (symbol *) *pb;
186
187 // If the two elements are the same, examine the next one
188 while (*pa == *pb) {
189 *pa++;
190 *pb++;
191 }
192
193 // Make the copmparison and return
194 if ( *pa < *pb) {
195 return -1;
196 } else {
197 return +1;
198 }
199}
200
201
202// prefixCompare
203//
204// Compare two pointers into a prefix array. We use this in the
205// qsort function, so the input are pointers to pointers.
206//
207// Return -1 if (a > b), otherwise (a < b) so return +1,
208
209int prefixCompare(const void *cpa, const void *cpb) {
210
211 // Cast then dereference pointers to prefix array elements
212 symbol *pa = (symbol *) cpa;
213 symbol *pb = (symbol *) cpb;
214 pa = (symbol *) *pa;
215 pb = (symbol *) *pb;
216
217 // If the two elements are the same, examine the next one
218 while (*pa == *pb) {
219 *pa--;
220 *pb--;
221 }
222
223 // Make the copmparison and return
224 if ( *pa > *pb) {
225 return -1;
226 } else {
227 return +1;
228 }
229}
230
231// simpleCompare
232//
233// Compare two pointers based on the memory location they point to.
234
235int pointerCompare( const void *pa, const void *pb ) {
236
237 symbol **a = (symbol **) pa;
238 symbol **b = (symbol **) pb;
239
240 if (*a < *b) {
241 return -1;
242 } else if (*a > *b) {
243 return 1;
244 } else {
245 return 0;
246 }
247}
248
249
// Read the clauses.numbers file into the "symbols" array.
//
// Each number in the file is a symbol number; it is essential that
// the first symbol (and no others) be COLLECTIONSTART and the last
// symbol (and no others) be COLLECTIONEND.
//
// Return the number of numbers in the array.

int readNumbers() {

  char filename[FILENAME_MAX];
  sprintf(filename, "%s/clauses.numbers", collection);
  if (verbosity) {
    cout << "Reading numbers file: " << filename << endl;
  }

  // Open the numbers file
  ifstream inFile1(filename, ios::in);
  if (!inFile1) {
    cerr << "File " << filename << " could not be opened\n";
    exit(1);
  }

  // First pass: count the symbols so the array can be sized exactly
  inputLength = 0;
  symbol word;
  while (inFile1 >> word) {
    ++inputLength;
  }
  inFile1.close();

  // Allocate the symbols array
  if (verbosity > 1) {
    cout << "Allocating symbol arrays for " << inputLength << " symbols" << endl;
  }
  symbols = new symbol[inputLength];
  if (symbols == NULL) {
    cerr << "Suffix error: not enough memory to hold " << inputLength
         << " symbols." << endl;
    exit(2);
  }

  // Second pass: re-open the file and read the numbers into the
  // symbols array, counting DOCUMENTSTART markers as we go
  if (verbosity > 2) {
    cout << "Reading the numbers" << endl;
  }

  ifstream inFile2(filename, ios::in);
  if (!inFile2) {
    cerr << "File " << filename << " could not be opened\n";
    exit(1);
  }
  cellcount next = 0;
  numberOfDocuments = 0;
  while (inFile2 >> word) {
    symbols[next++] = word;
    if (word == DOCUMENTSTART) {
      ++numberOfDocuments;
    }
  }
  inFile2.close();

  // Make sure the numbers file is intact
  assert(symbols[0] == COLLECTIONSTART);
  assert(symbols[next-1] == COLLECTIONEND);

  return inputLength;
}
318
319
320
// Get Document Occurrence statistics
//
// Given a phrase, what documents does it occur in?  Fills frequency[]
// (one counter per document) and returns the document frequency, i.e.
// the number of distinct documents in which the phrase appears.

cellcount getDocumentOccurrances(const Phrase &p, cellcount *frequency) {

  // cout << "searching for \""<< p << "\" in documents "
  //      << 0 << "-" << numberOfDocuments - 1 << endl;

  // The number of documents in which this phrase occurs
  cellcount df = 0;

  // Initialise the document frequency array
  // for (cellindex i = 0; i < numberOfDocuments; ++i) {
  //  frequency[i] = 0;
  //}
  memset(frequency, 0, sizeof(cellcount)*numberOfDocuments);

  // variables used to facilitate the search
  cellindex begin;
  cellindex end;
  cellindex d;
  symbol *target;
  bool found;

  // search for the document in which each occurrence of the phrase is found;
  // each suffixArray cell in [firstSuffixIndex, lastSuffixIndex] is one
  // occurrence of the phrase
  for (cellcount j = p.firstSuffixIndex; j <= p.lastSuffixIndex; ++j) {

    // cout << "looking for phrase at suffixArray[" << j << "]\n";

    target = suffixArray[j];
    begin = 0;
    end = numberOfDocuments - 1;
    found = false;

    // Search for the occurrence of a document delimiter that target
    // occurs immediately after.
    // We do this by performing a binary chop search on documentArray.
    // (This works because documentArray holds addresses in increasing
    // order, and an occurrence belongs to document d iff its address
    // lies between documentArray[d] and documentArray[d+1].)
    while (!found) {

      // cout << "searching for " << (cellindex) target << " in "
      //      << begin << " - " << end << endl;

      assert (begin <= end);

      // If the beginning and end of the interval are the same,
      // then we've found the correct document
      if (begin == end) {
	if (frequency[begin] == 0) {
	  ++df;
	}
	++frequency[begin];
	found = true;
      }

      // Otherwise, examine a new document midway through the begin-end
      // interval and see if it is the one.
      else {
	d = (begin + end) / 2;
	if (target > documentArray[d]) {
	  // If target address is greater than this, but this is the last document,
	  // then this must be the one we want.  Or, if target is greater than
	  // this one but less than the next, this must be the one we want.
	  if ((d == numberOfDocuments - 1) || (target < documentArray[d+1])) {
	    if (frequency[d] == 0) {
	      ++df;
	    }
	    ++frequency[d];
	    found = true;
	  } else {
	    // otherwise we know to search later in the document set
	    begin = d + 1;
	  }
	} else {
	  // search earlier in the document set
	  end = d - 1;
	}
      }
    }
  }
  return df;
}
403
404
405
406
407
408
409// phraseExpansionMemory : Which phrases have we expanded?
410//
411// A set of utilities for keeping track of which phrases we have expanded.
412// We don't want to expand a phrase more than once, after all.
413//
414// This REALLY ought to be in its own class, but it works so that's okay.
415//
416// Phrases are identified by their firstSuffixPosition and length.
417//
418// Functions provided are:
419// void initialisePhraseMemory()
420// void rememberThisPhrase(index, length)
421// bool isPhraseStored(index, length)
422// void deletePhraseMemory()
423//
424// Internally, we will have two separate cases:
425//
426// Phrases of length 1-8:
427// unsigned char phraseMemory[inputLength]
428// is an array where each cell "remembers" the corresponding index in the
429// suffixArray, and each of the 8 bits of the cell correspond to the phrases
430// of length 1, 2... 8.
431// Eventually, we will make this disk-based (i.e. store the array in a file).
432//
433// Phrases of length 9+:
434// file hashTableFile
435// file listOfEntries
// The first file is a hash table; each phrase maps to one of its cells, which
// contains either 0 (empty, no occurrence) or a number which is an entry number
// in the second file. This file contains a "list" of entries. Each consists of
// three numbers: the suffixArray index of the phrase, the length of the phrase,
// and the entry number of the next phrase with the same hash.
441//
442
443
// Bitmap for phrases of length 1-8: one byte per suffix-array cell,
// where bit (length-1) is set once that phrase has been expanded.
unsigned char *phraseMemory;

// Disk-based equivalents, used for phrases of 9 or more symbols.
void initialiseLongPhraseMemory();
void rememberThisLongPhrase(cellindex index, cellcount length);
bool isLongPhraseStored(cellindex index, cellcount length);
void deleteLongPhraseMemory();
450
451
452void initialisePhraseMemory() {
453
454 phraseMemory = new unsigned char[inputLength];
455
456 // to begin with, everything is empty
457 // for (cellcount i = 0; i < inputLength; ++i) {
458 // phraseMemory[i] = 0;
459 //}
460 memset(phraseMemory, 0, sizeof(char)*inputLength);
461
462 // intialise the hashTable of long phrases
463 initialiseLongPhraseMemory();
464
465}
466
467void rememberThisPhrase(cellindex index, cellcount length) {
468
469 // if the phrase is very long, use the file-based system
470 if (length > 8) {
471 rememberThisLongPhrase(index, length);
472 return;
473 }
474
475 // create a char with just the bit corresponding to length set
476 unsigned char newbit = 1;
477 for (cellcount i = 1; i < length; ++i) {
478 newbit <<= 1;
479 }
480
481 // set that bit in the memory array at position index
482 phraseMemory[index] |= newbit;
483}
484
485
486bool isPhraseStored(cellindex index, cellcount length) {
487
488 // if the phrase is very long, use the file-based system
489 if (length > 8) {
490 return isLongPhraseStored(index, length);
491 }
492
493 // create a char with just the bit corresponding to length set
494 unsigned char newbit = 1;
495 for (cellcount i = 1; i < length; ++i) {
496 newbit <<= 1;
497 }
498
499 // retrurn true if that bit is set in memory arrayat position index
500 return (phraseMemory[index] & newbit);
501}
502
// Release the in-memory bitmap and remove the long-phrase files.
void deletePhraseMemory() {
  delete []phraseMemory;
  deleteLongPhraseMemory();
}
507
508
509
// Files etc used to store "long" equivalents of the above

fstream hashTableFile;                 // fixed-size table of list-head entry numbers
char hashTableFileName[FILENAME_MAX];
fstream listOfEntries;                 // chained records: (next, index, length)
char listOfEntriesName[FILENAME_MAX];
cellindex nextEntryNumber;             // next free record number in listOfEntries

// Number of cells in the hash table (prime, to spread the hash values)
const cellcount bigPrime = 7919;
519
520
// Create and zero the two files that implement the long-phrase store:
// the hash table (bigPrime cells of 0 = "no list here") and the entry
// list (record 0 is a dummy so that entry number 0 can mean "null").
void initialiseLongPhraseMemory() {

  // An all-zero cell used to fill both files
  cellindex example = 0;

  sprintf(hashTableFileName, "%s/hashTable", collection);
  sprintf(listOfEntriesName, "%s/hashLists", collection);


  // create the new hashtable
  // NOTE(review): ios::in | ios::out does not create a missing file on
  // conforming iostream implementations -- presumably these files
  // already exist or the (pre-standard) library in use creates them;
  // confirm before porting.
  if (verbosity > 1) {
    cout << "Initialising hashTable: " << hashTableFileName << endl;
  }
  hashTableFile.open(hashTableFileName, ios::in | ios::out);
  for (cellcount i = 0; i < bigPrime; ++i) {
    hashTableFile.write((char *) &example, sizeof(example));
  }

  // create the list of phrases
  if (verbosity > 1) {
    cout << "Initialising list of hashtable entries: " << listOfEntriesName << endl;
  }
  listOfEntries.open(listOfEntriesName, ios::in | ios::out);
  // write dummy record 0: entry numbering starts at 1
  listOfEntries.write((char *) &example, sizeof(example));
  listOfEntries.write((char *) &example, sizeof(example));
  listOfEntries.write((char *) &example, sizeof(example));
  nextEntryNumber = 1;
}
548
549
550void rememberThisLongPhrase(cellindex index, cellcount length) {
551
552 // cout << "rememberThisLongPhrase(" << index << ", " << length << ")\n";
553
554 cellindex hashOffset = ((index + length) % bigPrime) * sizeof(cellindex);
555 cellindex pointer;
556 cellindex zero = 0;
557 cellindex readp = 0;
558 cellindex readi = 0;
559 cellindex readl = 0;
560
561 hashTableFile.seekg(hashOffset);
562 hashTableFile.read((char *) &pointer, sizeof(cellindex));
563
564 if (pointer == 0) {
565 // There is no entry at all in the hash table for this entry
566 // so create one
567
568 pointer = nextEntryNumber++;
569 hashTableFile.seekg(hashOffset);
570 hashTableFile.write((char *) &pointer, sizeof(cellindex));
571
572 listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
573 listOfEntries.write((char *) &zero, sizeof(cellindex));
574 listOfEntries.write((char *) &index, sizeof(cellindex));
575 listOfEntries.write((char *) &length, sizeof(cellindex));
576
577 } else {
578 // There is a list starting at this hash value, so the phrase may
579 // be already remembered, or it might need to be appended
580
581 while (pointer != 0) {
582 // Read the entry pointed to by pointer
583 listOfEntries.seekg(pointer * sizeof(cellindex) * 3);
584 listOfEntries.read((char *) &readp, sizeof(cellindex));
585 listOfEntries.read((char *) &readi, sizeof(cellindex));
586 listOfEntries.read((char *) &readl, sizeof(cellindex));
587
588 // cout << "read " << pointer << ", " << readp << ", " << readi << ", " << readl << endl;
589
590 if ((readi == index) && (readl = length)) {
591 // we've found that we've already stored it
592 return;
593 } else if (readp == 0) {
594 // we're reached the end of the list. Add a new entry.
595 listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
596 listOfEntries.write((char *) &nextEntryNumber, sizeof(cellindex));
597 pointer = nextEntryNumber++;
598
599 listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
600 listOfEntries.write((char *) &zero, sizeof(cellindex));
601 listOfEntries.write((char *) &index, sizeof(cellindex));
602 listOfEntries.write((char *) &length, sizeof(cellindex));
603 return;
604 } else {
605 // go on to the next node
606 pointer = readp;
607 }
608 }
609 }
610
611
612}
613
614
615bool isLongPhraseStored(cellindex index, cellcount length) {
616
617 // cout << "isLongPhraseExpanded(" << index << ", " << length << ")\n";
618
619 cellindex hashOffset = ((index + length) % bigPrime) * sizeof(cellindex);
620 cellindex pointer;
621 cellindex readp = 0;
622 cellindex readi = 0;
623 cellindex readl = 0;
624
625 // Find the phrase in the hashFile
626 hashTableFile.seekg(hashOffset);
627 hashTableFile.read((char *) &pointer, sizeof(cellindex));
628
629 if (pointer == 0) {
630 // There is no entry at all in the hash table for this entry
631 // so nothing is stored
632 return false;
633
634 } else {
635 // There is a list starting at this hash value, so the phrase may
636 // be already remembered in that list
637 while (pointer != 0) {
638 // Read the entry pointed to by pointer
639 listOfEntries.seekg(pointer * sizeof(cellindex) * 3);
640 listOfEntries.read((char *) &readp, sizeof(cellindex));
641 listOfEntries.read((char *) &readi, sizeof(cellindex));
642 listOfEntries.read((char *) &readl, sizeof(cellindex));
643
644 if ((readi == index) && (readl = length)) {
645 // we've found the phrase stored here
646 return true;
647 } else {
648 // go on to the next node
649 pointer = readp;
650 }
651 }
652 }
653 return false;
654}
655
656
657void deleteLongPhraseMemory() {
658 // remove the hash & other files
659
660 hashTableFile.close();
661 listOfEntries.close();
662 remove(hashTableFileName);
663 remove(listOfEntriesName);
664
665}
666
667
668// Read the collection statistics file
669//
670void readStatistics() {
671
672 // open the statistics file
673 char filename[FILENAME_MAX];
674 sprintf(filename, "%s/clauses.stats", collection);
675
676 // Open the file
677 ifstream inFile(filename, ios::in);
678 if (!inFile) {
679 cerr << "File " << filename << " could not be opened\n";
680 exit(1);
681 }
682
683 // Read the numbers file into the numbers array
684 char key[1000];
685 symbol value;
686 while (inFile >> key >> value){
687 if (strcmp(key, "first_stopword") == 0) {
688 firstStopSymbol = value;
689 } else if (strcmp(key, "last_stopword") == 0) {
690 lastStopSymbol = value;
691 } else if (strcmp(key, "first_contentword") == 0) {
692 firstContentSymbol = value;
693 } else if (strcmp(key, "last_contentword") == 0) {
694 lastContentSymbol = value;
695 }
696 }
697 inFile.close();
698
699 // Make sure we have the information we need
700 if (!(firstStopSymbol && lastStopSymbol && firstContentSymbol && lastContentSymbol)) {
701 cerr << "Statistics file incomplete" << endl;
702 exit(1);
703 }
704}
705
706cellcount getContentCount(symbol firstContent) {
707
708 cellcount content=0;
709 for (cellcount i=0; i<inputLength; ++i) {
710 if (symbols[i]>=firstContent) ++content;
711 }
712
713 return content;
714}
715
716
int main (int argc, char * argv[]) {

  // Command-line arguments
  // argv[1] is the phindex directory
  // argv[2] is the mode, where 1 is stopword mode
  // argv[3] is the min_occurs, - minimum occurrence frequency for a phrase to be included in the hierarchy
  // argv[4] is optional verbosity
  if (argc < 4) {
    cerr << "Usage: " << argv[0] << " phind-directory mode min-phrase-freq [verbosity]" << endl;
    exit(1);
  }

  // collection directory
  strcpy(collection, argv[1]);

  // mode parameter
  phraseMode = atoi(argv[2]);
  assert((phraseMode == STOPWORDS) || (phraseMode == ANYPHRASE));

  minOccurs = atoi(argv[3]);
  assert((minOccurs > 0));

  // optional verbosity parameter
  if (argc == 5) {
    verbosity = atoi(argv[4]);
    assert (verbosity >= 0);
  }

  if (verbosity) {
    cout << "suffix: the phrase extraction program" << endl;
  }
  if (verbosity > 1) {
    if (phraseMode == STOPWORDS) {
      cout << "Stopwords mode: no phrase may begin or end with a stopword" << endl;
    } else {
      cout << "AllPhrase mode: extract every phrase that occurs more than once" << endl;
    }
  }

  // Read the statistics file
  readStatistics();
  // Read the numbers file
  readNumbers();

  if (numberOfDocuments == 0) {
    cerr << "There are no documents in this collection!" << endl;
    exit(0); // not necessarily an error...
  }

  // In stopwords mode only content words are indexed; otherwise every
  // word symbol (stopwords included) is.
  symbol firstContent;
  if (phraseMode==STOPWORDS) firstContent=firstContentSymbol;
  else firstContent = firstStopSymbol;

  // Allocate memory for the suffix & prefix arrays
  cellcount contentLength = 0;
  contentLength = getContentCount(firstContent);
  suffixArray = new symbol *[contentLength];
  prefixArray = new symbol *[contentLength];
  if (prefixArray == NULL) {
    cerr << "Suffix: not enough memory to hold " << inputLength << " symbols." << endl;
    exit(2);
  }
  allocateSuffixCheck(contentLength);
  // Initialise prefix and suffix arrays, only use the needed suffixes
  for (cellcount j = 0, here = 0; j < inputLength; ++j) {
    if (symbols[j]>=firstContent) {
      suffixArray[here] = &symbols[j];
      prefixArray[here] = &symbols[j];
      ++here;
    }
  }
  qsort(suffixArray, contentLength, sizeof(symbol *), suffixCompare);
  qsort(prefixArray, contentLength, sizeof(symbol *), prefixCompare);
  // Create the document arrays
  if (verbosity > 1) {
    cout << "Allocating document arrays for " << numberOfDocuments << " documents" << endl;
  }

  // The document frequency array is used to count the number of times
  // each phrase occurs in each document. The number of documents in
  // which a phrase occurs is stored in df.
  frequency *documentFrequency = new frequency[numberOfDocuments];
  frequency df;

  // documentArray will be searched in order to discover which document
  // each phrase occurs in.
  documentArray = new symbol *[numberOfDocuments];
  if (documentArray == NULL) {
    cerr << "Suffix: out of memory allocating document arrays." << endl;
    exit(2);
  }

  // just scan through the input text to find the doc starts
  cellindex d = 0;
  for (cellindex i=0; i<inputLength; ++i) {
    if (symbols[i] == DOCUMENTSTART) {
      documentArray[d] = &symbols[i];
      ++d;
    }
  }

  // the phrases stuff is expecting inputLength to be the length of the
  // suffix array, so change it.
  inputLength = contentLength;

  // Extract phrases
  //
  // We will make several passes over the data, in each case considering
  // a set of input phrases and generating a set of output phrases, which
  // we will expand in later passes.
  //
  // The input phrases in the first pass will be the vocabulary.
  // In later passes, the input phrases will be the output phrases of the
  // previous pass.
  //
  // In each pass we will consider each input phrase in turn. If we
  // have seen it before, we will ignore it. Otherwise, we will expand
  // it and add its expansions to the set of output phrases.

  cout <<"\ngenerating the phrase hierarchy\n\n";

  // Store the phrase data in the phrases file
  char phraseDataName[FILENAME_MAX];
  sprintf(phraseDataName, "%s/phrases", collection);
  ofstream phraseData(phraseDataName, ios::out);
  if (!phraseData) {
    cout << "File " << phraseDataName << " could not be opened\n";
    exit(1);
  }

  // Count the number of phrases output
  unsigned long int phraseCounter = 0;

  // Set up the phrase expansion memory.
  // We need this so that we don't expand a phrase more than once
  initialisePhraseMemory();

  // The current pass number
  int phrasePass = 1;


  // PASS NUMBER 1
  if (verbosity > 1) {
    cout << "Starting pass " << phrasePass << endl;
  }

  ofstream outPhrase;
  char outPhraseName[FILENAME_MAX];
  unsigned long int outPhraseCounter = 0;

  // On the first pass, simply work through the vocabulary
  sprintf(outPhraseName, "%s/outPhrase.1", collection);
  outPhrase.open(outPhraseName, ios::out);
  if (!outPhrase) {
    cerr << "File " << outPhraseName << " could not be opened\n";
    exit(1);
  }

  // Iterate over the different symbols by working through the suffix array
  vector<Phrase> result;
  cellindex ij = 0;
  char *tmpString;  // note: unused

  Phrase p;
  while (ij < inputLength) {

    // make a new phrase of length 1
    p = Phrase(suffixArray[ij], 1, SUFFIX);
    p.findFirstAndLastSuffix(ij, inputLength-1);

    // We ignore this symbol if it occurs only once, if it is a delimiter,
    // or if we are in stopwords mode and it is a stopword
    // - in this new version, only need to check freq
    // We could imagine a new mode/command-line option, which is like
    // STOPWORDS but without this restriction. This would let you browse
    // from "the" to "the AGRIS" for example, but not from "AGRIS" to
    // "the AGRIS" (where the is a stopword and AGRIS a content word).
    // The system used to work like this; it is easy to implement, but
    // it explodes the size of the indexes. So: would it be useful?
    if (p.suffixFrequency >= minOccurs) {
      // Get minimal expansions of the phrase
      getExpansions(p, result);

      if (!result.empty()) {

	// Remember that we have expanded this phrase
	rememberThisPhrase(ij, 1);

	// write the phrase text
	phraseData << ij << "-1:" << p << ":" << p.suffixFrequency << ":"
		   << result.size() << ":";

	// write the results
	for (cellcount k = 0; k < result.size(); ++k) {
	  if (k) {
	    phraseData << ",";
	  }
	  phraseData << result[k].firstSuffixIndex << "-" << result[k].length;
	  outPhrase << result[k].firstSuffixIndex << " " << result[k].length << endl;
	  ++outPhraseCounter;
	}
	result.clear();

	// Write the documents in which this phrase occurs
	df = getDocumentOccurrances(p, documentFrequency);
	phraseData << ":" << df << ":";

	// write the documents
	for (cellcount m = 0, first = 1; m < numberOfDocuments; ++m) {
	  if (documentFrequency[m]) {
	    if (first) {
	      first = 0;
	    } else {
	      phraseData << ";";
	    }
	    // Output the document number. Note that here we've numbered the
	    // N documents from 0 to N-1, but later they'll be 1-N. Thus we
	    // add 1 to the document id when we output it.
	    phraseData << "d" << (m+1);
	    // Next, output the frequency with which the document occurs, but
	    // only if it is > 1.
	    if (documentFrequency[m] > 1) {
	      phraseData << "," << documentFrequency[m];
	    }
	  }
	}

	phraseData << endl;
	++phraseCounter;

	// feedback
	if (verbosity) {
	  if (phraseCounter % 1000 == 0) {
	    cout << "phrase " << phraseCounter << ": "
		 << "cell " << p.firstSuffixIndex << " - " << p << endl;
	  }
	}
      }
    }
    // skip ahead over the identical suffixes we just processed
    ij = p.lastSuffixIndex + 1;
  }
  outPhrase.close();

  // REMAINING PASSES
  // The previous outPhrase file forms the input to each new pass
  cellcount start, length;
  while (outPhraseCounter > 0) {

    // Start a new pass
    ++phrasePass;
    if (verbosity) {
      cout << "Starting pass " << phrasePass << endl;
    }

    // Open the input file
    char inPhraseName[FILENAME_MAX];
    sprintf(inPhraseName, "%s/outPhrase.%d", collection, phrasePass - 1);
    ifstream inPhrase (inPhraseName, ios::in);
    if (!inPhrase) {
      cerr << "File " << inPhraseName << " could not be opened\n";
      exit(1);
    }

    // Open the output file
    sprintf(outPhraseName, "%s/outPhrase.%d", collection, phrasePass);
    outPhrase.open(outPhraseName, ios::out);
    if (!outPhrase) {
      cerr << "File " << outPhraseName << " could not be opened\n";
      exit(1);
    }
    outPhraseCounter = 0;

    // Process each phrase
    while(inPhrase >> start >> length) {

      // Ignore the phrase if we have expanded it before
      if (isPhraseStored(start, length))
	continue;

      // Remember that we have examined this phrase
      rememberThisPhrase(start, length);

      // Find the phrase in the suffixarray
      p = Phrase(suffixArray[start], length, SUFFIX);
      p.findFirstAndLastSuffix(start, inputLength-1);

      // Ignore the phrase if it occurs too rarely
      if (p.suffixFrequency < minOccurs)
	continue;

      // Write the phrase text;
      phraseData << start << "-" << length << ":" << p << ":" << p.suffixFrequency << ":";

      // Expand the phrase, if it is fewer than 8 words long
      if (length <= 8) {

	// Get the minimal expansions for this phrase
	getExpansions(p, result);

	// write the results
	phraseData << result.size() << ":";

	for (cellcount i = 0; i < result.size(); ++i) {
	  if (i) {
	    phraseData << ",";
	  }
	  phraseData << result[i].firstSuffixIndex << "-" << result[i].length;
	  outPhrase << result[i].firstSuffixIndex << " " << result[i].length << endl;
	  ++outPhraseCounter;
	}
	result.clear();

      } else {
	// phrase is too long to expand further
	phraseData << "0:";
      }

      // Write the documents in which this phrase occurs
      df = getDocumentOccurrances(p, documentFrequency);
      phraseData << ":" << df << ":";

      // write the documents
      for (cellcount i = 0, first = 1; i < numberOfDocuments; ++i) {
	if (documentFrequency[i]) {
	  if (first) {
	    first = 0;
	  } else {
	    phraseData << ";";
	  }
	  // Output the document number. Note that here we've numbered the
	  // N documents from 0 to N-1, but later they'll be 1-N. Thus we
	  // add 1 to the document id when we output it.
	  phraseData << "d" << (i+1);
	  // Next, output the frequency with which the document occurs, but
	  // only if it is > 1.
	  if (documentFrequency[i] > 1) {
	    phraseData << "," << documentFrequency[i];
	  }
	}
      }

      phraseData << endl;
      ++phraseCounter;

      // feedback
      if (verbosity) {
	if (phraseCounter % 1000 == 0) {
	  cout << "phrase " << phraseCounter << ": "<< "start " << start
	       << ", length " << length << " - " << p << endl;
	}
      }

    }

    inPhrase.close();
    outPhrase.close();
  }

  phraseData.close();
  deletePhraseMemory();

  delete [] documentFrequency;
  delete [] symbols;
  delete [] suffixArray;
  delete [] prefixArray;
  delete [] suffixCheck;
  delete [] documentArray;



  cout << endl << "Done: " << phraseCounter << " phrases in " << phraseDataName << endl;
  return 0;
}
1090
1091
1092
1093
1094
Note: See TracBrowser for help on using the repository browser.