source: main/trunk/greenstone2/build-src/src/phind/generate/suffix.cpp@ 26135

Last change on this file since 26135 was 26135, checked in by ak19, 12 years ago

Changing unsigned long to mg_u_long in Phind's suffix.exe program. This did not help to get suffix to work on 64 bit linux, but the changes have not broken it on Windows either (where it already worked). Therefore, since the unsigned long to mg_u_long changes bring this part of the code up to speed with the same changes made elsewhere in the C code, am still committing this.

  • Property svn:keywords set to Author Date Id Revision
File size: 30.3 KB
1/**********************************************************************
2 *
3 * Suffix.cpp -- Extract the repeated phrases in the input with suffix
4 * and prefix arrays (cgn & gwp's simpler algorithm,
5 * and kjm's improvements).
6 *
7 * Copyright 2000 Gordon W. Paynter
8 * Copyright 2000 The New Zealand Digital Library Project
9 *
10 * A component of the Greenstone digital library software from the
11 * New Zealand Digital Library Project at the University of Waikato,
12 * New Zealand.
13 *
14 * This program is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU General Public License as
16 * published by the Free Software Foundation; either version 2 of
17 * the License, or (at your option) any later version.
18 *
19 * This program is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 * GNU General Public License for more details.
23 *
24 * You should have received a copy of the GNU General Public License
25 * along with this program; if not, write to the Free Software
26 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27 *
28 *********************************************************************/
29
30#include <assert.h>
31#include <math.h>
32#include <stdio.h>
33#include <stdlib.h>
34#include <string.h>
35
36#if defined(GSDL_USE_IOS_H)
37# include <fstream.h>
38# include <iostream.h>
39#else
40# include <fstream>
41# include <iostream>
42#endif
43
44#if defined(GSDL_USE_STL_H)
45# if defined(GSDL_USE_ALGO_H)
46# include <algo.h>
47# else
48# include <algorithm.h>
49# endif
50# include <vector.h>
51#else
52# include <algorithm>
53# include <vector>
54#endif
55
56#include "suffix.h"
57#include "phrase.h"
58#include "check.h"
59#include "mglong.h"
60
61// Global variables declared in suffix.h
62cellcount inputLength;
63
64symbol *symbols;
65symbol **suffixArray;
66symbol **prefixArray;
67
68// How many documents are in this collection?
69cellcount numberOfDocuments;
70symbol **documentArray;
71
72
73// Do we accept any phrase, or do we eliminate those ending with stopwords ?
74int phraseMode = ANYPHRASE; //STOPWORDS;
75// What is the minimum phrase frequency for a phrase to be included in the hierarchy?
76int minOccurs = 2;
77
78// The filestem of the collection's phindex directory
79char collection[FILENAME_MAX];
80
81
82// The ranges of the stopword and content-word symbols for the collection
83symbol firstStopSymbol = 0;
84symbol lastStopSymbol = 0;
85symbol firstContentSymbol = 0;
86symbol lastContentSymbol = 0;
87
88
89// Some useful comparison functions, defined below.
90int suffixCompare(const void *, const void *);
91int prefixCompare(const void *, const void *);
92int pointerCompare(const void *, const void *);
93
94
95// Functions for implementing "phrase memory". These let us "remember"
96// each phrase that we've expanded without using too much memory.
97void initialisePhraseMemory();
98void rememberThisPhrase(cellindex index, cellcount length);
99bool isPhraseStored(cellindex index, cellcount length);
100void deletePhraseMemory();
101
102
103// how much output do we want?
104int verbosity = 1;
105
106
107// Get a phrase's expansions
108//
109// Get the set of "minimal", "maximal", non-unique expansions of a
110// phrase p, using the simpler algorithm that Craig and Gordon came up
111// with at Google.
112//
113// Returns a vector of Expansions.
114
115void getExpansions(Phrase &p, vector<Phrase> &results) {
116
117 // 1. Get the initial candidates
118 vector<Phrase> candidates;
119 p.initialSuffixCandidates(candidates);
120 int suffcands = candidates.size();
121 p.initialPrefixCandidates(candidates);
122
123 if (candidates.size() == 0)
124 return;
125
126 vector<Phrase>::iterator i;
 127 for (i = candidates.begin(); i != candidates.end() && suffcands > 0; ++i, --suffcands) {
128 i->expandWhileUniquePrefixExtension();
129 i->ensureSuffixFound();
130 }
 131 for ( ; i != candidates.end(); ++i) {
132 i->expandWhileUniqueSuffixExtension();
133 }
134
135 // 3. Ensure minimality: ignore phrases whose subphrases are also found
136 results.clear();
137
138 // Initialise the candidates, check array, and various variables.
139 sort(candidates.begin(), candidates.end(), isShorter);
140 unsigned minimum_length = candidates.begin()->length;
141 clearSuffixCheck();
142
143 // Try to add each candidate to the results set, ignoring the non-minimal
144 for (vector<Phrase>::iterator candidate = candidates.begin();
145 candidate != candidates.end(); ++candidate) {
146
147 // Make a copy of candidate to mutilate while performing sub-phrase checks
148 Phrase temp_phrase(*candidate);
149 bool shorter_found = false;
150
151 // Check for shorter and shorter versions of the temporary phrase
152 while (temp_phrase.length >= minimum_length && !shorter_found) {
153 temp_phrase.ensureSuffixFound();
154 if (getSuffixCheck(temp_phrase.firstSuffixIndex)==0)
155 temp_phrase.shortenByOneAtPrefix();
156 else
157 shorter_found = true;
158
 159 // Possible efficiency here: we can finish if the prefixes of candidate
 160 // and temp_phrase are the same for candidate->length symbols.
161 }
162
163 // If no shorter phrase is found, use this one
164 if (!shorter_found) {
165 results.push_back(*candidate);
166 candidate->ensureSuffixFound();
167 setSuffixCheck(candidate->firstSuffixIndex, candidate->lastSuffixIndex);
168 }
169 }
170}
171
172
173// suffixCompare
174//
175// Compare two pointers into a suffix array. We use this in the
 176// qsort function, so the inputs are pointers to pointers.
177//
 178// Return -1 if (a < b); otherwise (a > b), so return +1.
179
180int suffixCompare(const void *cpa, const void *cpb) {
181
 182 // Cast then dereference pointers to suffix array elements
 183 symbol *pa = *(symbol **) cpa;
 184 symbol *pb = *(symbol **) cpb;
187
188 // If the two elements are the same, examine the next one
189 while (*pa == *pb) {
 190 ++pa;
 191 ++pb;
192 }
193
 194 // Make the comparison and return
195 if ( *pa < *pb) {
196 return -1;
197 } else {
198 return +1;
199 }
200}
201
202
203// prefixCompare
204//
205// Compare two pointers into a prefix array. We use this in the
 206// qsort function, so the inputs are pointers to pointers.
207//
 208// Return -1 if (a > b); otherwise (a < b), so return +1.
209
210int prefixCompare(const void *cpa, const void *cpb) {
211
 212 // Cast then dereference pointers to prefix array elements
 213 symbol *pa = *(symbol **) cpa;
 214 symbol *pb = *(symbol **) cpb;
217
218 // If the two elements are the same, examine the next one
219 while (*pa == *pb) {
 220 --pa;
 221 --pb;
222 }
223
 224 // Make the comparison and return
225 if ( *pa > *pb) {
226 return -1;
227 } else {
228 return +1;
229 }
230}
231
 232// pointerCompare
233//
234// Compare two pointers based on the memory location they point to.
235
236int pointerCompare( const void *pa, const void *pb ) {
237
238 symbol **a = (symbol **) pa;
239 symbol **b = (symbol **) pb;
240
241 if (*a < *b) {
242 return -1;
243 } else if (*a > *b) {
244 return 1;
245 } else {
246 return 0;
247 }
248}
249
250
251// Read the clauses.numbers file into the "symbols" array.
252//
253// Each number in the file is a symbol number; it is essential that
254// the first symbol (and no others) be COLLECTIONSTART and the last
255// symbol (and no others) be COLLECTIONEND.
256//
257// Return the number of numbers in the array.
258
259int readNumbers() {
260
261 char filename[FILENAME_MAX];
262 sprintf(filename, "%s/clauses.numbers", collection);
263 if (verbosity) {
264 cout << "Reading numbers file: " << filename << endl;
265 }
266
267 // Open the numbers file
268 ifstream inFile1(filename, ios::in);
269 if (!inFile1) {
270 cerr << "File " << filename << " could not be opened\n";
271 exit(1);
272 }
273
274 // Count the number of symbols
275 inputLength = 0;
276 symbol word;
277 while (inFile1 >> word) {
278 ++inputLength;
279 }
280 inFile1.close();
281
 282 // Allocate the symbols array
283 if (verbosity > 1) {
284 cout << "Allocating symbol arrays for " << inputLength << " symbols" << endl;
285 }
286 symbols = new symbol[inputLength];
287 if (symbols == NULL) {
288 cerr << "Suffix error: not enough memory to hold " << inputLength
289 << " symbols." << endl;
290 exit(2);
291 }
292
293 // Read the numbers file into the numbers array
294 if (verbosity > 2) {
295 cout << "Reading the numbers" << endl;
296 }
297
298 ifstream inFile2(filename, ios::in);
299 if (!inFile2) {
300 cerr << "File " << filename << " could not be opened\n";
301 exit(1);
302 }
303 cellcount next = 0;
304 numberOfDocuments = 0;
305 while (inFile2 >> word) {
306 symbols[next++] = word;
307 if (word == DOCUMENTSTART) {
308 ++numberOfDocuments;
309 }
310 }
311 inFile2.close();
312
313 // Make sure the numbers file is intact
314 assert(symbols[0] == COLLECTIONSTART);
315 assert(symbols[next-1] == COLLECTIONEND);
316
317 return inputLength;
318}
319
320
321
 322// Get document occurrence statistics
323//
324// Given a phrase, what documents does it occur in?
325
326cellcount getDocumentOccurrances(const Phrase &p, cellcount *frequency) {
327
328 // cout << "searching for \""<< p << "\" in documents "
329 // << 0 << "-" << numberOfDocuments - 1 << endl;
330
331 // The number of documents in which this phrase occurs
332 cellcount df = 0;
333
334 // Initialise the document frequency array
335 // for (cellindex i = 0; i < numberOfDocuments; ++i) {
336 // frequency[i] = 0;
337 //}
338 memset(frequency, 0, sizeof(cellcount)*numberOfDocuments);
339
340 // variables used to facilitate the search
341 cellindex begin;
342 cellindex end;
343 cellindex d;
344 symbol *target;
345 bool found;
346
 347 // search for the document in which each occurrence of the phrase is found
348 for (cellcount j = p.firstSuffixIndex; j <= p.lastSuffixIndex; ++j) {
349
350 // cout << "looking for phrase at suffixArray[" << j << "]\n";
351
352 target = suffixArray[j];
353 begin = 0;
354 end = numberOfDocuments - 1;
355 found = false;
356
 357 // Search for the occurrence of a document delimiter that target
358 // occurs immediately after.
359 // We do this by performing a binary chop search on documentArray.
360 while (!found) {
361
362 // cout << "searching for " << (cellindex) target << " in "
363 // << begin << " - " << end << endl;
364
365 assert (begin <= end);
366
367 // If the beginning and end of the interval are the same,
368 // then we've found the correct document
369 if (begin == end) {
370 if (frequency[begin] == 0) {
371 ++df;
372 }
373 ++frequency[begin];
374 found = true;
375 }
376
377 // Otherwise, examine a new document midway through the begin-end
378 // interval and see if it is the one.
379 else {
380 d = (begin + end) / 2;
381 if (target > documentArray[d]) {
 382 // If the target address is greater than this document's start, but this is
 383 // the last document, then this must be the one we want. Or, if target is
 384 // greater than this one but less than the next, this must be the one we want.
385 if ((d == numberOfDocuments - 1) || (target < documentArray[d+1])) {
386 if (frequency[d] == 0) {
387 ++df;
388 }
389 ++frequency[d];
390 found = true;
391 } else {
392 // otherwise we know to search later in the document set
393 begin = d + 1;
394 }
395 } else {
396 // search earlier in the document set
397 end = d - 1;
398 }
399 }
400 }
401 }
402 return df;
403}
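
// Illustrative sketch only (not called by the program): the binary chop
// above amounts to finding the last entry of documentArray whose start
// address precedes target. documentArray is filled in input order in
// main(), so its addresses are ascending, and a phrase occurrence never
// points at a DOCUMENTSTART symbol itself.
static cellindex findDocumentIndexSketch(symbol *target) {
  cellindex lo = 0;
  cellindex hi = numberOfDocuments - 1;
  while (lo < hi) {
    // bias the midpoint upwards so the interval always shrinks
    cellindex mid = lo + (hi - lo + 1) / 2;
    if (documentArray[mid] <= target) {
      lo = mid;       // target lies in this document or a later one
    } else {
      hi = mid - 1;   // target lies in an earlier document
    }
  }
  return lo;
}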
404
405
406
407
408
409
410// phraseExpansionMemory : Which phrases have we expanded?
411//
412// A set of utilities for keeping track of which phrases we have expanded.
413// We don't want to expand a phrase more than once, after all.
414//
415// This REALLY ought to be in its own class, but it works so that's okay.
416//
417// Phrases are identified by their firstSuffixPosition and length.
418//
419// Functions provided are:
420// void initialisePhraseMemory()
421// void rememberThisPhrase(index, length)
422// bool isPhraseStored(index, length)
423// void deletePhraseMemory()
424//
425// Internally, we will have two separate cases:
426//
427// Phrases of length 1-8:
428// unsigned char phraseMemory[inputLength]
429// is an array where each cell "remembers" the corresponding index in the
 430// suffixArray, and each of the 8 bits of the cell corresponds to the phrases
431// of length 1, 2... 8.
432// Eventually, we will make this disk-based (i.e. store the array in a file).
433//
434// Phrases of length 9+:
435// file hashTableFile
436// file listOfEntries
437// The first file is a hash table; each phrase maps to one of its cells, which
 438// contains either 0 (empty, no occurrence) or a number which is an entry number
439// in the second file. This file contains a "list" of entries. Each consists of
440// three numbers: the suffixArray index of the phrase, the length of the phrase,
441// and the entry number of the next phrase with the same hash.
442//
443
444
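// Illustrative helper (not used by the program), restating the short-phrase
// scheme described above: a phrase of length 1..8 starting at suffix index i
// is remembered by setting one bit of the byte phraseMemory[i], so e.g.
// rememberThisPhrase(i, 3) sets bit 2 and isPhraseStored(i, 3) tests it.
static inline unsigned char shortPhraseBit(cellcount length) {
  // length 1 -> bit 0, length 2 -> bit 1, ..., length 8 -> bit 7
  return (unsigned char) (1u << (length - 1));
}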
445unsigned char *phraseMemory;
446
447void initialiseLongPhraseMemory();
448void rememberThisLongPhrase(cellindex index, cellcount length);
449bool isLongPhraseStored(cellindex index, cellcount length);
450void deleteLongPhraseMemory();
451
452
453void initialisePhraseMemory() {
454
455 phraseMemory = new unsigned char[inputLength];
456
457 // to begin with, everything is empty
458 // for (cellcount i = 0; i < inputLength; ++i) {
459 // phraseMemory[i] = 0;
460 //}
461 memset(phraseMemory, 0, sizeof(char)*inputLength);
462
 463 // initialise the hashTable of long phrases
464 initialiseLongPhraseMemory();
465
466}
467
468void rememberThisPhrase(cellindex index, cellcount length) {
469
470 // if the phrase is very long, use the file-based system
471 if (length > 8) {
472 rememberThisLongPhrase(index, length);
473 return;
474 }
475
476 // create a char with just the bit corresponding to length set
477 unsigned char newbit = 1;
478 for (cellcount i = 1; i < length; ++i) {
479 newbit <<= 1;
480 }
481
482 // set that bit in the memory array at position index
483 phraseMemory[index] |= newbit;
484}
485
486
487bool isPhraseStored(cellindex index, cellcount length) {
488
489 // if the phrase is very long, use the file-based system
490 if (length > 8) {
491 return isLongPhraseStored(index, length);
492 }
493
494 // create a char with just the bit corresponding to length set
495 unsigned char newbit = 1;
496 for (cellcount i = 1; i < length; ++i) {
497 newbit <<= 1;
498 }
499
 500 // return true if that bit is set in the memory array at position index
501 return (phraseMemory[index] & newbit);
502}
503
504void deletePhraseMemory() {
505 delete []phraseMemory;
506 deleteLongPhraseMemory();
507}
508
509
510
 511// Files etc. used to store "long" equivalents of the above
512
513fstream hashTableFile;
514char hashTableFileName[FILENAME_MAX];
515fstream listOfEntries;
516char listOfEntriesName[FILENAME_MAX];
517cellindex nextEntryNumber;
518
519const cellcount bigPrime = 7919;
520
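// Illustrative only: how a long phrase is located on disk. Each (index,
// length) pair hashes to a fixed slot in hashTableFile, and each entry in
// listOfEntries is three cellindex values (next-entry number, suffix index,
// phrase length) stored at offset entryNumber * 3 * sizeof(cellindex).
static inline cellindex longPhraseHashOffset(cellindex index, cellcount length) {
  // the same slot computation used by rememberThisLongPhrase() and
  // isLongPhraseStored() below
  return ((index + length) % bigPrime) * sizeof(cellindex);
}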
521
522void initialiseLongPhraseMemory() {
523
524 cellindex example = 0;
525
526 sprintf(hashTableFileName, "%s/hashTable", collection);
527 sprintf(listOfEntriesName, "%s/hashLists", collection);
528
529
530 // create the new hashtable
531 if (verbosity > 1) {
532 cout << "Initialising hashTable: " << hashTableFileName << endl;
533 }
534 hashTableFile.open(hashTableFileName, ios::in | ios::out);
535 for (cellcount i = 0; i < bigPrime; ++i) {
536 hashTableFile.write((char *) &example, sizeof(example));
537 }
538
539 // create the list of phrases
540 if (verbosity > 1) {
541 cout << "Initialising list of hashtable entries: " << listOfEntriesName << endl;
542 }
543 listOfEntries.open(listOfEntriesName, ios::in | ios::out);
544 listOfEntries.write((char *) &example, sizeof(example));
545 listOfEntries.write((char *) &example, sizeof(example));
546 listOfEntries.write((char *) &example, sizeof(example));
547 nextEntryNumber = 1;
548}
549
550
551void rememberThisLongPhrase(cellindex index, cellcount length) {
552
553 // cout << "rememberThisLongPhrase(" << index << ", " << length << ")\n";
554
555 cellindex hashOffset = ((index + length) % bigPrime) * sizeof(cellindex);
556 cellindex pointer;
557 cellindex zero = 0;
558 cellindex readp = 0;
559 cellindex readi = 0;
560 cellindex readl = 0;
561
562 hashTableFile.seekg(hashOffset);
563 hashTableFile.read((char *) &pointer, sizeof(cellindex));
564
565 if (pointer == 0) {
566 // There is no entry at all in the hash table for this entry
567 // so create one
568
569 pointer = nextEntryNumber++;
570 hashTableFile.seekg(hashOffset);
571 hashTableFile.write((char *) &pointer, sizeof(cellindex));
572
573 listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
574 listOfEntries.write((char *) &zero, sizeof(cellindex));
575 listOfEntries.write((char *) &index, sizeof(cellindex));
576 listOfEntries.write((char *) &length, sizeof(cellindex));
577
578 } else {
579 // There is a list starting at this hash value, so the phrase may
580 // be already remembered, or it might need to be appended
581
582 while (pointer != 0) {
583 // Read the entry pointed to by pointer
584 listOfEntries.seekg(pointer * sizeof(cellindex) * 3);
585 listOfEntries.read((char *) &readp, sizeof(cellindex));
586 listOfEntries.read((char *) &readi, sizeof(cellindex));
587 listOfEntries.read((char *) &readl, sizeof(cellindex));
588
589 // cout << "read " << pointer << ", " << readp << ", " << readi << ", " << readl << endl;
590
 591 if ((readi == index) && (readl == length)) {
592 // we've found that we've already stored it
593 return;
594 } else if (readp == 0) {
 595 // we've reached the end of the list. Add a new entry.
596 listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
597 listOfEntries.write((char *) &nextEntryNumber, sizeof(cellindex));
598 pointer = nextEntryNumber++;
599
600 listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
601 listOfEntries.write((char *) &zero, sizeof(cellindex));
602 listOfEntries.write((char *) &index, sizeof(cellindex));
603 listOfEntries.write((char *) &length, sizeof(cellindex));
604 return;
605 } else {
606 // go on to the next node
607 pointer = readp;
608 }
609 }
610 }
611
612
613}
614
615
616bool isLongPhraseStored(cellindex index, cellcount length) {
617
618 // cout << "isLongPhraseExpanded(" << index << ", " << length << ")\n";
619
620 cellindex hashOffset = ((index + length) % bigPrime) * sizeof(cellindex);
621 cellindex pointer;
622 cellindex readp = 0;
623 cellindex readi = 0;
624 cellindex readl = 0;
625
626 // Find the phrase in the hashFile
627 hashTableFile.seekg(hashOffset);
628 hashTableFile.read((char *) &pointer, sizeof(cellindex));
629
630 if (pointer == 0) {
631 // There is no entry at all in the hash table for this entry
632 // so nothing is stored
633 return false;
634
635 } else {
636 // There is a list starting at this hash value, so the phrase may
637 // be already remembered in that list
638 while (pointer != 0) {
639 // Read the entry pointed to by pointer
640 listOfEntries.seekg(pointer * sizeof(cellindex) * 3);
641 listOfEntries.read((char *) &readp, sizeof(cellindex));
642 listOfEntries.read((char *) &readi, sizeof(cellindex));
643 listOfEntries.read((char *) &readl, sizeof(cellindex));
644
 645 if ((readi == index) && (readl == length)) {
646 // we've found the phrase stored here
647 return true;
648 } else {
649 // go on to the next node
650 pointer = readp;
651 }
652 }
653 }
654 return false;
655}
656
657
658void deleteLongPhraseMemory() {
659 // remove the hash & other files
660
661 hashTableFile.close();
662 listOfEntries.close();
663 remove(hashTableFileName);
664 remove(listOfEntriesName);
665
666}
667
668
669// Read the collection statistics file
670//
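// The file is a sequence of whitespace-separated "key value" pairs; the keys
// recognised below are first_stopword, last_stopword, first_contentword and
// last_contentword, each followed by a symbol number.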
671void readStatistics() {
672
673 // open the statistics file
674 char filename[FILENAME_MAX];
675 sprintf(filename, "%s/clauses.stats", collection);
676
677 // Open the file
678 ifstream inFile(filename, ios::in);
679 if (!inFile) {
680 cerr << "File " << filename << " could not be opened\n";
681 exit(1);
682 }
683
684 // Read the numbers file into the numbers array
685 char key[1000];
686 symbol value;
687 while (inFile >> key >> value){
688 if (strcmp(key, "first_stopword") == 0) {
689 firstStopSymbol = value;
690 } else if (strcmp(key, "last_stopword") == 0) {
691 lastStopSymbol = value;
692 } else if (strcmp(key, "first_contentword") == 0) {
693 firstContentSymbol = value;
694 } else if (strcmp(key, "last_contentword") == 0) {
695 lastContentSymbol = value;
696 }
697 }
698 inFile.close();
699
700 // Make sure we have the information we need
701 if (!(firstStopSymbol && lastStopSymbol && firstContentSymbol && lastContentSymbol)) {
702 cerr << "Statistics file incomplete" << endl;
703 exit(1);
704 }
705}
706
707cellcount getContentCount(symbol firstContent) {
708
709 cellcount content=0;
710 for (cellcount i=0; i<inputLength; ++i) {
711 if (symbols[i]>=firstContent) ++content;
712 }
713
714 return content;
715}
716
717
718int main (int argc, char * argv[]) {
719
720 // Command-line arguments
721 // argv[1] is the phindex directory
722 // argv[2] is the mode, where 1 is stopword mode
 723 // argv[3] is min_occurs, the minimum occurrence frequency for a phrase to be included in the hierarchy
 724 // argv[4] is optional verbosity
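 // For example (the directory name is illustrative only):
 //   suffix /path/to/collection/phindex 1 2 1
 // runs in stopword mode with a minimum phrase frequency of 2 and verbosity 1.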
725 if (argc < 4) {
726 cerr << "Usage: " << argv[0] << " phind-directory mode min-phrase-freq [verbosity]" << endl;
727 exit(1);
728 }
729
730 // collection directory
731 strcpy(collection, argv[1]);
732
733 // mode parameter
734 phraseMode = atoi(argv[2]);
735 assert((phraseMode == STOPWORDS) || (phraseMode == ANYPHRASE));
736
737 minOccurs = atoi(argv[3]);
738 assert((minOccurs > 0));
739
740 // optional verbosity parameter
741 if (argc == 5) {
742 verbosity = atoi(argv[4]);
743 assert (verbosity >= 0);
744 }
745
746 if (verbosity) {
747 cout << "suffix: the phrase extraction program" << endl;
748 }
749 if (verbosity > 1) {
750 if (phraseMode == STOPWORDS) {
751 cout << "Stopwords mode: no phrase may begin or end with a stopword" << endl;
752 } else {
753 cout << "AllPhrase mode: extract every phrase that occurs more than once" << endl;
754 }
755 }
756
757 // Read the statistics file
758 readStatistics();
759 // Read the numbers file
760 readNumbers();
761
762 if (numberOfDocuments == 0) {
763 cerr << "There are no documents in this collection!" << endl;
764 exit(0); // not necessarily an error...
765 }
766
767 symbol firstContent;
768 if (phraseMode==STOPWORDS) firstContent=firstContentSymbol;
769 else firstContent = firstStopSymbol;
770
771 // Allocate memory for the suffix & prefix arrays
772 cellcount contentLength = 0;
773 contentLength = getContentCount(firstContent);
774 suffixArray = new symbol *[contentLength];
775 prefixArray = new symbol *[contentLength];
776 if (prefixArray == NULL) {
 777 cerr << "Suffix: not enough memory to hold " << contentLength << " symbols." << endl;
778 exit(2);
779 }
780 allocateSuffixCheck(contentLength);
781 // Initialise prefix and suffix arrays, only use the needed suffixes
782 for (cellcount j = 0, here = 0; j < inputLength; ++j) {
783 if (symbols[j]>=firstContent) {
784 suffixArray[here] = &symbols[j];
785 prefixArray[here] = &symbols[j];
786 ++here;
787 }
788 }
789 qsort(suffixArray, contentLength, sizeof(symbol *), suffixCompare);
790 qsort(prefixArray, contentLength, sizeof(symbol *), prefixCompare);
791 // Create the document arrays
792 if (verbosity > 1) {
793 cout << "Allocating document arrays for " << numberOfDocuments << " documents" << endl;
794 }
795
796 // The document frequecy array is used to count the number of times
797 // each phrase occurs in each document. The number of documents in
798 // which a phrase occurs is stored in df.
799 frequency *documentFrequency = new frequency[numberOfDocuments];
800 frequency df;
801
802 // documentArray will be searched in order to discover which document
803 // each phrase occurs in.
804 documentArray = new symbol *[numberOfDocuments];
805 if (documentArray == NULL) {
806 cerr << "Suffix: out of memory allocating document arrays." << endl;
807 exit(2);
808 }
809
810 // just scan through the input text to find the doc starts
811 cellindex d = 0;
812 for (cellindex i=0; i<inputLength; ++i) {
813 if (symbols[i] == DOCUMENTSTART) {
814 documentArray[d] = &symbols[i];
815 ++d;
816 }
817 }
818
819 // the phrases stuff is expecting inputLength to be the length of the
820 // suffix array, so change it.
821 inputLength = contentLength;
822
823 // Extract phrases
824 //
 825 // We will make several passes over the data, in each case considering
 826 // a set of input phrases and generating a set of output phrases, which
 827 // we will expand in later passes.
828 //
829 // The input phrases in the first pass will be the vocabulary.
830 // In later passes, the input phrases will be the output phrases of the
831 // previous pass.
832 //
833 // In each pass we will consider each input phrase in turn. If we
834 // have seen it before, we will ignore it. Otherwise, we will expand
835 // it and add its expansions to the set of output phrases.
836
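 // In outline (an illustrative summary of the code that follows):
 //   pass 1: for each distinct symbol in the suffix array, expand it and
 //           write its expansions to outPhrase.1
 //   pass n: read outPhrase.(n-1); for each phrase not already expanded,
 //           expand it and append its new expansions to outPhrase.n
 //   stop when a pass produces no new output phrases (outPhraseCounter == 0)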
837 cout <<"\ngenerating the phrase hierarchy\n\n";
838
839 // Store the phrase data in the phrases file
840 char phraseDataName[FILENAME_MAX];
841 sprintf(phraseDataName, "%s/phrases", collection);
842 ofstream phraseData(phraseDataName, ios::out);
843 if (!phraseData) {
844 cout << "File " << phraseDataName << " could not be opened\n";
845 exit(1);
846 }
847
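 // Each line written to the phrases file below has the form
 //   start-length:phrase text:frequency:numExpansions:s1-l1,s2-l2,...:numDocs:dN[,freq];dM[,freq];...
 // where each expansion is a firstSuffixIndex-length pair and each document
 // is identified by its 1-based number, followed by its within-document
 // frequency when that frequency is greater than 1.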
848 // Count the number of phrases output
849 mg_u_long phraseCounter = 0; //unsigned long int phraseCounter = 0;
850
851 // Set up the phrase expansion memory.
852 // We need this so that we don't expand a phrase more than once
853 initialisePhraseMemory();
854
 855 // The current pass number
856 int phrasePass = 1;
857
858
859 // PASS NUMBER 1
860 if (verbosity > 1) {
861 cout << "Starting pass " << phrasePass << endl;
862 }
863
864 ofstream outPhrase;
865 char outPhraseName[FILENAME_MAX];
866 mg_u_long outPhraseCounter = 0; // unsigned long int outPhraseCounter = 0;
867
868 // On the first pass, simply work through the vocabulary
869 sprintf(outPhraseName, "%s/outPhrase.1", collection);
870 outPhrase.open(outPhraseName, ios::out);
871 if (!outPhrase) {
872 cerr << "File " << outPhraseName << " could not be opened\n";
873 exit(1);
874 }
875
876 // Iterate over the different symbols by working through the suffix array
877 vector<Phrase> result;
878 cellindex ij = 0;
879 char *tmpString;
880
881 Phrase p;
882 while (ij < inputLength) {
883
884 // make a new phrase of length 1
885 p = Phrase(suffixArray[ij], 1, SUFFIX);
886 p.findFirstAndLastSuffix(ij, inputLength-1);
887
888 // We ignore this symbol if it occurs only once, if it is a delimiter,
 889 // or if we are in stopwords mode and it is a stopword
890 // - in this new version, only need to check freq
891 // We could imagine a new mode/command-line option, which is like
 892 // STOPWORDS but without this restriction. This would let you browse
 893 // from "the" to "the AGRIS" for example, but not from "AGRIS" to
 894 // "the AGRIS" (where "the" is a stopword and AGRIS a content word).
895 // The system used to work like this; it is easy to implement, but
896 // it explodes the size of the indexes. So: would it be useful?
897 if (p.suffixFrequency >= minOccurs) {
898 // Get minimal expansions of the phrase
899 getExpansions(p, result);
900
901 if (!result.empty()) {
902
903 // Remember that we have expanded this phrase
904 rememberThisPhrase(ij, 1);
905
906 // write the phrase text
907 phraseData << ij << "-1:" << p << ":" << p.suffixFrequency << ":"
908 << result.size() << ":";
909
910 // write the results
911 for (cellcount k = 0; k < result.size(); ++k) {
912 if (k) {
913 phraseData << ",";
914 }
915 phraseData << result[k].firstSuffixIndex << "-" << result[k].length;
916 outPhrase << result[k].firstSuffixIndex << " " << result[k].length << endl;
917 ++outPhraseCounter;
918 }
919 result.clear();
920
921 // Write the documents in which this phrase occurs
922 df = getDocumentOccurrances(p, documentFrequency);
923 phraseData << ":" << df << ":";
924
925 // write the documents
926 for (cellcount m = 0, first = 1; m < numberOfDocuments; ++m) {
927 if (documentFrequency[m]) {
928 if (first) {
929 first = 0;
930 } else {
931 phraseData << ";";
932 }
933 // Output the document number. Note that here we've numbered the
934 // N documents from 0 to N-1, but later they'll be 1-N. Thus we
935 // add 1 to the document id when we output it.
936 phraseData << "d" << (m+1);
937 // Next, output the frequency with which the document occurs, but
938 // only if it is > 1.
939 if (documentFrequency[m] > 1) {
940 phraseData << "," << documentFrequency[m];
941 }
942 }
943 }
944
945 phraseData << endl;
946 ++phraseCounter;
947
948 // feedback
949 if (verbosity) {
950 if (phraseCounter % 1000 == 0) {
951 cout << "phrase " << phraseCounter << ": "
952 << "cell " << p.firstSuffixIndex << " - " << p << endl;
953 }
954 }
955 }
956 }
957 ij = p.lastSuffixIndex + 1;
958 }
959 outPhrase.close();
960
961 // REMAINING PASSES
962 // The previous outPhrase file forms the input to each new pass
963 cellcount start, length;
964 while (outPhraseCounter > 0) {
965
966 // Start a new pass
967 ++phrasePass;
968 if (verbosity) {
969 cout << "Starting pass " << phrasePass << endl;
970 }
971
972 // Open the input file
973 char inPhraseName[FILENAME_MAX];
974 sprintf(inPhraseName, "%s/outPhrase.%d", collection, phrasePass - 1);
975 ifstream inPhrase (inPhraseName, ios::in);
976 if (!inPhrase) {
977 cerr << "File " << inPhraseName << " could not be opened\n";
978 exit(1);
979 }
980
981 // Open the output file
982 sprintf(outPhraseName, "%s/outPhrase.%d", collection, phrasePass);
983 outPhrase.open(outPhraseName, ios::out);
984 if (!outPhrase) {
985 cerr << "File " << outPhraseName << " could not be opened\n";
986 exit(1);
987 }
988 outPhraseCounter = 0;
989
990 // Process each phrase
991 while(inPhrase >> start >> length) {
992
993 // Ignore the phrase if we have expanded it before
994 if (isPhraseStored(start, length))
995 continue;
996
997 // Remember that we have examined this phrase
998 rememberThisPhrase(start, length);
999
1000 // Find the phrase in the suffixarray
1001 p = Phrase(suffixArray[start], length, SUFFIX);
1002 p.findFirstAndLastSuffix(start, inputLength-1);
1003
1004 // Ignore the phrase if it only occurs once
1005 if (p.suffixFrequency < minOccurs)
1006 continue;
1007
1008 // Write the phrase text;
1009 phraseData << start << "-" << length << ":" << p << ":" << p.suffixFrequency << ":";
1010
 1011 // Expand the phrase, if it is at most 8 words long
1012 if (length <= 8) {
1013
1014 // Get the minimal expansions for this phrase
1015 getExpansions(p, result);
1016
1017 // write the results
1018 phraseData << result.size() << ":";
1019
1020 for (cellcount i = 0; i < result.size(); ++i) {
1021 if (i) {
1022 phraseData << ",";
1023 }
1024 phraseData << result[i].firstSuffixIndex << "-" << result[i].length;
1025 outPhrase << result[i].firstSuffixIndex << " " << result[i].length << endl;
1026 ++outPhraseCounter;
1027 }
1028 result.clear();
1029
1030 } else {
1031 // phrase is too long to expand further
1032 phraseData << "0:";
1033 }
1034
1035 // Write the documents in which this phrase occurs
1036 df = getDocumentOccurrances(p, documentFrequency);
1037 phraseData << ":" << df << ":";
1038
1039 // write the documents
1040 for (cellcount i = 0, first = 1; i < numberOfDocuments; ++i) {
1041 if (documentFrequency[i]) {
1042 if (first) {
1043 first = 0;
1044 } else {
1045 phraseData << ";";
1046 }
1047 // Output the document number. Note that here we've numbered the
1048 // N documents from 0 to N-1, but later they'll be 1-N. Thus we
1049 // add 1 to the document id when we output it.
1050 phraseData << "d" << (i+1);
1051 // Next, output the frequency with which the document occurs, but
1052 // only if it is > 1.
1053 if (documentFrequency[i] > 1) {
1054 phraseData << "," << documentFrequency[i];
1055 }
1056 }
1057 }
1058
1059 phraseData << endl;
1060 ++phraseCounter;
1061
1062 // feedback
1063 if (verbosity) {
1064 if (phraseCounter % 1000 == 0) {
1065 cout << "phrase " << phraseCounter << ": "<< "start " << start
1066 << ", length " << length << " - " << p << endl;
1067 }
1068 }
1069
1070 }
1071
1072 inPhrase.close();
1073 outPhrase.close();
1074 }
1075
1076 phraseData.close();
1077 deletePhraseMemory();
1078
1079 delete [] documentFrequency;
1080 delete [] symbols;
1081 delete [] suffixArray;
1082 delete [] prefixArray;
1083 delete [] suffixCheck;
1084 delete [] documentArray;
1085
1086
1087
1088 cout << endl << "Done: " << phraseCounter << " phrases in " << phraseDataName << endl;
1089 return 0;
1090}
1091
1092
1093
1094
1095