source: main/trunk/greenstone2/build-src/src/phind/generate/suffix.cpp@ 26158

Last change on this file since 26158 was 26158, checked in by ak19, 12 years ago

Bugfix. Many thanks to Anthony Blake. The phind browse classifier used to fail on certain particular combinations of documents on 64 bit linux. This was because of a bug in suffix.exe that caused a seg fault: all pointers are 64 bits on 64 bit machines. However, suffixCompare() and prefixCompare() functions used to try to convert their arguments, which were void* pointers to pointers, by first casting the void* pointers to symbol* and then dereference these to get their values (which are ultimately 32 bit unsigned int) but which are meant to be addresses (pointers themselves) and then converting these back to pointers objects which are 64 bit values. During this process of multiple conversion, 64 bit pointers were truncated to 32 bits before being given 64 bits for storage again. This used to work on 32 bit machines before since pointers there were 32 bits, but showed up here. The solution was a double pointer cast operation: since the data in the void* arguments are actually pointers to pointers, convert them directly back to the specific pointer data types we need. Many thanks to Anthony Blake who initially came to help with working out why gdb wasn't loading the symbols despite suffix.cpp being compiled up with the minus-g flag (when attempting to debug by printing out the values of the dereferenced pointers where things appeared to go wrong or at least step through the code). He ended up finding out what the bug actually was and that a conversion to double pointers was the solution necessary to get things working on 64 bit machines as well as still working on 32 bit machines. Now the phind classifier's been tested on 64 bit and 32 bit machines. And Anthony confirmed that this issue has indeed been resolved. Though it struck randomly, as only certain document combinations in collections show up the underlying error via a seg fault, he inspected the assembly instructions in gdb to see that the move operations were now correct, which they hadn't been before.

  • Property svn:keywords set to Author Date Id Revision
File size: 31.7 KB
Line 
1/**********************************************************************
2 *
3 * Suffix.cpp -- Extract the repeated phrases in the input with suffix
4 * and prefix arrays (cgn & gwp's simpler algorithm,
5 * and kjm's improvements).
6 *
7 * Copyright 2000 Gordon W. Paynter
8 * Copyright 2000 The New Zealand Digital Library Project
9 *
10 * A component of the Greenstone digital library software from the
11 * New Zealand Digital Library Project at the University of Waikato,
12 * New Zealand.
13 *
14 * This program is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU General Public License as
16 * published by the Free Software Foundation; either version 2 of
17 * the License, or (at your option) any later version.
18 *
19 * This program is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 * GNU General Public License for more details.
23 *
24 * You should have received a copy of the GNU General Public License
25 * along with this program; if not, write to the Free Software
26 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27 *
28 *********************************************************************/
29
30#include <assert.h>
31#include <math.h>
32#include <stdio.h>
33#include <stdlib.h>
34#include <string.h>
35
36#if defined(GSDL_USE_IOS_H)
37# include <fstream.h>
38# include <iostream.h>
39#else
40# include <fstream>
41# include <iostream>
42#endif
43
44#if defined(GSDL_USE_STL_H)
45# if defined(GSDL_USE_ALGO_H)
46# include <algo.h>
47# else
48# include <algorithm.h>
49# endif
50# include <vector.h>
51#else
52# include <algorithm>
53# include <vector>
54#endif
55
56#include "suffix.h"
57#include "phrase.h"
58#include "check.h"
59#include "mglong.h"
60
61// Global variables declared in suffix.h
62cellcount inputLength;
63
64symbol *symbols;
65symbol **suffixArray;
66symbol **prefixArray;
67
68// How many documents are in this collection?
69cellcount numberOfDocuments;
70symbol **documentArray;
71
72
73// Do we accept any phrase, or do we eliminate those ending with stopwords ?
74int phraseMode = ANYPHRASE; //STOPWORDS;
75// What is the minimum phrase frequency for a phrase to be included in the hierarchy
76int minOccurs = 2;
77
78// The filestem of the collection's phindex directory
79char collection[FILENAME_MAX];
80
81
82// The ranges of the stopword and content-word symbols for the collection
83symbol firstStopSymbol = 0;
84symbol lastStopSymbol = 0;
85symbol firstContentSymbol = 0;
86symbol lastContentSymbol = 0;
87
88
89// Some useful comparison functions, defined below.
90int suffixCompare(const void *, const void *);
91int prefixCompare(const void *, const void *);
92int pointerCompare(const void *, const void *);
93
94
95// Functions for implementing "phrase memory". These let us "remember"
96// each phrase that we've expanded without using too much memory.
97void initialisePhraseMemory();
98void rememberThisPhrase(cellindex index, cellcount length);
99bool isPhraseStored(cellindex index, cellcount length);
100void deletePhraseMemory();
101
102
103// how much output do we want?
104int verbosity = 1;
105
106
107// Get a phrase's expansions
108//
109// Get the set of "minimal", "maximal", non-unique expansions of a
110// phrase p, using the simpler algorithm that Craig and Gordon came up
111// with at Google.
112//
113// Returns a vector of Expansions.
114
115void getExpansions(Phrase &p, vector<Phrase> &results) {
116
117 // 1. Get the initial candidates
118 vector<Phrase> candidates;
119 p.initialSuffixCandidates(candidates);
120 int suffcands = candidates.size();
121 p.initialPrefixCandidates(candidates);
122
123 if (candidates.size() == 0)
124 return;
125
126 vector<Phrase>::iterator i;
127 for (i = candidates.begin(); i != candidates.end(), suffcands>0; ++i, --suffcands) {
128 i->expandWhileUniquePrefixExtension();
129 i->ensureSuffixFound();
130 }
131 for (i; i != candidates.end(); ++i) {
132 i->expandWhileUniqueSuffixExtension();
133 }
134
135 // 3. Ensure minimality: ignore phrases whose subphrases are also found
136 results.clear();
137
138 // Initialise the candidates, check array, and various variables.
139 sort(candidates.begin(), candidates.end(), isShorter);
140 unsigned minimum_length = candidates.begin()->length;
141 clearSuffixCheck();
142
143 // Try to add each candidate to the results set, ignoring the non-minimal
144 for (vector<Phrase>::iterator candidate = candidates.begin();
145 candidate != candidates.end(); ++candidate) {
146
147 // Make a copy of candidate to mutilate while performing sub-phrase checks
148 Phrase temp_phrase(*candidate);
149 bool shorter_found = false;
150
151 // Check for shorter and shorter versions of the temporary phrase
152 while (temp_phrase.length >= minimum_length && !shorter_found) {
153 temp_phrase.ensureSuffixFound();
154 if (getSuffixCheck(temp_phrase.firstSuffixIndex)==0)
155 temp_phrase.shortenByOneAtPrefix();
156 else
157 shorter_found = true;
158
159 // Possible efficiency here: we can finish if the prefix of c
160 // and temp_phrase are the same for candidate->length symbols.
161 }
162
163 // If no shorter phrase is found, use this one
164 if (!shorter_found) {
165 results.push_back(*candidate);
166 candidate->ensureSuffixFound();
167 setSuffixCheck(candidate->firstSuffixIndex, candidate->lastSuffixIndex);
168 }
169 }
170}
171
172
173// suffixCompare
174//
175// Compare two pointers into a suffix array. We use this in the
176// qsort function, so the input are pointers to pointers.
177//
178// Return -1 if (a < b), otherwise (a > b) so return +1,
179
180int suffixCompare(const void *cpa, const void *cpb) {
181
182 // The following is a bug and so at times results in a segmentation fault on 64 bit linux machines.
183 // The bug is in the 3rd and 4th lines, which perform identical operations. Considering the 3rd line:
184 // The *pa part of this statement truncates pa, the 64 bit pointer (symbol*), with the 32 bit value
185 // symbol (unsigned int) that it's being dereferenced to, before trying to reconvert it to a pointer
186 // which is always 64 bit on 64 bit machines. The 4th line has the same problem and prefixCompare() too.
187
188 // WRONG: Cast then dereference pointers to suffix array elements
189 //symbol *pa = (symbol *) cpa;
190 //symbol *pb = (symbol *) cpb;
191 //pa = (symbol *) *pa;
192 //pb = (symbol *) *pb;
193
194 // This is the correct way that will work on both 64 bit and 32 bit machines:
195 // the 2 void* arguments to this function, cpa and cpb, are "pointers to elements" as per
196 // http://www.cplusplus.com/reference/clibrary/cstdlib/qsort/. The array elements themselves are also
197 // *pointers* themselves: to symbol values. As the comment to this function says: cpa and cpb are
198 // pointers to pointers, making them double pointers. When we cast the void* cpa and cpb arguments to
199 // the types we need to work with here, they therefore need to be cast as double pointers to symbol.
200 // We declare pa and pb as symbol pointers (symbol*) to the actual values being compared, therefore pa
201 // and pb need to be instantiated as the address of the values referenced by cpa and cpb.
202 symbol *pa = *((symbol **) cpa);
203 symbol *pb = *((symbol **) cpb);
204
205
206 // If the two elements are the same, examine the next one
207 while (*pa == *pb) {
208 *pa++;
209 *pb++;
210 }
211
212 // Make the copmparison and return
213 if ( *pa < *pb) {
214 return -1;
215 } else {
216 return +1;
217 }
218}
219
220
221// prefixCompare
222//
223// Compare two pointers into a prefix array. We use this in the
224// qsort function, so the input are pointers to pointers.
225//
226// Return -1 if (a > b), otherwise (a < b) so return +1,
227
228int prefixCompare(const void *cpa, const void *cpb) {
229
230 // Cast and dereference double pointers (pointers to prefix array elements)
231 // See comments in suffixCompare()
232 symbol *pa = *((symbol **) cpa);
233 symbol *pb = *((symbol **) cpb);
234
235 // If the two elements are the same, examine the next one
236 while (*pa == *pb) {
237 *pa--;
238 *pb--;
239 }
240
241 // Make the comparison and return
242 if ( *pa > *pb) {
243 return -1;
244 } else {
245 return +1;
246 }
247}
248
249// simpleCompare
250//
251// Compare two pointers based on the memory location they point to.
252
253int pointerCompare( const void *pa, const void *pb ) {
254
255 symbol **a = (symbol **) pa;
256 symbol **b = (symbol **) pb;
257
258 if (*a < *b) {
259 return -1;
260 } else if (*a > *b) {
261 return 1;
262 } else {
263 return 0;
264 }
265}
266
267
268// Read the clauses.numbers file into the "symbols" array.
269//
270// Each number in the file is a symbol number; it is essential that
271// the first symbol (and no others) be COLLECTIONSTART and the last
272// symbol (and no others) be COLLECTIONEND.
273//
274// Return the number of numbers in the array.
275
276int readNumbers() {
277
278 char filename[FILENAME_MAX];
279 sprintf(filename, "%s/clauses.numbers", collection);
280 if (verbosity) {
281 cout << "Reading numbers file: " << filename << endl;
282 }
283
284 // Open the numbers file
285 ifstream inFile1(filename, ios::in);
286 if (!inFile1) {
287 cerr << "File " << filename << " could not be opened\n";
288 exit(1);
289 }
290
291 // Count the number of symbols
292 inputLength = 0;
293 symbol word;
294 while (inFile1 >> word) {
295 ++inputLength;
296 }
297 inFile1.close();
298
299 // Allocate the symbbols array
300 if (verbosity > 1) {
301 cout << "Allocating symbol arrays for " << inputLength << " symbols" << endl;
302 }
303 symbols = new symbol[inputLength];
304 if (symbols == NULL) {
305 cerr << "Suffix error: not enough memory to hold " << inputLength
306 << " symbols." << endl;
307 exit(2);
308 }
309
310 // Read the numbers file into the numbers array
311 if (verbosity > 2) {
312 cout << "Reading the numbers" << endl;
313 }
314
315 ifstream inFile2(filename, ios::in);
316 if (!inFile2) {
317 cerr << "File " << filename << " could not be opened\n";
318 exit(1);
319 }
320 cellcount next = 0;
321 numberOfDocuments = 0;
322 while (inFile2 >> word) {
323 symbols[next++] = word;
324 if (word == DOCUMENTSTART) {
325 ++numberOfDocuments;
326 }
327 }
328 inFile2.close();
329
330 // Make sure the numbers file is intact
331 assert(symbols[0] == COLLECTIONSTART);
332 assert(symbols[next-1] == COLLECTIONEND);
333
334 return inputLength;
335}
336
337
338
339// Get Document Occurrance statistics
340//
341// Given a phrase, what documents does it occur in?
342
343cellcount getDocumentOccurrances(const Phrase &p, cellcount *frequency) {
344
345 // cout << "searching for \""<< p << "\" in documents "
346 // << 0 << "-" << numberOfDocuments - 1 << endl;
347
348 // The number of documents in which this phrase occurs
349 cellcount df = 0;
350
351 // Initialise the document frequency array
352 // for (cellindex i = 0; i < numberOfDocuments; ++i) {
353 // frequency[i] = 0;
354 //}
355 memset(frequency, 0, sizeof(cellcount)*numberOfDocuments);
356
357 // variables used to facilitate the search
358 cellindex begin;
359 cellindex end;
360 cellindex d;
361 symbol *target;
362 bool found;
363
364 // search for the document in which each occurence of the phrase is found
365 for (cellcount j = p.firstSuffixIndex; j <= p.lastSuffixIndex; ++j) {
366
367 // cout << "looking for phrase at suffixArray[" << j << "]\n";
368
369 target = suffixArray[j];
370 begin = 0;
371 end = numberOfDocuments - 1;
372 found = false;
373
374 // Search for the occurence of a document delimiter that target
375 // occurs immediately after.
376 // We do this by performing a binary chop search on documentArray.
377 while (!found) {
378
379 // cout << "searching for " << (cellindex) target << " in "
380 // << begin << " - " << end << endl;
381
382 assert (begin <= end);
383
384 // If the beginning and end of the interval are the same,
385 // then we've found the correct document
386 if (begin == end) {
387 if (frequency[begin] == 0) {
388 ++df;
389 }
390 ++frequency[begin];
391 found = true;
392 }
393
394 // Otherwise, examine a new document midway through the begin-end
395 // interval and see if it is the one.
396 else {
397 d = (begin + end) / 2;
398 if (target > documentArray[d]) {
399 // If target addrss is greater than this, but thisi sthe last document,
400 // then this must be the one we want. Or, if target is greater than
401 // this one but less then the next, this must be the one we wnat.
402 if ((d == numberOfDocuments - 1) || (target < documentArray[d+1])) {
403 if (frequency[d] == 0) {
404 ++df;
405 }
406 ++frequency[d];
407 found = true;
408 } else {
409 // otherwise we know to search later in the document set
410 begin = d + 1;
411 }
412 } else {
413 // search earlier in the document set
414 end = d - 1;
415 }
416 }
417 }
418 }
419 return df;
420}
421
422
423
424
425
426
427// phraseExpansionMemory : Which phrases have we expanded?
428//
429// A set of utilities for keeping track of which phrases we have expanded.
430// We don't want to expand a phrase more than once, after all.
431//
432// This REALLY ought to be in its own class, but it works so that's okay.
433//
434// Phrases are identified by their firstSuffixPosition and length.
435//
436// Functions provided are:
437// void initialisePhraseMemory()
438// void rememberThisPhrase(index, length)
439// bool isPhraseStored(index, length)
440// void deletePhraseMemory()
441//
442// Internally, we will have two separate cases:
443//
444// Phrases of length 1-8:
445// unsigned char phraseMemory[inputLength]
446// is an array where each cell "remembers" the corresponding index in the
447// suffixArray, and each of the 8 bits of the cell correspond to the phrases
448// of length 1, 2... 8.
449// Eventually, we will make this disk-based (i.e. store the array in a file).
450//
451// Phrases of length 9+:
452// file hashTableFile
453// file listOfEntries
454// The first file is a hash table; each phrase maps to one of its cells, which
455// contains either 0 (empty, no occurence) or a number which is an entry number
456// in the second file. This file contains a "list" of entries. Each consists of
457// three numbers: the suffixArray index of the phrase, the length of the phrase,
458// and the entry number of the next phrase with the same hash.
459//
460
461
462unsigned char *phraseMemory;
463
464void initialiseLongPhraseMemory();
465void rememberThisLongPhrase(cellindex index, cellcount length);
466bool isLongPhraseStored(cellindex index, cellcount length);
467void deleteLongPhraseMemory();
468
469
470void initialisePhraseMemory() {
471
472 phraseMemory = new unsigned char[inputLength];
473
474 // to begin with, everything is empty
475 // for (cellcount i = 0; i < inputLength; ++i) {
476 // phraseMemory[i] = 0;
477 //}
478 memset(phraseMemory, 0, sizeof(char)*inputLength);
479
480 // intialise the hashTable of long phrases
481 initialiseLongPhraseMemory();
482
483}
484
485void rememberThisPhrase(cellindex index, cellcount length) {
486
487 // if the phrase is very long, use the file-based system
488 if (length > 8) {
489 rememberThisLongPhrase(index, length);
490 return;
491 }
492
493 // create a char with just the bit corresponding to length set
494 unsigned char newbit = 1;
495 for (cellcount i = 1; i < length; ++i) {
496 newbit <<= 1;
497 }
498
499 // set that bit in the memory array at position index
500 phraseMemory[index] |= newbit;
501}
502
503
504bool isPhraseStored(cellindex index, cellcount length) {
505
506 // if the phrase is very long, use the file-based system
507 if (length > 8) {
508 return isLongPhraseStored(index, length);
509 }
510
511 // create a char with just the bit corresponding to length set
512 unsigned char newbit = 1;
513 for (cellcount i = 1; i < length; ++i) {
514 newbit <<= 1;
515 }
516
517 // retrurn true if that bit is set in memory arrayat position index
518 return (phraseMemory[index] & newbit);
519}
520
521void deletePhraseMemory() {
522 delete []phraseMemory;
523 deleteLongPhraseMemory();
524}
525
526
527
528// Files etc used to store "long" equavlents of the above
529
530fstream hashTableFile;
531char hashTableFileName[FILENAME_MAX];
532fstream listOfEntries;
533char listOfEntriesName[FILENAME_MAX];
534cellindex nextEntryNumber;
535
536const cellcount bigPrime = 7919;
537
538
539void initialiseLongPhraseMemory() {
540
541 cellindex example = 0;
542
543 sprintf(hashTableFileName, "%s/hashTable", collection);
544 sprintf(listOfEntriesName, "%s/hashLists", collection);
545
546
547 // create the new hashtable
548 if (verbosity > 1) {
549 cout << "Initialising hashTable: " << hashTableFileName << endl;
550 }
551 hashTableFile.open(hashTableFileName, ios::in | ios::out);
552 for (cellcount i = 0; i < bigPrime; ++i) {
553 hashTableFile.write((char *) &example, sizeof(example));
554 }
555
556 // create the list of phrases
557 if (verbosity > 1) {
558 cout << "Initialising list of hashtable entries: " << listOfEntriesName << endl;
559 }
560 listOfEntries.open(listOfEntriesName, ios::in | ios::out);
561 listOfEntries.write((char *) &example, sizeof(example));
562 listOfEntries.write((char *) &example, sizeof(example));
563 listOfEntries.write((char *) &example, sizeof(example));
564 nextEntryNumber = 1;
565}
566
567
568void rememberThisLongPhrase(cellindex index, cellcount length) {
569
570 // cout << "rememberThisLongPhrase(" << index << ", " << length << ")\n";
571
572 cellindex hashOffset = ((index + length) % bigPrime) * sizeof(cellindex);
573 cellindex pointer;
574 cellindex zero = 0;
575 cellindex readp = 0;
576 cellindex readi = 0;
577 cellindex readl = 0;
578
579 hashTableFile.seekg(hashOffset);
580 hashTableFile.read((char *) &pointer, sizeof(cellindex));
581
582 if (pointer == 0) {
583 // There is no entry at all in the hash table for this entry
584 // so create one
585
586 pointer = nextEntryNumber++;
587 hashTableFile.seekg(hashOffset);
588 hashTableFile.write((char *) &pointer, sizeof(cellindex));
589
590 listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
591 listOfEntries.write((char *) &zero, sizeof(cellindex));
592 listOfEntries.write((char *) &index, sizeof(cellindex));
593 listOfEntries.write((char *) &length, sizeof(cellindex));
594
595 } else {
596 // There is a list starting at this hash value, so the phrase may
597 // be already remembered, or it might need to be appended
598
599 while (pointer != 0) {
600 // Read the entry pointed to by pointer
601 listOfEntries.seekg(pointer * sizeof(cellindex) * 3);
602 listOfEntries.read((char *) &readp, sizeof(cellindex));
603 listOfEntries.read((char *) &readi, sizeof(cellindex));
604 listOfEntries.read((char *) &readl, sizeof(cellindex));
605
606 // cout << "read " << pointer << ", " << readp << ", " << readi << ", " << readl << endl;
607
608 if ((readi == index) && (readl = length)) {
609 // we've found that we've already stored it
610 return;
611 } else if (readp == 0) {
612 // we're reached the end of the list. Add a new entry.
613 listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
614 listOfEntries.write((char *) &nextEntryNumber, sizeof(cellindex));
615 pointer = nextEntryNumber++;
616
617 listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
618 listOfEntries.write((char *) &zero, sizeof(cellindex));
619 listOfEntries.write((char *) &index, sizeof(cellindex));
620 listOfEntries.write((char *) &length, sizeof(cellindex));
621 return;
622 } else {
623 // go on to the next node
624 pointer = readp;
625 }
626 }
627 }
628
629
630}
631
632
633bool isLongPhraseStored(cellindex index, cellcount length) {
634
635 // cout << "isLongPhraseExpanded(" << index << ", " << length << ")\n";
636
637 cellindex hashOffset = ((index + length) % bigPrime) * sizeof(cellindex);
638 cellindex pointer;
639 cellindex readp = 0;
640 cellindex readi = 0;
641 cellindex readl = 0;
642
643 // Find the phrase in the hashFile
644 hashTableFile.seekg(hashOffset);
645 hashTableFile.read((char *) &pointer, sizeof(cellindex));
646
647 if (pointer == 0) {
648 // There is no entry at all in the hash table for this entry
649 // so nothing is stored
650 return false;
651
652 } else {
653 // There is a list starting at this hash value, so the phrase may
654 // be already remembered in that list
655 while (pointer != 0) {
656 // Read the entry pointed to by pointer
657 listOfEntries.seekg(pointer * sizeof(cellindex) * 3);
658 listOfEntries.read((char *) &readp, sizeof(cellindex));
659 listOfEntries.read((char *) &readi, sizeof(cellindex));
660 listOfEntries.read((char *) &readl, sizeof(cellindex));
661
662 if ((readi == index) && (readl = length)) {
663 // we've found the phrase stored here
664 return true;
665 } else {
666 // go on to the next node
667 pointer = readp;
668 }
669 }
670 }
671 return false;
672}
673
674
675void deleteLongPhraseMemory() {
676 // remove the hash & other files
677
678 hashTableFile.close();
679 listOfEntries.close();
680 remove(hashTableFileName);
681 remove(listOfEntriesName);
682
683}
684
685
686// Read the collection statistics file
687//
688void readStatistics() {
689
690 // open the statistics file
691 char filename[FILENAME_MAX];
692 sprintf(filename, "%s/clauses.stats", collection);
693
694 // Open the file
695 ifstream inFile(filename, ios::in);
696 if (!inFile) {
697 cerr << "File " << filename << " could not be opened\n";
698 exit(1);
699 }
700
701 // Read the numbers file into the numbers array
702 char key[1000];
703 symbol value;
704 while (inFile >> key >> value){
705 if (strcmp(key, "first_stopword") == 0) {
706 firstStopSymbol = value;
707 } else if (strcmp(key, "last_stopword") == 0) {
708 lastStopSymbol = value;
709 } else if (strcmp(key, "first_contentword") == 0) {
710 firstContentSymbol = value;
711 } else if (strcmp(key, "last_contentword") == 0) {
712 lastContentSymbol = value;
713 }
714 }
715 inFile.close();
716
717 // Make sure we have the information we need
718 if (!(firstStopSymbol && lastStopSymbol && firstContentSymbol && lastContentSymbol)) {
719 cerr << "Statistics file incomplete" << endl;
720 exit(1);
721 }
722}
723
724cellcount getContentCount(symbol firstContent) {
725
726 cellcount content=0;
727 for (cellcount i=0; i<inputLength; ++i) {
728 if (symbols[i]>=firstContent) ++content;
729 }
730
731 return content;
732}
733
734
735int main (int argc, char * argv[]) {
736
737 // Command-line arguments
738 // argv[1] is the phindex directory
739 // argv[2] is the mode, where 1 is stopword mode
740 // argv[3] is the min_occurs, - minimum occurrence frequency for a phrase to be included in the hierarchy
741 // argv[4] is opitonal verbosity
742 if (argc < 4) {
743 cerr << "Usage: " << argv[0] << " phind-directory mode min-phrase-freq [verbosity]" << endl;
744 exit(1);
745 }
746
747 // collection directory
748 strcpy(collection, argv[1]);
749
750 // mode parameter
751 phraseMode = atoi(argv[2]);
752 assert((phraseMode == STOPWORDS) || (phraseMode == ANYPHRASE));
753
754 minOccurs = atoi(argv[3]);
755 assert((minOccurs > 0));
756
757 // optional verbosity parameter
758 if (argc == 5) {
759 verbosity = atoi(argv[4]);
760 assert (verbosity >= 0);
761 }
762
763 if (verbosity) {
764 cout << "suffix: the phrase extraction program" << endl;
765 }
766 if (verbosity > 1) {
767 if (phraseMode == STOPWORDS) {
768 cout << "Stopwords mode: no phrase may begin or end with a stopword" << endl;
769 } else {
770 cout << "AllPhrase mode: extract every phrase that occurs more than once" << endl;
771 }
772 }
773
774 // Read the statistics file
775 readStatistics();
776 // Read the numbers file
777 readNumbers();
778
779 if (numberOfDocuments == 0) {
780 cerr << "There are no documents in this collection!" << endl;
781 exit(0); // not necessarily an error...
782 }
783
784 symbol firstContent;
785 if (phraseMode==STOPWORDS) firstContent=firstContentSymbol;
786 else firstContent = firstStopSymbol;
787
788 // Allocate memory for the suffix & prefix arrays
789 cellcount contentLength = 0;
790 contentLength = getContentCount(firstContent);
791 suffixArray = new symbol *[contentLength];
792 prefixArray = new symbol *[contentLength];
793 if (prefixArray == NULL) {
794 cerr << "Suffix: not enough memory to hold " << inputLength << " symbols." << endl;
795 exit(2);
796 }
797 allocateSuffixCheck(contentLength);
798 // Initialise prefix and suffix arrays, only use the needed suffixes
799 for (cellcount j = 0, here = 0; j < inputLength; ++j) {
800 if (symbols[j]>=firstContent) {
801 suffixArray[here] = &symbols[j];
802 prefixArray[here] = &symbols[j];
803 ++here;
804 }
805 }
806 qsort(suffixArray, contentLength, sizeof(symbol *), suffixCompare);
807 qsort(prefixArray, contentLength, sizeof(symbol *), prefixCompare);
808 // Create the document arrays
809 if (verbosity > 1) {
810 cout << "Allocating document arrays for " << numberOfDocuments << " documents" << endl;
811 }
812
813 // The document frequecy array is used to count the number of times
814 // each phrase occurs in each document. The number of documents in
815 // which a phrase occurs is stored in df.
816 frequency *documentFrequency = new frequency[numberOfDocuments];
817 frequency df;
818
819 // documentArray will be searched in order to discover which document
820 // each phrase occurs in.
821 documentArray = new symbol *[numberOfDocuments];
822 if (documentArray == NULL) {
823 cerr << "Suffix: out of memory allocating document arrays." << endl;
824 exit(2);
825 }
826
827 // just scan through the input text to find the doc starts
828 cellindex d = 0;
829 for (cellindex i=0; i<inputLength; ++i) {
830 if (symbols[i] == DOCUMENTSTART) {
831 documentArray[d] = &symbols[i];
832 ++d;
833 }
834 }
835
836 // the phrases stuff is expecting inputLength to be the length of the
837 // suffix array, so change it.
838 inputLength = contentLength;
839
840 // Extract phrases
841 //
842 // We will make several passesover the data, in each case considering
843 // a set of input phrases and generating a set of output phrases, which
844 // we will expancd in later passes.
845 //
846 // The input phrases in the first pass will be the vocabulary.
847 // In later passes, the input phrases will be the output phrases of the
848 // previous pass.
849 //
850 // In each pass we will consider each input phrase in turn. If we
851 // have seen it before, we will ignore it. Otherwise, we will expand
852 // it and add its expansions to the set of output phrases.
853
854 cout <<"\ngenerating the phrase hierarchy\n\n";
855
856 // Store the phrase data in the phrases file
857 char phraseDataName[FILENAME_MAX];
858 sprintf(phraseDataName, "%s/phrases", collection);
859 ofstream phraseData(phraseDataName, ios::out);
860 if (!phraseData) {
861 cout << "File " << phraseDataName << " could not be opened\n";
862 exit(1);
863 }
864
865 // Count the number of phrases output
866 mg_u_long phraseCounter = 0; //unsigned long int phraseCounter = 0;
867
868 // Set up the phrase expansion memory.
869 // We need this so that we don't expand a phrase more than once
870 initialisePhraseMemory();
871
872 // The current pass numebr
873 int phrasePass = 1;
874
875
876 // PASS NUMBER 1
877 if (verbosity > 1) {
878 cout << "Starting pass " << phrasePass << endl;
879 }
880
881 ofstream outPhrase;
882 char outPhraseName[FILENAME_MAX];
883 mg_u_long outPhraseCounter = 0; // unsigned long int outPhraseCounter = 0;
884
885 // On the first pass, simply work through the vocabulary
886 sprintf(outPhraseName, "%s/outPhrase.1", collection);
887 outPhrase.open(outPhraseName, ios::out);
888 if (!outPhrase) {
889 cerr << "File " << outPhraseName << " could not be opened\n";
890 exit(1);
891 }
892
893 // Iterate over the different symbols by working through the suffix array
894 vector<Phrase> result;
895 cellindex ij = 0;
896 char *tmpString;
897
898 Phrase p;
899 while (ij < inputLength) {
900
901 // make a new phrase of length 1
902 p = Phrase(suffixArray[ij], 1, SUFFIX);
903 p.findFirstAndLastSuffix(ij, inputLength-1);
904
905 // We ignore this symbol if it occurs only once, if it is a delimiter,
906 // of if we are in stopwords mode and it is a stopword
907 // - in this new version, only need to check freq
908 // We could imagine a new mode/command-line option, which is like
909 // STOPWORDS but without this restrictrion. This would let you browse
910 // from "the" to "the AGRIS" for example, but not from "AGRIS" to
911 // "the AGRIS" (where the is a stopword and AGRIS a content word).
912 // The system used to work like this; it is easy to implement, but
913 // it explodes the size of the indexes. So: would it be useful?
914 if (p.suffixFrequency >= minOccurs) {
915 // Get minimal expansions of the phrase
916 getExpansions(p, result);
917
918 if (!result.empty()) {
919
920 // Remember that we have expanded this phrase
921 rememberThisPhrase(ij, 1);
922
923 // write the phrase text
924 phraseData << ij << "-1:" << p << ":" << p.suffixFrequency << ":"
925 << result.size() << ":";
926
927 // write the results
928 for (cellcount k = 0; k < result.size(); ++k) {
929 if (k) {
930 phraseData << ",";
931 }
932 phraseData << result[k].firstSuffixIndex << "-" << result[k].length;
933 outPhrase << result[k].firstSuffixIndex << " " << result[k].length << endl;
934 ++outPhraseCounter;
935 }
936 result.clear();
937
938 // Write the documents in which this phrase occurs
939 df = getDocumentOccurrances(p, documentFrequency);
940 phraseData << ":" << df << ":";
941
942 // write the documents
943 for (cellcount m = 0, first = 1; m < numberOfDocuments; ++m) {
944 if (documentFrequency[m]) {
945 if (first) {
946 first = 0;
947 } else {
948 phraseData << ";";
949 }
950 // Output the document number. Note that here we've numbered the
951 // N documents from 0 to N-1, but later they'll be 1-N. Thus we
952 // add 1 to the document id when we output it.
953 phraseData << "d" << (m+1);
954 // Next, output the frequency with which the document occurs, but
955 // only if it is > 1.
956 if (documentFrequency[m] > 1) {
957 phraseData << "," << documentFrequency[m];
958 }
959 }
960 }
961
962 phraseData << endl;
963 ++phraseCounter;
964
965 // feedback
966 if (verbosity) {
967 if (phraseCounter % 1000 == 0) {
968 cout << "phrase " << phraseCounter << ": "
969 << "cell " << p.firstSuffixIndex << " - " << p << endl;
970 }
971 }
972 }
973 }
974 ij = p.lastSuffixIndex + 1;
975 }
976 outPhrase.close();
977
978 // REMAINING PASSES
979 // The previous outPhrase file forms the input to each new pass
980 cellcount start, length;
981 while (outPhraseCounter > 0) {
982
983 // Start a new pass
984 ++phrasePass;
985 if (verbosity) {
986 cout << "Starting pass " << phrasePass << endl;
987 }
988
989 // Open the input file
990 char inPhraseName[FILENAME_MAX];
991 sprintf(inPhraseName, "%s/outPhrase.%d", collection, phrasePass - 1);
992 ifstream inPhrase (inPhraseName, ios::in);
993 if (!inPhrase) {
994 cerr << "File " << inPhraseName << " could not be opened\n";
995 exit(1);
996 }
997
998 // Open the output file
999 sprintf(outPhraseName, "%s/outPhrase.%d", collection, phrasePass);
1000 outPhrase.open(outPhraseName, ios::out);
1001 if (!outPhrase) {
1002 cerr << "File " << outPhraseName << " could not be opened\n";
1003 exit(1);
1004 }
1005 outPhraseCounter = 0;
1006
1007 // Process each phrase
1008 while(inPhrase >> start >> length) {
1009
1010 // Ignore the phrase if we have expanded it before
1011 if (isPhraseStored(start, length))
1012 continue;
1013
1014 // Remember that we have examined this phrase
1015 rememberThisPhrase(start, length);
1016
1017 // Find the phrase in the suffixarray
1018 p = Phrase(suffixArray[start], length, SUFFIX);
1019 p.findFirstAndLastSuffix(start, inputLength-1);
1020
1021 // Ignore the phrase if it only occurs once
1022 if (p.suffixFrequency < minOccurs)
1023 continue;
1024
1025 // Write the phrase text;
1026 phraseData << start << "-" << length << ":" << p << ":" << p.suffixFrequency << ":";
1027
1028 // Expand the phrase, if it is fewer than 8 words long
1029 if (length <= 8) {
1030
1031 // Get the minimal expansions for this phrase
1032 getExpansions(p, result);
1033
1034 // write the results
1035 phraseData << result.size() << ":";
1036
1037 for (cellcount i = 0; i < result.size(); ++i) {
1038 if (i) {
1039 phraseData << ",";
1040 }
1041 phraseData << result[i].firstSuffixIndex << "-" << result[i].length;
1042 outPhrase << result[i].firstSuffixIndex << " " << result[i].length << endl;
1043 ++outPhraseCounter;
1044 }
1045 result.clear();
1046
1047 } else {
1048 // phrase is too long to expand further
1049 phraseData << "0:";
1050 }
1051
1052 // Write the documents in which this phrase occurs
1053 df = getDocumentOccurrances(p, documentFrequency);
1054 phraseData << ":" << df << ":";
1055
1056 // write the documents
1057 for (cellcount i = 0, first = 1; i < numberOfDocuments; ++i) {
1058 if (documentFrequency[i]) {
1059 if (first) {
1060 first = 0;
1061 } else {
1062 phraseData << ";";
1063 }
1064 // Output the document number. Note that here we've numbered the
1065 // N documents from 0 to N-1, but later they'll be 1-N. Thus we
1066 // add 1 to the document id when we output it.
1067 phraseData << "d" << (i+1);
1068 // Next, output the frequency with which the document occurs, but
1069 // only if it is > 1.
1070 if (documentFrequency[i] > 1) {
1071 phraseData << "," << documentFrequency[i];
1072 }
1073 }
1074 }
1075
1076 phraseData << endl;
1077 ++phraseCounter;
1078
1079 // feedback
1080 if (verbosity) {
1081 if (phraseCounter % 1000 == 0) {
1082 cout << "phrase " << phraseCounter << ": "<< "start " << start
1083 << ", length " << length << " - " << p << endl;
1084 }
1085 }
1086
1087 }
1088
1089 inPhrase.close();
1090 outPhrase.close();
1091 }
1092
1093 phraseData.close();
1094 deletePhraseMemory();
1095
1096 delete [] documentFrequency;
1097 delete [] symbols;
1098 delete [] suffixArray;
1099 delete [] prefixArray;
1100 delete [] suffixCheck;
1101 delete [] documentArray;
1102
1103
1104
1105 cout << endl << "Done: " << phraseCounter << " phrases in " << phraseDataName << endl;
1106 return 0;
1107}
1108
1109
1110
1111
1112
Note: See TracBrowser for help on using the repository browser.