source: trunk/gsdl/src/phind/generate/suffix.cpp@2498

Last change on this file since 2498 was 2498, checked in by sjboddie, 23 years ago

A couple of small changes to phind's generate code to get it working
under windows 95

1/**********************************************************************
2 *
3 * suffix.cpp -- Extract the repeated phrases in the input using
4 * suffix and prefix arrays.
5 *
6 * Copyright 2000 Gordon W. Paynter
7 * Copyright 2000 The New Zealand Digital Library Project
8 *
9 * A component of the Greenstone digital library software
10 * from the New Zealand Digital Library Project at the
11 * University of Waikato, New Zealand.
12 *
13 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version 2 of the License, or
16 * (at your option) any later version.
17 *
18 * This program is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU General Public License for more details.
22 *
23 * You should have received a copy of the GNU General Public License
24 * along with this program; if not, write to the Free Software
25 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26 *
27 *********************************************************************/
28
29#include <assert.h>
30#include <math.h>
31#include <stdio.h>
32#include <stdlib.h>
33#include <string.h>
34
35#if defined(GSDL_USE_IOS_H)
36# include <fstream.h>
37# include <iostream.h>
38#else
39# include <fstream>
40# include <iostream>
41#endif
42
43#if defined(GSDL_USE_STL_H)
44# if defined(GSDL_USE_ALGO_H)
45# include <algo.h>
46# else
47# include <algorithm.h>
48# endif
49# include <vector.h>
50#else
51# include <algorithm>
52# include <vector>
53#endif
54
55#include "suffix.h"
56#include "phrase.h"
57
58// Global variables declared in suffix.h
59cellcount inputLength;
60
61symbol *symbols;
62symbol **suffixArray;
63check *suffixCheck;
64symbol **prefixArray;
65check *prefixCheck;
66
67
68// How many documents are in this collection?
69cellcount numberOfDocuments;
70symbol **documentArray;
71
72// Do we accept any phrase, or do we eliminate those that begin or end with a stopword?
73int phraseMode = ANYPHRASE; //STOPWORDS;
74
75// The filestem of the collection's phindex directory
76char collection[FILENAME_MAX];
77
78int suffixCompare(const void *, const void *);
79int prefixCompare(const void *, const void *);
80int pointerCompare(const void *, const void *);
81
82int readNumbers();
83void readStatistics();
84
85void getMinimalExpansions(Phrase &p, vector<Phrase> &results);
86cellcount getDocumentOccurrances(Phrase &p, cellcount *frequency);
87
88// The ranges of the stopword and content-word symbols for the collection
89symbol firstStopSymbol = 0;
90symbol lastStopSymbol = 0;
91symbol firstContentSymbol = 0;
92symbol lastContentSymbol = 0;
93
94
95
96
97// Phrase memory
98// We have to "remember" each phrase that we've expanded
99void initialisePhraseMemory();
100void rememberThisPhrase(cellindex index, cellcount length);
101bool isPhraseStored(cellindex index, cellcount length);
102void deletePhraseMemory();
103
104
105// how much output do we want?
106int verbosity = 1;
107
108
109int main (int argc, char * argv[]) {
110
111 // Command-line arguments
112 // argv[1] is the phindex directory
113 // argv[2] is the mode, where 1 is stopwords mode
114 // argv[3] is the verbosity (optional)
115 if (argc < 3) {
116 cerr << "Usage: " << argv[0] << " phind-directory mode [verbosity]" << endl;
117 exit(1);
118 }
119
120 // collection directory
121 strcpy(collection, argv[1]);
122
123 // mode parameter
124 phraseMode = atoi(argv[2]);
125 assert((phraseMode == STOPWORDS) || (phraseMode == ANYPHRASE));
126
127 // optional verbosity parameter
128 if (argc == 4) {
129 verbosity = atoi(argv[3]);
130 assert (verbosity >= 0);
131 }
132
133 if (verbosity) {
134 cout << "Suffix phrase extraction program" << endl;
135 }
136
137 if (verbosity > 1) {
138 if (phraseMode == STOPWORDS) {
139 cout << "Stopwords mode: no phrase may begin or end with a stopword" << endl;
140 } else {
141 cout << "AllPhrase mode: extract every phrase that occurs more than once" << endl;
142 }
143 }
144
145 // Read the statistics file
146 readStatistics();
147
148 // Read the numbers file
149 readNumbers();
150
151 // Create the suffix & prefix arrays
152 suffixArray = new symbol *[inputLength];
153 prefixArray = new symbol *[inputLength];
154 suffixCheck = new check[inputLength];
155 prefixCheck = new check[inputLength];
156 if (prefixCheck == NULL) {
157 cerr << "Suffix error: not enough memory to hold " << inputLength
158 << " symbols." << endl;
159 exit(2);
160 }
161
162 // Initialise prefix and suffix arrays
163 for (cellcount j = 0; j < inputLength; j++) {
164 suffixArray[j] = &symbols[j];
165 prefixArray[j] = &symbols[j];
166 }
167 qsort(suffixArray, inputLength, sizeof(symbol *), suffixCompare);
168 qsort(prefixArray, inputLength, sizeof(symbol *), prefixCompare);
169
170
171 // Create the document arrays
172 if (numberOfDocuments == 0) {
173 cerr << "There are no documents in this collection!" << endl;
174 exit(1);
175 }
176 if (verbosity > 1) {
177 cout << "Allocating document arrays for " << numberOfDocuments << " documents" << endl;
178 }
179
180 // The document frequency array is used to count the number of times
181 // each phrase occurs in each document. The number of documents in
182 // which a phrase occurs is stored in df.
183 frequency *documentFrequency = new frequency[numberOfDocuments];
184 frequency df;
185
186 // documentArray will be searched in order to discover which document
187 // each phrase occurs in.
188 documentArray = new symbol *[numberOfDocuments];
189
190 // Discover all the DOCUMENTSTART symbols and store as a phrase
191 cellindex d = 0;
192 while (*suffixArray[d] != DOCUMENTSTART) {
193 d++;
194 }
195 Phrase p(suffixArray[d], 1, SUFFIX);
196 p.findFirstAndLastSuffix(d, inputLength-1);
197
198 // Insert the document locations (as pointers) into documentArray
199 for (cellcount i = 0; i < p.suffixFrequency; i++) {
200 documentArray[i] = suffixArray[i + p.firstSuffixIndex];
201 }
202
203 // Sort the document array into ascending order of raw pointer value
204 qsort(documentArray, numberOfDocuments, sizeof(symbol *), pointerCompare);
205
206
207 // Extract phrases
208 //
209 // We will make several passes over the data, in each case considering
210 // a set of input phrases and generating a set of output phrases, which
211 // we will expand in later passes.
212 //
213 // The input phrases in the first pass will be the vocabulary.
214 // In later passes, the input phrases will be the output phrases of the
215 // previous pass.
216 //
217 // In each pass we will consider each input phrase in turn. If we
218 // have seen it before, we will ignore it. Otherwise, we will expand
219 // it and add its expansions to the set of output phrases.
220
221 // Store the phrase data in the phrases file
222 char phraseDataName[FILENAME_MAX];
223 sprintf(phraseDataName, "%s/phrases", collection);
224 ofstream phraseData(phraseDataName, ios::out);
225 if (!phraseData) {
226 cout << "File " << phraseDataName << " could not be opened\n";
227 exit(1);
228 }
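  // For reference, each record written to the phrases file by the loops below
  // has the shape (the values here are purely illustrative, not from any real
  // collection):
  //
  //   index-length:phrase text:frequency:expansion count:expansion list:df:document list
  //
  //   e.g.  4059-1:forest:12:2:4060-2,7121-2:3:d1,6;d4;d7,2
  //
  // where each expansion is "firstSuffixIndex-length" and each document is
  // "d<docnum>", with ",frequency" appended only when the frequency exceeds one.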
229
230 // Count the number of phrases output
231 unsigned long int phraseCounter = 0;
232
233 // Set up the phrase expansion memory.
234 // We need this so that we don't expand a phrase more than once
235 initialisePhraseMemory();
236
237 // The current pass number
238 int phrasePass = 1;
239
240
241 // PASS NUMBER 1
242 if (verbosity > 1) {
243 cout << "Starting pass " << phrasePass << endl;
244 }
245
246 ofstream outPhrase;
247 char outPhraseName[FILENAME_MAX];
248 unsigned long int outPhraseCounter = 0;
249
250 // On the first pass, simply work through the vocabulary
251 sprintf(outPhraseName, "%s/outPhrase.1", collection);
252 outPhrase.open(outPhraseName, ios::out);
253 if (!outPhrase) {
254 cerr << "File " << outPhraseName << " could not be opened\n";
255 exit(1);
256 }
257
258 // Iterate over the different symbols by working through the suffix array
259 vector<Phrase> result;
260 cellindex ij = 0;
261 char *tmpString;
262
263 while (ij < inputLength) {
264
265 // make a new phrase of length 1
266 p = Phrase(suffixArray[ij], 1, SUFFIX);
267 p.findFirstAndLastSuffix(ij, inputLength-1);
268
269 // cout << "cell " << ij << " - " << p.toString() << endl;
270
271 // We ignore this symbol if it occurs only once, if it is a delimiter,
272 // or if we are in stopwords mode and it is a stopword.
273 //
274 // We could imagine a new mode/command-line option, which is like
275 // STOPWORDS but without this restriction. This would let you browse
276 // from "the" to "the AGRIS" for example, but not from "AGRIS" to
277 // "the AGRIS" (where "the" is a stopword and AGRIS a content word).
278 // The system used to work like this; it is easy to implement, but
279 // it explodes the size of the indexes. So: would it be useful?
280 if (!((p.suffixFrequency <= 1) ||
281 // (*suffixArray[ij] != 23054) ||
282 (*suffixArray[ij] <= LASTDELIMITER) ||
283 ((phraseMode == STOPWORDS) && (*suffixArray[ij] <= lastStopSymbol)))) {
284
285 // Get minimal expansions of the phrase
286 getMinimalExpansions(p, result);
287
288 if (!result.empty()) {
289
290 // Remember that we have expanded this phrase
291 rememberThisPhrase(ij, 1);
292
293 // write the phrase text
294 tmpString = p.toString();
295 phraseData << ij << "-1:" << tmpString << ":" << p.suffixFrequency << ":"
296 << result.size() << ":";
297 delete [] tmpString;
298
299 // write the results
300 for (cellcount k = 0; k < result.size(); k++) {
301 if (k) {
302 phraseData << ",";
303 }
304 phraseData << result[k].firstSuffixIndex << "-" << result[k].length;
305 outPhrase << result[k].firstSuffixIndex << " " << result[k].length << endl;
306 outPhraseCounter++;
307 }
308 result.clear();
309
310 // Write the documents in which this phrase occurs
311 df = getDocumentOccurrances(p, documentFrequency);
312 phraseData << ":" << df << ":";
313
314 // write the documents
315 for (cellcount m = 0, first = 1; m < numberOfDocuments; m++) {
316 if (documentFrequency[m]) {
317 if (first) {
318 first = 0;
319 } else {
320 phraseData << ";";
321 }
322 // Output the document number. Note that here we've numbered the
323 // N documents from 0 to N-1, but later they'll be 1-N. Thus we
324 // add 1 to the document id when we output it.
325 phraseData << "d" << (m+1);
326 // Next, output the frequency with which the document occurs, but
327 // only if it is > 1.
328 if (documentFrequency[m] > 1) {
329 phraseData << "," << documentFrequency[m];
330 }
331 }
332 }
333
334 phraseData << endl;
335 phraseCounter++;
336
337 // feedback
338 if (verbosity) {
339 if (phraseCounter % 1000 == 0) {
340 tmpString = p.toString();
341 cout << "phrase " << phraseCounter << ": "
342 << "cell " << p.firstSuffixIndex << " - " << tmpString << endl;
343 delete [] tmpString;
344 }
345 }
346 }
347 }
348 ij = p.lastSuffixIndex + 1;
349 }
350 outPhrase.close();
351
352 // REMAINING PASSES
353 // The previous outPhrase file forms the input to each new pass
354 cellcount start, length;
355 while (outPhraseCounter > 0) {
356
357 // Start a new pass
358 phrasePass++;
359 if (verbosity) {
360 cout << "Starting pass " << phrasePass << endl;
361 }
362
363 // Open the input file
364 char inPhraseName[FILENAME_MAX];
365 sprintf(inPhraseName, "%s/outPhrase.%d", collection, phrasePass - 1);
366 ifstream inPhrase (inPhraseName, ios::in);
367 if (!inPhrase) {
368 cerr << "File " << inPhraseName << " could not be opened\n";
369 exit(1);
370 }
371
372 // Open the output file
373 sprintf(outPhraseName, "%s/outPhrase.%d", collection, phrasePass);
374 outPhrase.open(outPhraseName, ios::out);
375 if (!outPhrase) {
376 cerr << "File " << outPhraseName << " could not be opened\n";
377 exit(1);
378 }
379 outPhraseCounter = 0;
380
381 // Process each phrase
382 while(inPhrase >> start >> length) {
383
384 // Ignore the phrase if we have expanded it before
385 if (isPhraseStored(start, length)) {
386 continue;
387 }
388
389 // Remember that we have examined this phrase
390 rememberThisPhrase(start, length);
391
392 // Find the phrase in the suffix array
393 p = Phrase(suffixArray[start], length, SUFFIX);
394 p.findFirstAndLastSuffix(start, inputLength-1);
395
396 // cout << "index " << start << ", length " << length << " - " << p.toString() << endl;
397
398
399 // Ignore the phrase if it only occurs once
400 if (p.suffixFrequency < 2) {
401 continue;
402 }
403
404
405 // Write the phrase text
406 tmpString = p.toString();
407 phraseData << start << "-" << length << ":" << tmpString << ":"
408 << p.suffixFrequency << ":";
409 delete [] tmpString;
410
411
412 // Expand the phrase, if it is no more than 8 words long
413 if (length <= 8) {
414
415 // Get the minimal expansions for this phrase
416 getMinimalExpansions(p, result);
417
418 // write the results
419 phraseData << result.size() << ":";
420
421 for (cellcount i = 0; i < result.size(); i++) {
422 if (i) {
423 phraseData << ",";
424 }
425 phraseData << result[i].firstSuffixIndex << "-" << result[i].length;
426 outPhrase << result[i].firstSuffixIndex << " " << result[i].length << endl;
427 outPhraseCounter++;
428 }
429 result.clear();
430
431 } else {
432 // phrase is too long to expand further
433 phraseData << "0:";
434 }
435
436
437 // Write the documents in which this phrase occurs
438 df = getDocumentOccurrances(p, documentFrequency);
439 phraseData << ":" << df << ":";
440
441 // write the documents
442 for (cellcount i = 0, first = 1; i < numberOfDocuments; i++) {
443 if (documentFrequency[i]) {
444 if (first) {
445 first = 0;
446 } else {
447 phraseData << ";";
448 }
449 // Output the document number. Note that here we've numbered the
450 // N documents from 0 to N-1, but later they'll be 1-N. Thus we
451 // add 1 to the document id when we output it.
452 phraseData << "d" << (i+1);
453 // Next, output the frequency with which the document occurs, but
454 // only if it is > 1.
455 if (documentFrequency[i] > 1) {
456 phraseData << "," << documentFrequency[i];
457 }
458 }
459 }
460
461 phraseData << endl;
462 phraseCounter++;
463
464 // feedback
465 if (verbosity) {
466 if (phraseCounter % 1000 == 0) {
467 tmpString = p.toString();
468 cout << "phrase " << phraseCounter << ": "<< "start " << start
469 << ", length " << length << " - " << tmpString << endl;
470 delete [] tmpString;
471 }
472 }
473
474 }
475
476 inPhrase.close();
477 outPhrase.close();
478 }
479
480 phraseData.close();
481 deletePhraseMemory();
482
483 delete [] documentFrequency;
484 delete [] symbols;
485 delete [] suffixArray;
486 delete [] prefixArray;
487 delete [] suffixCheck;
488 delete [] prefixCheck;
489 delete [] documentArray;
490
491
492
493 cout << endl << "Done: " << phraseCounter << " phrases in " << phraseDataName << endl;
494 return 0;
495}
496
497
498// Get Minimal Expansions
499//
500// Get the set of "minimal" expansions of a phrase p, using the
501// algorithm described in the documentation.
502//
503// The expansions are appended to the results vector.
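//
// A minimal usage sketch (mirroring the calls made in main above):
//
//   vector<Phrase> expansions;
//   Phrase p(suffixArray[start], length, SUFFIX);
//   p.findFirstAndLastSuffix(start, inputLength - 1);
//   getMinimalExpansions(p, expansions);
//   // ... use expansions, then ...
//   expansions.clear();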
504
505void getMinimalExpansions(Phrase &p, vector<Phrase> &results) {
506
507 // 1. Initialise the result and candidate vectors
508 vector<Phrase> candidates;
509 for (cellcount j = 0; j < inputLength; j++) {
510 suffixCheck[j] = 0;
511 prefixCheck[j] = 0;
512 }
513
514 // 2. Expand the phrase p
515
516 // 2.1 Create the candidate set
517 p.initialSuffixCandidates(candidates);
518 p.initialPrefixCandidates(candidates);
519
520 // 2.2 Sort the candidates by phrase length
521 make_heap(candidates.begin(), candidates.end(), isLonger);
522
523 // 3. While candidates is non-empty, confirm the phrases it
524 // contains, expanding them as required
525 while (!candidates.empty()) {
526
527 // 3.1 Get next candidate
528 pop_heap(candidates.begin(), candidates.end(), isLonger);
529 Phrase c = candidates.back();
530 candidates.pop_back();
531
532 // 3.2 If we know there are no unique right extensions
533 // (i.e. this is a phrase drawn from the suffix array)
534 if (!c.hasUniqueSuffixExtension()) {
535
536 c.ensurePrefixFound();
537
538 // 3.2.1 Ignore candidate if we have used a subphrase instead
539 if (suffixCheck[c.firstSuffixIndex] || prefixCheck[c.firstPrefixIndex]) {
540 // cout << "ignoring" << endl;
541 }
542
543 // 3.2.2 If candidate has a unique left (prefix) extension,
544 // Then extend it and add it back into Candidates.
545 else if (c.hasUniquePrefixExtension()) {
546 // cout << "expanding prefix " << c.toString() << "=> ";
547 c.expandUniquePrefixExtensionByOne();
548 candidates.push_back(c);
549 push_heap(candidates.begin(), candidates.end(), isLonger);
550 }
551
552 // 3.2.3 If candidate has no unique left (prefix) extension,
553 // Then add it to the list of results.
554 else {
555 // cout << "no unique prefix, add to results" << endl;
556 results.push_back(c);
557 for (cellcount i = c.firstSuffixIndex; i <= c.lastSuffixIndex; i++) {
558 suffixCheck[i] = c.length;
559 }
560 for (cellcount ik = c.firstPrefixIndex; ik <= c.lastPrefixIndex; ik++) {
561 prefixCheck[ik] = c.length;
562 }
563 }
564 }
565
566 // 3.3 If we know there are no unique left extensions,
567 // Then do the same as for 3.2 but exchange suffix & prefix
568 else if (!c.hasUniquePrefixExtension()) {
569
570 c.ensureSuffixFound();
571
572 // 3.3.1
573 if (suffixCheck[c.firstSuffixIndex] || prefixCheck[c.firstPrefixIndex]) {
574
575 }
576
577 // 3.3.2
578 else if (c.hasUniqueSuffixExtension()) {
579 c.expandUniqueSuffixExtensionByOne();
580 candidates.push_back(c);
581 push_heap(candidates.begin(), candidates.end(), isLonger);
582 }
583
584 // 3.3.3
585 else {
586 results.push_back(c);
587 for (cellcount i = c.firstSuffixIndex; i <= c.lastSuffixIndex; i++) {
588 suffixCheck[i] = c.length;
589 }
590 for (cellcount ijk = c.firstPrefixIndex; ijk <= c.lastPrefixIndex; ijk++) {
591 prefixCheck[ijk] = c.length;
592 }
593
594 }
595 }
596 }
597}
598
599
600// suffixCompare
601//
602// Compare two pointers into a suffix array. We use this in the
603// qsort function, so the inputs are pointers to pointers.
604//
605// Return -1 if (a < b); otherwise (a > b), so return +1.
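//
// For example, with purely hypothetical symbol numbers, a suffix beginning
// 52 817 12 ... sorts before one beginning 52 817 30 ...: the comparison
// walks past the equal symbols (52, 817) and returns -1 at the third.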
606
607int suffixCompare(const void *cpa, const void *cpb) {
608
609 // Cast then dereference pointers to suffix array elements
610 symbol *pa = (symbol *) cpa;
611 symbol *pb = (symbol *) cpb;
612 pa = (symbol *) *pa;
613 pb = (symbol *) *pb;
614
615 // If the two elements are the same, examine the next one
616 while (*pa == *pb) {
617 pa++;
618 pb++;
619 }
620
621 // Make the comparison and return
622 if ( *pa < *pb) {
623 return -1;
624 } else {
625 return +1;
626 }
627}
628
629
630// prefixCompare
631//
632// Compare two pointers into a prefix array. We use this in the
633// qsort function, so the inputs are pointers to pointers.
634//
635// Return -1 if (a > b); otherwise (a < b), so return +1.
636
637int prefixCompare(const void *cpa, const void *cpb) {
638
639 // Cast then dereference pointers to prefix array elements
640 symbol *pa = (symbol *) cpa;
641 symbol *pb = (symbol *) cpb;
642 pa = (symbol *) *pa;
643 pb = (symbol *) *pb;
644
645 // If the two elements are the same, examine the next one
646 while (*pa == *pb) {
647 pa--;
648 pb--;
649 }
650
651 // Make the comparison and return
652 if ( *pa > *pb) {
653 return -1;
654 } else {
655 return +1;
656 }
657}
658
659// pointerCompare
660//
661// Compare two pointers based on the memory location they point to.
662
663int pointerCompare( const void *pa, const void *pb ) {
664
665 symbol **a = (symbol **) pa;
666 symbol **b = (symbol **) pb;
667
668 if (*a < *b) {
669 return -1;
670 } else if (*a > *b) {
671 return 1;
672 } else {
673 return 0;
674 }
675}
676
677
678// Read the clauses.numbers file into the "symbols" array.
679//
680// Each number in the file is a symbol number; it is essential that
681// the first symbol (and no others) be COLLECTIONSTART and the last
682// symbol (and no others) be COLLECTIONEND.
683//
684// Return the number of numbers in the array.
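//
// Roughly speaking (a sketch of the expected layout, not an exact dump),
// the stream of numbers looks like:
//   COLLECTIONSTART  DOCUMENTSTART <symbols of document 1> ...
//   DOCUMENTSTART <symbols of document 2> ...  COLLECTIONEND
// and numberOfDocuments is counted from the DOCUMENTSTART symbols.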
685
686int readNumbers() {
687
688 char filename[FILENAME_MAX];
689 sprintf(filename, "%s/clauses.numbers", collection);
690 if (verbosity) {
691 cout << "Reading numbers file: " << filename << endl;
692 }
693
694 // Open the numbers file
695 ifstream inFile1(filename, ios::in);
696 if (!inFile1) {
697 cerr << "File " << filename << " could not be opened\n";
698 exit(1);
699 }
700
701 // Count the number of symbols
702 inputLength = 0;
703 symbol word;
704 while (inFile1 >> word) {
705 inputLength++;
706 }
707 inFile1.close();
708
709 // Allocate the symbols array
710 if (verbosity > 1) {
711 cout << "Allocating symbol arrays for " << inputLength << " symbols" << endl;
712 }
713 symbols = new symbol[inputLength];
714 if (symbols == NULL) {
715 cerr << "Suffix error: not enough memory to hold " << inputLength
716 << " symbols." << endl;
717 exit(2);
718 }
719
720 // Read the numbers file into the numbers array
721 if (verbosity > 2) {
722 cout << "Reading the numbers" << endl;
723 }
724
725 ifstream inFile2(filename, ios::in);
726 if (!inFile2) {
727 cerr << "File " << filename << " could not be opened\n";
728 exit(1);
729 }
730 cellcount next = 0;
731 numberOfDocuments = 0;
732 while (inFile2 >> word) {
733 symbols[next++] = word;
734 if (word == DOCUMENTSTART) {
735 numberOfDocuments++;
736 }
737 }
738 inFile2.close();
739
740 // Make sure the numbers file is intact
741 assert(symbols[0] == COLLECTIONSTART);
742 assert(symbols[next-1] == COLLECTIONEND);
743
744 return inputLength;
745}
746
747
748
749// Get Document Occurrence statistics
750//
751// Given a phrase, what documents does it occur in?
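//
// Sketch of the approach (see the code below for details): for every
// occurrence of the phrase, i.e. every pointer in
// suffixArray[p.firstSuffixIndex .. p.lastSuffixIndex], a binary chop over
// the address-sorted documentArray finds the last DOCUMENTSTART whose
// address precedes the occurrence. frequency[] accumulates the per-document
// counts, and the return value is the number of distinct documents touched.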
752
753cellcount getDocumentOccurrances(Phrase &p, cellcount *frequency) {
754
755 // cout << "searching for \""<< p.toString() << "\" in documents "
756 // << 0 << "-" << numberOfDocuments - 1 << endl;
757
758 // The number of documents in which this phrase occurs
759 cellcount df = 0;
760
761 // Initialise the document frequency array
762 for (cellindex i = 0; i < numberOfDocuments; i++) {
763 frequency[i] = 0;
764 }
765
766 // variables used to facilitate the search
767 cellindex begin;
768 cellindex end;
769 cellindex d;
770 symbol *target;
771 bool found;
772
773 // search for the document in which each occurrence of the phrase is found
774 for (cellcount j = p.firstSuffixIndex; j <= p.lastSuffixIndex; j++) {
775
776 // cout << "looking for phrase at suffixArray[" << j << "]\n";
777
778 target = suffixArray[j];
779 begin = 0;
780 end = numberOfDocuments - 1;
781 found = false;
782
783 // Search for the occurrence of a document delimiter that target
784 // occurs immediately after.
785 // We do this by performing a binary chop search on documentArray.
786 while (!found) {
787
788 // cout << "searching for " << (cellindex) target << " in "
789 // << begin << " - " << end << endl;
790
791 assert (begin <= end);
792
793 // If the beginning and end of the interval are the same,
794 // then we've found the correct document
795 if (begin == end) {
796 if (frequency[begin] == 0) {
797 df++;
798 }
799 frequency[begin]++;
800 found = true;
801 }
802
803 // Otherwise, examine a new document midway through the begin-end
804 // interval and see if it is the one.
805 else {
806 d = (begin + end) / 2;
807 if (target > documentArray[d]) {
808 // If the target address is greater than this, but this is the last document,
809 // then this must be the one we want. Or, if target is greater than
810 // this one but less than the next, this must be the one we want.
811 if ((d == numberOfDocuments - 1) || (target < documentArray[d+1])) {
812 if (frequency[d] == 0) {
813 df++;
814 }
815 frequency[d]++;
816 found = true;
817 } else {
818 // otherwise we know to search later in the document set
819 begin = d + 1;
820 }
821 } else {
822 // search earlier in the document set
823 end = d - 1;
824 }
825 }
826 }
827 }
828 return df;
829}
830
831
832
833
834
835
836// phraseExpansionMemory : Which phrases have we expanded?
837//
838// A set of utilities for keeping track of which phrases we have expanded.
839// We don't want to expand a phrase more than once, after all.
840//
841// This REALLY ought to be in its own class, but it works so that's okay.
842//
843// Phrases are identified by their suffixArray index (firstSuffixIndex) and length.
844//
845// Functions provided are:
846// void initialisePhraseMemory()
847// void rememberThisPhrase(index, length)
848// bool isPhraseStored(index, length)
849// void deletePhraseMemory()
850//
851// Internally, we will have two separate cases:
852//
853// Phrases of length 1-8:
854// unsigned char phraseMemory[inputLength]
855// is an array where each cell "remembers" the corresponding index in the
856// suffixArray, and each of the 8 bits of the cell correspond to the phrases
857// of length 1, 2... 8.
858// Eventually, we will make this disk-based (i.e. store the array in a file).
859//
860// Phrases of length 9+:
861// file hashTableFile
862// file listOfEntries
863// The first file is a hash table; each phrase maps to one of its cells, which
864// contains either 0 (empty, no occurrence) or a number which is an entry number
865// in the second file. This file contains a "list" of entries. Each consists of
866// three numbers: the suffixArray index of the phrase, the length of the phrase,
867// and the entry number of the next phrase with the same hash.
868//
869
870
871unsigned char *phraseMemory;
872
873void initialiseLongPhraseMemory();
874void rememberThisLongPhrase(cellindex index, cellcount length);
875bool isLongPhraseStored(cellindex index, cellcount length);
876void deleteLongPhraseMemory();
877
878
879void initialisePhraseMemory() {
880
881 phraseMemory = new unsigned char[inputLength];
882
883 // to begin with, everything is empty
884 for (cellcount i = 0; i < inputLength; i++) {
885 phraseMemory[i] = 0;
886 }
887
888 // initialise the hashTable of long phrases
889 initialiseLongPhraseMemory();
890
891}
892
893void rememberThisPhrase(cellindex index, cellcount length) {
894
895 // if the phrase is very long, use the file-based system
896 if (length > 8) {
897 rememberThisLongPhrase(index, length);
898 return;
899 }
900
901 // create a char with just the bit corresponding to length set
902 unsigned char newbit = 1;
903 for (cellcount i = 1; i < length; i++) {
904 newbit <<= 1;
905 }
906
907 // set that bit in the memory array at position index
908 phraseMemory[index] |= newbit;
909}
910
911
912bool isPhraseStored(cellindex index, cellcount length) {
913
914 // if the phrase is very long, use the file-based system
915 if (length > 8) {
916 return isLongPhraseStored(index, length);
917 }
918
919 // create a char with just the bit corresponding to length set
920 unsigned char newbit = 1;
921 for (cellcount i = 1; i < length; i++) {
922 newbit <<= 1;
923 }
924
925 // return true if that bit is set in the memory array at position index
926 return (phraseMemory[index] & newbit);
927}
928
929void deletePhraseMemory() {
930 delete [] phraseMemory;
931 deleteLongPhraseMemory();
932}
933
934
935
936// Files etc. used to store "long" equivalents of the above
937
938fstream hashTableFile;
939char hashTableFileName[FILENAME_MAX];
940fstream listOfEntries;
941char listOfEntriesName[FILENAME_MAX];
942cellindex nextEntryNumber;
943
944const cellcount bigPrime = 7919;
945
946
947void initialiseLongPhraseMemory() {
948
949 cellindex example = 0;
950
951 sprintf(hashTableFileName, "%s/hashTable", collection);
952 sprintf(listOfEntriesName, "%s/hashLists", collection);
953
954
955 // create the new hashtable
956 if (verbosity > 1) {
957 cout << "Initialising hashTable: " << hashTableFileName << endl;
958 }
959 hashTableFile.open(hashTableFileName, ios::in | ios::out);
960 for (cellcount i = 0; i < bigPrime; i++) {
961 hashTableFile.write((char *) &example, sizeof(example));
962 }
963
964 // create the list of phrases
965 if (verbosity > 1) {
966 cout << "Initialising list of hashtable entries: " << listOfEntriesName << endl;
967 }
968 listOfEntries.open(listOfEntriesName, ios::in | ios::out);
969 listOfEntries.write((char *) &example, sizeof(example));
970 listOfEntries.write((char *) &example, sizeof(example));
971 listOfEntries.write((char *) &example, sizeof(example));
972 nextEntryNumber = 1;
973}
974
975
976void rememberThisLongPhrase(cellindex index, cellcount length) {
977
978 // cout << "rememberThisLongPhrase(" << index << ", " << length << ")\n";
979
980 cellindex hashOffset = ((index + length) % bigPrime) * sizeof(cellindex);
981 cellindex pointer;
982 cellindex zero = 0;
983 cellindex readp = 0;
984 cellindex readi = 0;
985 cellindex readl = 0;
986
987 hashTableFile.seekg(hashOffset);
988 hashTableFile.read((char *) &pointer, sizeof(cellindex));
989
990 if (pointer == 0) {
991 // There is no entry at all in the hash table for this phrase,
992 // so create one
993
994 pointer = nextEntryNumber++;
995 hashTableFile.seekp(hashOffset);
996 hashTableFile.write((char *) &pointer, sizeof(cellindex));
997
998 listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
999 listOfEntries.write((char *) &zero, sizeof(cellindex));
1000 listOfEntries.write((char *) &index, sizeof(cellindex));
1001 listOfEntries.write((char *) &length, sizeof(cellindex));
1002
1003 } else {
1004 // There is a list starting at this hash value, so the phrase may
1005 // be already remembered, or it might need to be appended
1006
1007 while (pointer != 0) {
1008 // Read the entry pointed to by pointer
1009 listOfEntries.seekg(pointer * sizeof(cellindex) * 3);
1010 listOfEntries.read((char *) &readp, sizeof(cellindex));
1011 listOfEntries.read((char *) &readi, sizeof(cellindex));
1012 listOfEntries.read((char *) &readl, sizeof(cellindex));
1013
1014 // cout << "read " << pointer << ", " << readp << ", " << readi << ", " << readl << endl;
1015
1016 if ((readi == index) && (readl == length)) {
1017 // we've found that we've already stored it
1018 return;
1019 } else if (readp == 0) {
1020 // we've reached the end of the list. Add a new entry.
1021 listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
1022 listOfEntries.write((char *) &nextEntryNumber, sizeof(cellindex));
1023 pointer = nextEntryNumber++;
1024
1025 listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
1026 listOfEntries.write((char *) &zero, sizeof(cellindex));
1027 listOfEntries.write((char *) &index, sizeof(cellindex));
1028 listOfEntries.write((char *) &length, sizeof(cellindex));
1029 return;
1030 } else {
1031 // go on to the next node
1032 pointer = readp;
1033 }
1034 }
1035 }
1036
1037
1038}
1039
1040bool isLongPhraseStored(cellindex index, cellcount length) {
1041
1042 // cout << "isLongPhraseExpanded(" << index << ", " << length << ")\n";
1043
1044 cellindex hashOffset = ((index + length) % bigPrime) * sizeof(cellindex);
1045 cellindex pointer;
1046 cellindex readp = 0;
1047 cellindex readi = 0;
1048 cellindex readl = 0;
1049
1050 // Find the phrase in the hashFile
1051 hashTableFile.seekg(hashOffset);
1052 hashTableFile.read((char *) &pointer, sizeof(cellindex));
1053
1054 if (pointer == 0) {
1055 // There is no entry at all in the hash table for this phrase,
1056 // so nothing is stored
1057 return false;
1058
1059 } else {
1060 // There is a list starting at this hash value, so the phrase may
1061 // be already remembered in that list
1062 while (pointer != 0) {
1063 // Read the entry pointed to by pointer
1064 listOfEntries.seekg(pointer * sizeof(cellindex) * 3);
1065 listOfEntries.read((char *) &readp, sizeof(cellindex));
1066 listOfEntries.read((char *) &readi, sizeof(cellindex));
1067 listOfEntries.read((char *) &readl, sizeof(cellindex));
1068
1069 if ((readi == index) && (readl == length)) {
1070 // we've found the phrase stored here
1071 return true;
1072 } else {
1073 // go on to the next node
1074 pointer = readp;
1075 }
1076 }
1077 }
1078 return false;
1079}
1080
1081void deleteLongPhraseMemory() {
1082 // remove the hash & other files
1083
1084 hashTableFile.close();
1085 listOfEntries.close();
1086 remove(hashTableFileName);
1087 remove(listOfEntriesName);
1088
1089}
1090
1091
1092
1093
1094// Read the collection statistics file
1095void readStatistics() {
1096
1097 // Build the statistics file name
1098 char filename[FILENAME_MAX];
1099 sprintf(filename, "%s/clauses.stats", collection);
1100
1101 // Open the file
1102 ifstream inFile(filename, ios::in);
1103 if (!inFile) {
1104 cerr << "File " << filename << " could not be opened\n";
1105 exit(1);
1106 }
1107
1108 // Read the key-value pairs from the statistics file
1109 char key[1000];
1110 symbol value;
1111 while (inFile >> key >> value){
1112 if (strcmp(key, "first_stopword") == 0) {
1113 firstStopSymbol = value;
1114 } else if (strcmp(key, "last_stopword") == 0) {
1115 lastStopSymbol = value;
1116 } else if (strcmp(key, "first_contentword") == 0) {
1117 firstContentSymbol = value;
1118 } else if (strcmp(key, "last_contentword") == 0) {
1119 lastContentSymbol = value;
1120 }
1121 }
1122 inFile.close();
1123
1124 // Make sure we have the information we need
1125 if (!(firstStopSymbol && lastStopSymbol && firstContentSymbol && lastContentSymbol)) {
1126 cerr << "Statistics file incomplete" << endl;
1127 exit(1);
1128 }
1129}
1130
1131
1132
1133
1134