source: trunk/gsdl/src/phind/generate/suffix.cpp@ 1882

Last change on this file since 1882 was 1882, checked in by paynter, 23 years ago

The length of the main symbols array is now calculated from the
clauses.numbers file, not passed in as a command-line parameter.
This is a little slower, as we have to make an extra pass over
the text, but oh-so-much-more convenient. Requires changes to the
command-line arguments (which now support verbosity).

  • Property svn:keywords set to Author Date Id Revision
File size: 30.5 KB
/**********************************************************************
 *
 * suffix.cpp -- Extract the repeated phrases in the input using
 * suffix and prefix arrays.
 *
 * Copyright 2000 Gordon W. Paynter
 * Copyright 2000 The New Zealand Digital Library Project
 *
 * A component of the Greenstone digital library software
 * from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *********************************************************************/

#include <assert.h>
#include <fstream.h>
#include <iostream.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <algo.h>
#include <heap.h>
#include <vector.h>

#include "suffix.h"
#include "phrase.h"


// Global variables declared in suffix.h
cellcount inputLength;

symbol *symbols;
symbol **suffixArray;
check *suffixCheck;
symbol **prefixArray;
check *prefixCheck;


// How many documents are in this collection?
cellcount numberOfDocuments;
symbol **documentArray;

// Do we accept any phrase, or do we eliminate those that begin or end with stopwords?
int phraseMode = ANYPHRASE; //STOPWORDS;

// The filestem of the collection's phindex directory
char collection[FILENAME_MAX];

int suffixCompare(const void *, const void *);
int prefixCompare(const void *, const void *);
int pointerCompare(const void *, const void *);

int readNumbers();
void readStatistics();

void getMinimalExpansions(Phrase &p, vector<Phrase> &results);
cellcount getDocumentOccurrances(Phrase &p, cellcount *frequency);

// The ranges of the stopword and content-word symbols for the collection
symbol firstStopSymbol = 0;
symbol lastStopSymbol = 0;
symbol firstContentSymbol = 0;
symbol lastContentSymbol = 0;




// Phrase memory
// We have to "remember" each phrase that we've expanded
void initialisePhraseMemory();
void rememberThisPhrase(cellindex index, cellcount length);
bool isPhraseStored(cellindex index, cellcount length);
void deletePhraseMemory();


// how much output do we want?
int verbosity = 1;


int main (int argc, char * argv[]) {

  // Command-line arguments
  // argv[1] is the phindex directory
  // argv[2] is the mode, where 1 is stopword mode
  // argv[3] is the verbosity (optional)
  if (argc < 3) {
    cerr << "Usage: " << argv[0] << " phind-directory mode [verbosity]" << endl;
    exit(1);
  }

  // collection directory
  strcpy(collection, argv[1]);

  // mode parameter
  phraseMode = atoi(argv[2]);
  assert((phraseMode == STOPWORDS) || (phraseMode == ANYPHRASE));

  // optional verbosity parameter
  if (argc == 4) {
    verbosity = atoi(argv[3]);
    assert (verbosity >= 0);
  }

  if (verbosity) {
    cout << "Suffix phrase extraction program" << endl;
  }

  if (verbosity > 1) {
    if (phraseMode == STOPWORDS) {
      cout << "Stopwords mode: no phrase may begin or end with a stopword" << endl;
    } else {
      cout << "AllPhrase mode: extract every phrase that occurs more than once" << endl;
    }
  }

  // Read the statistics file
  readStatistics();

  // Read the numbers file
  readNumbers();

  // Create the suffix & prefix arrays
  suffixArray = new symbol *[inputLength];
  prefixArray = new symbol *[inputLength];
  suffixCheck = new check[inputLength];
  prefixCheck = new check[inputLength];
  if (prefixCheck == NULL) {
    cerr << "Suffix error: not enough memory to hold " << inputLength
         << " symbols." << endl;
    exit(2);
  }

  // Initialise prefix and suffix arrays
  for (cellcount j = 0; j < inputLength; j++) {
    suffixArray[j] = &symbols[j];
    prefixArray[j] = &symbols[j];
  }
  qsort(suffixArray, inputLength, sizeof(symbol *), suffixCompare);
  qsort(prefixArray, inputLength, sizeof(symbol *), prefixCompare);


  // Create the document arrays
  if (numberOfDocuments == 0) {
    cerr << "There are no documents in this collection!" << endl;
    exit(1);
  }
  if (verbosity > 1) {
    cout << "Allocating document arrays for " << numberOfDocuments << " documents" << endl;
  }

  // The document frequency array is used to count the number of times
  // each phrase occurs in each document. The number of documents in
  // which a phrase occurs is stored in df.
  frequency documentFrequency[numberOfDocuments];
  frequency df;

  // documentArray will be searched in order to discover which document
  // each phrase occurs in.
  documentArray = new symbol *[numberOfDocuments];

  // Discover all the DOCUMENTSTART symbols and store as a phrase
  cellindex d = 0;
  while (*suffixArray[d] != DOCUMENTSTART) {
    d++;
  }
  Phrase p(suffixArray[d], 1, SUFFIX);
  p.findFirstAndLastSuffix(d, inputLength-1);

  // Insert the document locations (as pointers) into documentArray
  for (cellcount i = 0; i < p.suffixFrequency; i++) {
    documentArray[i] = suffixArray[i + p.firstSuffixIndex];
  }

  // Sort the document array into ascending order of raw pointer value
  qsort(documentArray, numberOfDocuments, sizeof(symbol *), pointerCompare);


  // Extract phrases
  //
  // We will make several passes over the data, in each case considering
  // a set of input phrases and generating a set of output phrases, which
  // we will expand in later passes.
  //
  // The input phrases in the first pass will be the vocabulary.
  // In later passes, the input phrases will be the output phrases of the
  // previous pass.
  //
  // In each pass we will consider each input phrase in turn. If we
  // have seen it before, we will ignore it. Otherwise, we will expand
  // it and add its expansions to the set of output phrases.
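  //
  // For example (the phrases are hypothetical): if the vocabulary contains
  // the word "forest", pass 1 might emit the expansions "the forest" and
  // "forest fire"; pass 2 then reads those phrases back in and expands each
  // of them in turn, and so on until a pass produces no new output phrases.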

  // Store the phrase data in the phrases file
  char phraseDataName[FILENAME_MAX];
  sprintf(phraseDataName, "%s/phrases", collection);
  ofstream phraseData(phraseDataName, ios::out);
  if (!phraseData) {
    cout << "File " << phraseDataName << " could not be opened\n";
    exit(1);
  }
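
  // Each record written to phraseData below has the colon-separated form
  // (the values in this example are purely illustrative):
  //   index-length:phrase text:frequency:numExpansions:exp1,exp2,...:df:d1,f1;d2;...
  // e.g. "1042-2:digital library:7:3:1042-3,2210-3,3307-3:3:d1,4;d7;d9,2"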

  // Count the number of phrases output
  unsigned long int phraseCounter = 0;

  // Set up the phrase expansion memory.
  // We need this so that we don't expand a phrase more than once
  initialisePhraseMemory();

  // The current pass number
  int phrasePass = 1;


  // PASS NUMBER 1
  if (verbosity > 1) {
    cout << "Starting pass " << phrasePass << endl;
  }

  // We need an input file, for phrases we are about to examine, and an
  // output file, for phrases still to come.
  ifstream inPhrase;
  char inPhraseName[FILENAME_MAX];
  ofstream outPhrase;
  char outPhraseName[FILENAME_MAX];
  unsigned long int outPhraseCounter = 0;

  // On the first pass, simply work through the vocabulary
  sprintf(outPhraseName, "%s/outPhrase.1", collection);
  outPhrase.open(outPhraseName, ios::out);
  if (!outPhrase) {
    cerr << "File " << outPhraseName << " could not be opened\n";
    exit(1);
  }

  // Iterate over the different symbols by working through the suffix array
  vector<Phrase> result;
  cellindex i = 0;
  char *tmpString;

  while (i < inputLength) {

    // make a new phrase of length 1
    p = Phrase(suffixArray[i], 1, SUFFIX);
    p.findFirstAndLastSuffix(i, inputLength-1);

    // cout << "cell " << i << " - " << p.toString() << endl;

    // We ignore this symbol if it occurs only once, if it is a delimiter,
    // or if we are in stopwords mode and it is a stopword
    //
    // We could imagine a new mode/command-line option, which is like
    // STOPWORDS but without this restriction. This would let you browse
    // from "the" to "the AGRIS" for example, but not from "AGRIS" to
    // "the AGRIS" (where "the" is a stopword and "AGRIS" a content word).
    // The system used to work like this; it is easy to implement, but
    // it explodes the size of the indexes. So: would it be useful?
    if (!((p.suffixFrequency <= 1) ||
          // (*suffixArray[i] != 23054) ||
          (*suffixArray[i] <= LASTDELIMITER) ||
          ((phraseMode == STOPWORDS) && (*suffixArray[i] <= lastStopSymbol)))) {

      // Get minimal expansions of the phrase
      getMinimalExpansions(p, result);

      if (!result.empty()) {

        // Remember that we have expanded this phrase
        rememberThisPhrase(i, 1);

        // write the phrase text
        tmpString = p.toString();
        phraseData << i << "-1:" << tmpString << ":" << p.suffixFrequency << ":"
                   << result.size() << ":";
        delete [] tmpString;

        // write the results
        for (cellcount i = 0; i < result.size(); i++) {
          if (i) {
            phraseData << ",";
          }
          phraseData << result[i].firstSuffixIndex << "-" << result[i].length;
          outPhrase << result[i].firstSuffixIndex << " " << result[i].length << endl;
          outPhraseCounter++;
        }
        result.clear();

        // Write the documents in which this phrase occurs
        df = getDocumentOccurrances(p, documentFrequency);
        phraseData << ":" << df << ":";

        // write the documents
        for (cellcount i = 0, first = 1; i < numberOfDocuments; i++) {
          if (documentFrequency[i]) {
            if (first) {
              first = 0;
            } else {
              phraseData << ";";
            }
            // Output the document number. Note that here we've numbered the
            // N documents from 0 to N-1, but later they'll be 1-N. Thus we
            // add 1 to the document id when we output it.
            phraseData << "d" << (i+1);
            // Next, output the frequency with which the document occurs, but
            // only if it is > 1.
            if (documentFrequency[i] > 1) {
              phraseData << "," << documentFrequency[i];
            }
          }
        }

        phraseData << endl;
        phraseCounter++;

        // feedback
        if (verbosity) {
          if (phraseCounter % 1000 == 0) {
            tmpString = p.toString();
            cout << "phrase " << phraseCounter << ": "
                 << "cell " << p.firstSuffixIndex << " - " << tmpString << endl;
            delete [] tmpString;
          }
        }
      }
    }
    i = p.lastSuffixIndex + 1;
  }
  outPhrase.close();

  // REMAINING PASSES
  // The previous outPhrase file forms the input to each new pass
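  // (each line of an outPhrase.N file is a "firstSuffixIndex length" pair,
  // exactly as written by the pass before it; e.g., hypothetically, "1042 3")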
  cellcount start, length;
  while (outPhraseCounter > 0) {

    // Start a new pass
    phrasePass++;
    if (verbosity) {
      cout << "Starting pass " << phrasePass << endl;
    }

    // Open the input file
    sprintf(inPhraseName, "%s/outPhrase.%d", collection, phrasePass - 1);
    inPhrase.open(inPhraseName, ios::in);
    if (!inPhrase) {
      cerr << "File " << inPhraseName << " could not be opened\n";
      exit(1);
    }

    // Open the output file
    sprintf(outPhraseName, "%s/outPhrase.%d", collection, phrasePass);
    outPhrase.open(outPhraseName, ios::out);
    if (!outPhrase) {
      cerr << "File " << outPhraseName << " could not be opened\n";
      exit(1);
    }
    outPhraseCounter = 0;

    // Process each phrase
    while (inPhrase >> start >> length) {

      // Ignore the phrase if we have expanded it before
      if (isPhraseStored(start, length)) {
        continue;
      }

      // Remember that we have examined this phrase
      rememberThisPhrase(start, length);

      // Find the phrase in the suffix array
      p = Phrase(suffixArray[start], length, SUFFIX);
      p.findFirstAndLastSuffix(start, inputLength-1);

      // cout << "index " << start << ", length " << length << " - " << p.toString() << endl;


      // Ignore the phrase if it only occurs once
      if (p.suffixFrequency < 2) {
        continue;
      }


      // Write the phrase text
      tmpString = p.toString();
      phraseData << start << "-" << length << ":" << tmpString << ":"
                 << p.suffixFrequency << ":";
      delete [] tmpString;


      // Expand the phrase, if it is at most 8 words long
      if (length <= 8) {

        // Get the minimal expansions for this phrase
        getMinimalExpansions(p, result);

        // write the results
        phraseData << result.size() << ":";

        for (cellcount i = 0; i < result.size(); i++) {
          if (i) {
            phraseData << ",";
          }
          phraseData << result[i].firstSuffixIndex << "-" << result[i].length;
          outPhrase << result[i].firstSuffixIndex << " " << result[i].length << endl;
          outPhraseCounter++;
        }
        result.clear();

      } else {
        // phrase is too long to expand further
        phraseData << "0:";
      }


      // Write the documents in which this phrase occurs
      df = getDocumentOccurrances(p, documentFrequency);
      phraseData << ":" << df << ":";

      // write the documents
      for (cellcount i = 0, first = 1; i < numberOfDocuments; i++) {
        if (documentFrequency[i]) {
          if (first) {
            first = 0;
          } else {
            phraseData << ";";
          }
          // Output the document number. Note that here we've numbered the
          // N documents from 0 to N-1, but later they'll be 1-N. Thus we
          // add 1 to the document id when we output it.
          phraseData << "d" << (i+1);
          // Next, output the frequency with which the document occurs, but
          // only if it is > 1.
          if (documentFrequency[i] > 1) {
            phraseData << "," << documentFrequency[i];
          }
        }
      }

      phraseData << endl;
      phraseCounter++;

      // feedback
      if (verbosity) {
        if (phraseCounter % 1000 == 0) {
          tmpString = p.toString();
          cout << "phrase " << phraseCounter << ": " << "start " << start
               << ", length " << length << " - " << tmpString << endl;
          delete [] tmpString;
        }
      }

    }

    inPhrase.close();
    outPhrase.close();
  }

  phraseData.close();
  deletePhraseMemory();

  delete [] symbols;
  delete [] suffixArray;
  delete [] prefixArray;
  delete [] suffixCheck;
  delete [] prefixCheck;
  delete [] documentArray;



  cout << endl << "Done: " << phraseCounter << " phrases in " << phraseDataName << endl;
  return 0;
}


// Get Minimal Expansions
//
// Get the set of "minimal" expansions of a phrase p, using the
// algorithm described in the documentation.
//
// The expansions are appended to the results vector.

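// For example (the phrases are hypothetical): expanding "digital" might
// produce the candidates "digital library" and "the digital"; a candidate is
// added to the results only if no previously accepted phrase is already
// recorded at its first suffix/prefix position (see suffixCheck/prefixCheck).
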
void getMinimalExpansions(Phrase &p, vector<Phrase> &results) {

  // 1. Initialise the result and candidate vectors
  vector<Phrase> candidates;
  for (cellcount j = 0; j < inputLength; j++) {
    suffixCheck[j] = 0;
    prefixCheck[j] = 0;
  }

  // 2. Expand the phrase p

  // 2.1 Create the candidate set
  p.initialSuffixCandidates(candidates);
  p.initialPrefixCandidates(candidates);

  // 2.2 Sort the candidates by phrase length
  make_heap(candidates.begin(), candidates.end(), isLonger);

  // 3. While candidates is non-empty, confirm the phrases it
  // contains, expanding them as required
  while (!candidates.empty()) {

    // 3.1 Get next candidate
    pop_heap(candidates.begin(), candidates.end(), isLonger);
    Phrase c = candidates.back();
    candidates.pop_back();

    // 3.2 If we know there are no unique right extensions
    // (i.e. this is a phrase drawn from the suffix array)
    if (!c.hasUniqueSuffixExtension()) {

      c.ensurePrefixFound();

      // 3.2.1 Ignore candidate if we have used a subphrase instead
      if (suffixCheck[c.firstSuffixIndex] || prefixCheck[c.firstPrefixIndex]) {
        // do nothing: an accepted phrase already covers this position
        // cout << "ignoring" << endl;
      }

      // 3.2.2 If candidate has a unique left (prefix) extension,
      // then extend it and add it back into candidates.
      else if (c.hasUniquePrefixExtension()) {
        // cout << "expanding prefix " << c.toString() << "=> ";
        c.expandUniquePrefixExtensionByOne();
        candidates.push_back(c);
        push_heap(candidates.begin(), candidates.end(), isLonger);
      }

      // 3.2.3 If candidate has no unique left (prefix) extension,
      // then add it to the list of results.
      else {
        // cout << "no unique prefix, add to results" << endl;
        results.push_back(c);
        for (cellcount i = c.firstSuffixIndex; i <= c.lastSuffixIndex; i++) {
          suffixCheck[i] = c.length;
        }
        for (cellcount i = c.firstPrefixIndex; i <= c.lastPrefixIndex; i++) {
          prefixCheck[i] = c.length;
        }
      }
    }

    // 3.3 If we know there are no unique left extensions,
    // then do the same as for 3.2 but exchange suffix & prefix
    else if (!c.hasUniquePrefixExtension()) {

      c.ensureSuffixFound();

      // 3.3.1 Ignore candidate if we have used a subphrase instead
      if (suffixCheck[c.firstSuffixIndex] || prefixCheck[c.firstPrefixIndex]) {
        // do nothing, as in 3.2.1
      }

      // 3.3.2
      else if (c.hasUniqueSuffixExtension()) {
        c.expandUniqueSuffixExtensionByOne();
        candidates.push_back(c);
        push_heap(candidates.begin(), candidates.end(), isLonger);
      }

      // 3.3.3
      else {
        results.push_back(c);
        for (cellcount i = c.firstSuffixIndex; i <= c.lastSuffixIndex; i++) {
          suffixCheck[i] = c.length;
        }
        for (cellcount i = c.firstPrefixIndex; i <= c.lastPrefixIndex; i++) {
          prefixCheck[i] = c.length;
        }

      }
    }
  }
}


// suffixCompare
//
// Compare two pointers into the suffix array. We use this in the
// qsort function, so the inputs are pointers to pointers.
//
// Return -1 if (a < b); otherwise (a > b), so return +1.

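// A small illustration (the symbol numbers are hypothetical): comparing the
// suffixes <5 17 9 ...> and <5 17 12 ...> skips the equal symbols 5 and 17,
// then returns -1 because 9 < 12.
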
int suffixCompare(const void *cpa, const void *cpb) {

  // The qsort arguments point at suffix array cells, which are themselves
  // pointers into the symbols array; dereference once to get those pointers
  symbol *pa = *(symbol **) cpa;
  symbol *pb = *(symbol **) cpb;

  // If the two symbols are the same, examine the next one
  while (*pa == *pb) {
    pa++;
    pb++;
  }

  // Make the comparison and return
  if (*pa < *pb) {
    return -1;
  } else {
    return +1;
  }
}


// prefixCompare
//
// Compare two pointers into the prefix array. We use this in the
// qsort function, so the inputs are pointers to pointers.
//
// Return -1 if (a > b); otherwise (a < b), so return +1.

int prefixCompare(const void *cpa, const void *cpb) {

  // The qsort arguments point at prefix array cells, which are themselves
  // pointers into the symbols array; dereference once to get those pointers
  symbol *pa = *(symbol **) cpa;
  symbol *pb = *(symbol **) cpb;

  // If the two symbols are the same, examine the previous one
  while (*pa == *pb) {
    pa--;
    pb--;
  }

  // Make the comparison and return
  if (*pa > *pb) {
    return -1;
  } else {
    return +1;
  }
}

// pointerCompare
//
// Compare two pointers based on the memory location they point to.

int pointerCompare(const void *pa, const void *pb) {

  symbol **a = (symbol **) pa;
  symbol **b = (symbol **) pb;

  if (*a < *b) {
    return -1;
  } else if (*a > *b) {
    return 1;
  } else {
    return 0;
  }
}


// Read the clauses.numbers file into the "symbols" array.
//
// Each number in the file is a symbol number; it is essential that
// the first symbol (and no others) be COLLECTIONSTART and the last
// symbol (and no others) be COLLECTIONEND.
//
// Return the number of numbers in the array.

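// The file itself is just whitespace-separated symbol numbers; schematically
// (with names standing in for their numeric codes), a two-document collection
// would look like:
//   COLLECTIONSTART DOCUMENTSTART w1 w2 ... DOCUMENTSTART w3 w4 ... COLLECTIONEND
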
int readNumbers() {

  char filename[FILENAME_MAX];
  sprintf(filename, "%s/clauses.numbers", collection);
  if (verbosity) {
    cout << "Reading numbers file: " << filename << endl;
  }

  // Open the numbers file
  ifstream inFile(filename, ios::in);
  if (!inFile) {
    cerr << "File " << filename << " could not be opened\n";
    exit(1);
  }

  // Count the number of symbols
  inputLength = 0;
  symbol word;
  while (inFile >> word) {
    inputLength++;
  }
  inFile.close();

  // Allocate the symbols array
  if (verbosity > 1) {
    cout << "Allocating symbol arrays for " << inputLength << " symbols" << endl;
  }
  symbols = new symbol[inputLength];
  if (symbols == NULL) {
    cerr << "Suffix error: not enough memory to hold " << inputLength
         << " symbols." << endl;
    exit(2);
  }

  // Read the numbers file into the symbols array
  if (verbosity > 2) {
    cout << "Reading the numbers" << endl;
  }
  inFile.open(filename, ios::in);
  cellcount next = 0;
  numberOfDocuments = 0;
  while (inFile >> word) {
    symbols[next++] = word;
    if (word == DOCUMENTSTART) {
      numberOfDocuments++;
    }
  }
  inFile.close();

  // Make sure the numbers file is intact
  assert(symbols[0] == COLLECTIONSTART);
  assert(symbols[next-1] == COLLECTIONEND);

  return inputLength;
}



// Get Document Occurrence statistics
//
// Given a phrase, what documents does it occur in?

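// Illustration (the addresses are hypothetical): with four documents whose
// DOCUMENTSTART symbols sit at addresses 100, 300, 520 and 900 in the sorted
// documentArray, an occurrence at address 610 is located by binary chop:
// documentArray[1] = 300 is too low, then documentArray[2] = 520 satisfies
// 520 < 610 < 900, so the occurrence is counted against document 2
// (written out later as "d3").
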
cellcount getDocumentOccurrances(Phrase &p, cellcount *frequency) {

  // cout << "searching for \"" << p.toString() << "\" in documents "
  //      << 0 << "-" << numberOfDocuments - 1 << endl;

  // The number of documents in which this phrase occurs
  cellcount df = 0;

  // Initialise the document frequency array
  for (cellindex i = 0; i < numberOfDocuments; i++) {
    frequency[i] = 0;
  }

  // variables used to facilitate the search
  cellindex begin;
  cellindex end;
  cellindex d;
  symbol *target;
  bool found;

  // search for the document in which each occurrence of the phrase is found
  for (cellcount i = p.firstSuffixIndex; i <= p.lastSuffixIndex; i++) {

    // cout << "looking for phrase at suffixArray[" << i << "]\n";

    target = suffixArray[i];
    begin = 0;
    end = numberOfDocuments - 1;
    found = false;

    // Search for the occurrence of a document delimiter that target
    // occurs immediately after.
    // We do this by performing a binary chop search on documentArray.
    while (!found) {

      // cout << "searching for " << (cellindex) target << " in "
      //      << begin << " - " << end << endl;

      assert (begin <= end);

      // If the beginning and end of the interval are the same,
      // then we've found the correct document
      if (begin == end) {
        if (frequency[begin] == 0) {
          df++;
        }
        frequency[begin]++;
        found = true;
      }

      // Otherwise, examine a new document midway through the begin-end
      // interval and see if it is the one.
      else {
        d = (begin + end) / 2;
        if (target > documentArray[d]) {
          // If the target address is greater than this one, but this is the
          // last document, then this must be the one we want. Or, if target
          // is greater than this one but less than the next, this must be
          // the one we want.
          if ((d == numberOfDocuments - 1) || (target < documentArray[d+1])) {
            if (frequency[d] == 0) {
              df++;
            }
            frequency[d]++;
            found = true;
          } else {
            // otherwise we know to search later in the document set
            begin = d + 1;
          }
        } else {
          // search earlier in the document set
          end = d - 1;
        }
      }
    }
  }
  return df;
}






// phraseExpansionMemory : Which phrases have we expanded?
//
// A set of utilities for keeping track of which phrases we have expanded.
// We don't want to expand a phrase more than once, after all.
//
// This REALLY ought to be in its own class, but it works so that's okay.
//
// Phrases are identified by their firstSuffixIndex and length.
//
// Functions provided are:
//   void initialisePhraseMemory()
//   void rememberThisPhrase(index, length)
//   bool isPhraseStored(index, length)
//   void deletePhraseMemory()
//
// Internally, we will have two separate cases:
//
// Phrases of length 1-8:
//   unsigned char phraseMemory[inputLength]
// is an array where each cell "remembers" the corresponding index in the
// suffixArray, and each of the 8 bits of the cell corresponds to the phrases
// of length 1, 2, ... 8.
// Eventually, we will make this disk-based (i.e. store the array in a file).
//
// Phrases of length 9+:
//   file hashTableFile
//   file listOfEntries
// The first file is a hash table; each phrase maps to one of its cells, which
// contains either 0 (empty, no occurrence) or a number which is an entry number
// in the second file. The second file contains a "list" of entries. Each
// consists of three numbers: the entry number of the next phrase with the same
// hash, the suffixArray index of the phrase, and the length of the phrase.
//

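// A short worked example (all numbers hypothetical): remembering the phrase
// at suffixArray index 42 with length 3 sets bit 2 (value 4) of
// phraseMemory[42]; remembering index 42 with length 12 instead hashes
// (42 + 12) modulo bigPrime to choose a hashTableFile cell, and appends a
// (next, index, length) record to listOfEntries if one is not already there.
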
unsigned char *phraseMemory;

void initialiseLongPhraseMemory();
void rememberThisLongPhrase(cellindex index, cellcount length);
bool isLongPhraseStored(cellindex index, cellcount length);
void deleteLongPhraseMemory();


void initialisePhraseMemory() {

  phraseMemory = new unsigned char[inputLength];

  // to begin with, everything is empty
  for (cellcount i = 0; i < inputLength; i++) {
    phraseMemory[i] = 0;
  }

  // initialise the hashTable of long phrases
  initialiseLongPhraseMemory();

}

void rememberThisPhrase(cellindex index, cellcount length) {

  // if the phrase is very long, use the file-based system
  if (length > 8) {
    rememberThisLongPhrase(index, length);
    return;
  }

  // create a char with just the bit corresponding to length set
  // (e.g. length 1 -> bit 0, length 8 -> bit 7)
  unsigned char newbit = 1;
  for (cellcount i = 1; i < length; i++) {
    newbit <<= 1;
  }

  // set that bit in the memory array at position index
  phraseMemory[index] |= newbit;
}


bool isPhraseStored(cellindex index, cellcount length) {

  // if the phrase is very long, use the file-based system
  if (length > 8) {
    return isLongPhraseStored(index, length);
  }

  // create a char with just the bit corresponding to length set
  unsigned char newbit = 1;
  for (cellcount i = 1; i < length; i++) {
    newbit <<= 1;
  }

  // return true if that bit is set in the memory array at position index
  return (phraseMemory[index] & newbit);
}

void deletePhraseMemory() {
  delete [] phraseMemory;
  deleteLongPhraseMemory();
}



// Files etc used to store "long" equivalents of the above

fstream hashTableFile;
char hashTableFileName[FILENAME_MAX];
fstream listOfEntries;
char listOfEntriesName[FILENAME_MAX];
cellindex nextEntryNumber;

const cellcount bigPrime = 7919;


void initialiseLongPhraseMemory() {

  cellindex example = 0;

  sprintf(hashTableFileName, "%s/hashTable", collection);
  sprintf(listOfEntriesName, "%s/hashLists", collection);


  // create the new hashtable
  if (verbosity > 1) {
    cout << "Initialising hashTable: " << hashTableFileName << endl;
  }
  hashTableFile.open(hashTableFileName, ios::in | ios::out);
  for (cellcount i = 0; i < bigPrime; i++) {
    hashTableFile.write((char *) &example, sizeof(example));
  }

  // create the list of phrases
  if (verbosity > 1) {
    cout << "Initialising list of hashtable entries: " << listOfEntriesName << endl;
  }
  listOfEntries.open(listOfEntriesName, ios::in | ios::out);
  listOfEntries.write((char *) &example, sizeof(example));
  listOfEntries.write((char *) &example, sizeof(example));
  listOfEntries.write((char *) &example, sizeof(example));
  nextEntryNumber = 1;
}


void rememberThisLongPhrase(cellindex index, cellcount length) {

  // cout << "rememberThisLongPhrase(" << index << ", " << length << ")\n";

  cellindex hashOffset = ((index + length) % bigPrime) * sizeof(cellindex);
  cellindex pointer;
  cellindex zero = 0;
  cellindex readp = 0;
  cellindex readi = 0;
  cellindex readl = 0;

  hashTableFile.seekg(hashOffset);
  hashTableFile.read((char *) &pointer, sizeof(cellindex));

  if (pointer == 0) {
    // There is no entry at all in the hash table for this phrase,
    // so create one

    pointer = nextEntryNumber++;
    hashTableFile.seekg(hashOffset);
    hashTableFile.write((char *) &pointer, sizeof(cellindex));

    listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
    listOfEntries.write((char *) &zero, sizeof(cellindex));
    listOfEntries.write((char *) &index, sizeof(cellindex));
    listOfEntries.write((char *) &length, sizeof(cellindex));

  } else {
    // There is a list starting at this hash value, so the phrase may
    // be already remembered, or it might need to be appended

    while (pointer != 0) {
      // Read the entry pointed to by pointer
      listOfEntries.seekg(pointer * sizeof(cellindex) * 3);
      listOfEntries.read((char *) &readp, sizeof(cellindex));
      listOfEntries.read((char *) &readi, sizeof(cellindex));
      listOfEntries.read((char *) &readl, sizeof(cellindex));

      // cout << "read " << pointer << ", " << readp << ", " << readi << ", " << readl << endl;

      if ((readi == index) && (readl == length)) {
        // we've found that we've already stored it
        return;
      } else if (readp == 0) {
        // we've reached the end of the list. Add a new entry.
        listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
        listOfEntries.write((char *) &nextEntryNumber, sizeof(cellindex));
        pointer = nextEntryNumber++;

        listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
        listOfEntries.write((char *) &zero, sizeof(cellindex));
        listOfEntries.write((char *) &index, sizeof(cellindex));
        listOfEntries.write((char *) &length, sizeof(cellindex));
        return;
      } else {
        // go on to the next node
        pointer = readp;
      }
    }
  }


}

bool isLongPhraseStored(cellindex index, cellcount length) {

  // cout << "isLongPhraseExpanded(" << index << ", " << length << ")\n";

  cellindex hashOffset = ((index + length) % bigPrime) * sizeof(cellindex);
  cellindex pointer;
  cellindex readp = 0;
  cellindex readi = 0;
  cellindex readl = 0;

  // Find the phrase in the hashFile
  hashTableFile.seekg(hashOffset);
  hashTableFile.read((char *) &pointer, sizeof(cellindex));

  if (pointer == 0) {
    // There is no entry at all in the hash table for this phrase,
    // so nothing is stored
    return false;

  } else {
    // There is a list starting at this hash value, so the phrase may
    // be already remembered in that list
    while (pointer != 0) {
      // Read the entry pointed to by pointer
      listOfEntries.seekg(pointer * sizeof(cellindex) * 3);
      listOfEntries.read((char *) &readp, sizeof(cellindex));
      listOfEntries.read((char *) &readi, sizeof(cellindex));
      listOfEntries.read((char *) &readl, sizeof(cellindex));

      if ((readi == index) && (readl == length)) {
        // we've found the phrase stored here
        return true;
      } else {
        // go on to the next node
        pointer = readp;
      }
    }
  }
  return false;
}

void deleteLongPhraseMemory() {
  // remove the hash & other files

  hashTableFile.close();
  listOfEntries.close();
  remove(hashTableFileName);
  remove(listOfEntriesName);

}




// Read the collection statistics file
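//
// The clauses.stats file holds whitespace-separated "key value" pairs, one
// per line; for example (the values here are hypothetical):
//   first_stopword 10
//   last_stopword 509
//   first_contentword 510
//   last_contentword 36721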
void readStatistics() {

  // construct the statistics file name
  char filename[FILENAME_MAX];
  sprintf(filename, "%s/clauses.stats", collection);

  // Open the file
  ifstream inFile(filename, ios::in);
  if (!inFile) {
    cerr << "File " << filename << " could not be opened\n";
    exit(1);
  }

  // Read the key-value pairs into the symbol-range variables
  char key[1000];
  symbol value;
  while (inFile >> key >> value) {
    if (strcmp(key, "first_stopword") == 0) {
      firstStopSymbol = value;
    } else if (strcmp(key, "last_stopword") == 0) {
      lastStopSymbol = value;
    } else if (strcmp(key, "first_contentword") == 0) {
      firstContentSymbol = value;
    } else if (strcmp(key, "last_contentword") == 0) {
      lastContentSymbol = value;
    }
  }
  inFile.close();

  // Make sure we have the information we need
  if (!(firstStopSymbol && lastStopSymbol && firstContentSymbol && lastContentSymbol)) {
    cerr << "Statistics file incomplete" << endl;
    exit(1);
  }
}


