source: trunk/gsdl/src/phind/generate/suffix.cpp@ 1683

Last change on this file since 1683 was 1683, checked in by paynter, 24 years ago

Bug fix - all documents returned for phrases of more than one word were
being stored with an out-by-one error. The problem is that internally
documents are numbered 0 to N-1, but in MGPP they're numbered 1 to N. I
thought I'd fixed this, but had only updated the code for the
phrases-of-length-1 case.

  • Property svn:keywords set to Author Date Id Revision
File size: 30.7 KB
1/**********************************************************************
2 *
3 * suffix.h -- Extract the repeated phrases in the input using
4 * suffix and prefix arrays.
5 *
6 * Copyright 2000 Gordon W. Paynter ([email protected])
7 * Copyright 2000 The New Zealand Digital Library Project
8 *
9 * A component of the Greenstone digital library software
10 * from the New Zealand Digital Library Project at the
11 * University of Waikato, New Zealand.
12 *
13 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version 2 of the License, or
16 * (at your option) any later version.
17 *
18 * This program is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU General Public License for more details.
22 *
23 * You should have received a copy of the GNU General Public License
24 * along with this program; if not, write to the Free Software
25 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26 *
27 *********************************************************************/
28
29#include <assert.h>
30#include <fstream.h>
31#include <iostream.h>
32#include <math.h>
33#include <stdio.h>
34#include <stdlib.h>
35#include <string.h>
36
37#include <algo.h>
38#include <heap.h>
39#include <vector.h>
40
41#include "suffix.h"
42#include "phrase.h"
43
44
45
46// Global variables declared in suffix.h
47cellcount symbol_array_length;
48cellcount inputLength;
49
50symbol *symbols;
51symbol **suffixArray;
52check *suffixCheck;
53symbol **prefixArray;
54check *prefixCheck;
55
56
57// How many documents are in this collection?
58cellcount numberOfDocuments;
59symbol **documentArray;
60
61 // Do we accept any phrase, or do we eliminate those beginning or ending with stopwords?
62int phraseMode = ANYPHRASE; //STOPWORDS;
63
64// The filestem of the collection's phindex directory
65char collection[FILENAME_MAX];
66
67int suffixCompare(const void *, const void *);
68int prefixCompare(const void *, const void *);
69int pointerCompare(const void *, const void *);
70
71int readNumbers(symbol *numbers);
72void readStatistics();
73
74void getMinimalExpansions(Phrase &p, vector<Phrase> &results);
75cellcount getDocumentOccurrances(Phrase &p, cellcount *frequency);
76
77// The ranges of the stopword and content-word symbols for the collection
78symbol firstStopSymbol = 0;
79symbol lastStopSymbol = 0;
80symbol firstContentSymbol = 0;
81symbol lastContentSymbol = 0;
82
83
84
85
86// Phrase memory
87// We have to "remember" each phrase that we've expanded
88void initialisePhraseMemory();
89void rememberThisPhrase(cellindex index, cellcount length);
90bool isPhraseStored(cellindex index, cellcount length);
91void deletePhraseMemory();
92
93
94// how much output do we want?
95int verbosity = 1;
96
97
98int main (int argc, char * argv[]) {
99
100 // Command-line arguments
101 // argv[1] is the phindex directory
102 // argv[2] is the maximum array symbol length (optional)
103 // argv[3] is the mode, where 1 is stopword mode (optional)
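 // Illustrative invocation (the path and sizes here are made up):
 //     suffix /greenstone/collect/demo/phindex 10000000 1
 // reads that phindex directory, allows up to ten million symbols, and runs
 // in stopword mode (this assumes STOPWORDS is defined as 1 in suffix.h).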
104 if (argc < 2) {
105 cerr << "Usage: " << argv[0] << " collection-directory [max-array-size [mode]]" << endl;
106 exit(1);
107 }
108
109 // collection directory
110 strcpy(collection, argv[1]);
111
112 // Symbol length parameter
113 if (argc >= 3) {
114 symbol_array_length = atol(argv[2]);
115 assert(symbol_array_length);
116 } else {
117 symbol_array_length = 1000;
118 }
119
120 // Stopword mode parameter
121 if (argc == 4) {
122 phraseMode = atoi(argv[3]);
123 assert(phraseMode == STOPWORDS);
124 }
125 if (verbosity) {
126 if (phraseMode == STOPWORDS) {
127 cout << "STOPWORDS mode: no phrase may begin or end with a stopword" << endl;
128 } else {
129 cout << "ALLPHRASE mode: extract every phrase that occurs more than once" << endl;
130 }
131 }
132
133 // Read the statistics file
134 readStatistics();
135
136 // Read the numbers file
137 symbols = new (symbol)[symbol_array_length];
138 readNumbers(symbols);
139
140
141 // Create the suffix & prefix arrays
142 if (verbosity) {
143 cout << "Create suffix & prefix arrays for " << inputLength << " symbols" << endl;
144 }
145 suffixArray = new (symbol *)[inputLength];
146 prefixArray = new (symbol *)[inputLength];
147 suffixCheck = new (check)[inputLength];
148 prefixCheck = new (check)[inputLength];
149 for (cellcount j = 0; j < inputLength; j++) {
150 suffixArray[j] = &symbols[j];
151 prefixArray[j] = &symbols[j];
152 }
153 qsort(suffixArray, inputLength, sizeof(symbol *), suffixCompare);
154 qsort(prefixArray, inputLength, sizeof(symbol *), prefixCompare);
155
156
157 // Create the document arrays
158 if (verbosity) {
159 cout << "Create document arrays for " << numberOfDocuments << " documents" << endl;
160 }
161 if (numberOfDocuments == 0) {
162 cerr << "There are no documents in this collection!" << endl;
163 exit(1);
164 }
165
166 // The document frequency array is used to count the number of times
167 // each phrase occurs in each document. The number of documents in
168 // which a phrase occurs is stored in df.
169 frequency documentFrequency[numberOfDocuments];
170 frequency df;
171
172 // documentArray will be searched in order to discover which document
173 // each phrase occurs in.
174 documentArray = new (symbol *)[numberOfDocuments];
175
176 // Discover all the DOCUMENTSTART symbols and store as a phrase
177 cellindex d = 0;
178 while (*suffixArray[d] != DOCUMENTSTART) {
179 d++;
180 }
181 Phrase p(suffixArray[d], 1, SUFFIX);
182 p.findFirstAndLastSuffix(d, inputLength-1);
183
184 // Insert the document locations (as pointers) into documentArray
185 for (cellcount i = 0; i < p.suffixFrequency; i++) {
186 documentArray[i] = suffixArray[i + p.firstSuffixIndex];
187 }
188
189 // Sort the document array into ascending order of raw pointer value
190 qsort(documentArray, numberOfDocuments, sizeof(symbol *), pointerCompare);
191
192
193 // Extract phrases
194 //
195 // We will make several passes over the data, in each case considering
196 // a set of input phrases and generating a set of output phrases, which
197 // we will expand in later passes.
198 //
199 // The input phrases in the first pass will be the vocabulary.
200 // In later passes, the input phrases will be the output phrases of the
201 // previous pass.
202 //
203 // In each pass we will consider each input phrase in turn. If we
204 // have seen it before, we will ignore it. Otherwise, we will expand
205 // it and add its expansions to the set of output phrases.
206
207 // Store the phrase data in the phrases file
208 char phraseDataName[FILENAME_MAX];
209 sprintf(phraseDataName, "%s/phrases", collection);
210 ofstream phraseData(phraseDataName, ios::out);
211 if (!phraseData) {
212 cout << "File " << phraseDataName << " could not be opened\n";
213 exit(1);
214 }
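 // Each line written to the phrases file has the form sketched below; the
 // format is reconstructed from the output statements in this file and the
 // field values are invented for illustration:
 //     4501-1:forest:7:3:4501-2,4503-2,4506-3:5:d2;d7,3;d9;d12;d15
 // i.e. suffix-array index and phrase length, phrase text, suffix frequency,
 // number of expansions, the expansions (index-length pairs), document
 // frequency, and the documents themselves (with a per-document count only
 // when it is greater than 1).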
215
216 // Count the number of phrases output
217 unsigned long int phraseCounter = 0;
218
219 // Set up the phrase expansion memory.
220 // We need this so that we don't expand a phrase more than once
221 initialisePhraseMemory();
222
223 // The current pass number
224 int phrasePass = 1;
225
226
227 // PASS NUMBER 1
228 if (verbosity) {
229 cout << endl<< "suffix: starting pass " << phrasePass << endl;
230 }
231
232 // We need an input file, for phrases we are about to examine, and an
233 // output file, for phrases still to come.
234 ifstream inPhrase;
235 char inPhraseName[FILENAME_MAX];
236 ofstream outPhrase;
237 char outPhraseName[FILENAME_MAX];
238 unsigned long int outPhraseCounter = 0;
239
240 // On the first pass, simply work through the vocabulary
241 sprintf(outPhraseName, "%s/outPhrase.1", collection);
242 outPhrase.open(outPhraseName, ios::out);
243 if (!outPhrase) {
244 cerr << "File " << outPhraseName << " could not be opened\n";
245 exit(1);
246 }
247
248 // Iterate over the different symbols by working through the suffix array
249 vector<Phrase> result;
250 cellindex i = 0;
251 char *tmpString;
252
253 while (i < inputLength) {
254
255 // make a new phrase of length 1
256 p = Phrase(suffixArray[i], 1, SUFFIX);
257 p.findFirstAndLastSuffix(i, inputLength-1);
258
259 // cout << "cell " << i << " - " << p.toString() << endl;
260
261 // We ignore this symbol if it occurs only once, if it is a delimiter,
262 // or if we are in stopwords mode and it is a stopword
263 //
264 // We could imagine a new mode/command-line option, which is like
265 // STOPWORDS but without this restriction. This would let you browse
266 // from "the" to "the AGRIS" for example, but not from "AGRIS" to
267 // "the AGRIS" (where the is a stopword and AGRIS a content word).
268 // The system used to work like this; it is easy to implement, but
269 // it explodes the size of the indexes. So: would it be useful?
270 if (!((p.suffixFrequency <= 1) ||
271 // (*suffixArray[i] != 23054) ||
272 (*suffixArray[i] <= LASTDELIMITER) ||
273 ((phraseMode == STOPWORDS) && (*suffixArray[i] <= lastStopSymbol)))) {
274
275 // Get minimal expansions of the phrase
276 getMinimalExpansions(p, result);
277
278 if (!result.empty()) {
279
280 // Remember that we have expanded this phrase
281 rememberThisPhrase(i, 1);
282
283 // write the phrase text
284 tmpString = p.toString();
285 phraseData << i << "-1:" << tmpString << ":" << p.suffixFrequency << ":"
286 << result.size() << ":";
287 delete [] tmpString;
288
289 // write the results
290 for (cellcount i = 0; i < result.size(); i++) {
291 if (i) {
292 phraseData << ",";
293 }
294 phraseData << result[i].firstSuffixIndex << "-" << result[i].length;
295 outPhrase << result[i].firstSuffixIndex << " " << result[i].length << endl;
296 outPhraseCounter++;
297 }
298 result.clear();
299
300 // Write the documents in which this phrase occurs
301 df = getDocumentOccurrances(p, documentFrequency);
302 phraseData << ":" << df << ":";
303
304 // write the documents
305 for (cellcount i = 0, first = 1; i < numberOfDocuments; i++) {
306 if (documentFrequency[i]) {
307 if (first) {
308 first = 0;
309 } else {
310 phraseData << ";";
311 }
312 // Output the document number. Note that here we've numbered the
313 // N documents from 0 to N-1, but later they'll be 1-N. Thus we
314 // add 1 to the document id when we output it.
315 phraseData << "d" << (i+1);
316 // Next, output the frequency with which the document occurs, but
317 // only if it is > 1.
318 if (documentFrequency[i] > 1) {
319 phraseData << "," << documentFrequency[i];
320 }
321 }
322 }
323
324 phraseData << endl;
325 phraseCounter++;
326
327 // feedback
328 if (verbosity) {
329 if (phraseCounter % 1000 == 0) {
330 tmpString = p.toString();
331 cout << "phrase " << phraseCounter << ": "
332 << "cell " << p.firstSuffixIndex << " - " << tmpString << endl;
333 delete [] tmpString;
334 }
335 }
336 }
337 }
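 // Skip ahead past this block of the suffix array: every cell in
 // [p.firstSuffixIndex, p.lastSuffixIndex] starts with the same symbol, so
 // the next distinct symbol begins at lastSuffixIndex + 1.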
338 i = p.lastSuffixIndex + 1;
339 }
340 outPhrase.close();
341
342 // REMAINING PASSES
343 // The previous outPhrase file forms the input to each new pass
344 cellcount start, length;
345 while (outPhraseCounter > 0) {
346
347 // Start a new pass
348 phrasePass++;
349 if (verbosity) {
350 cout << endl << "Starting pass " << phrasePass << endl;
351 }
352
353 // Open the input file
354 sprintf(inPhraseName, "%s/outPhrase.%d", collection, phrasePass - 1);
355 inPhrase.open(inPhraseName, ios::in);
356 if (!inPhrase) {
357 cerr << "File " << inPhraseName << " could not be opened\n";
358 exit(1);
359 }
360
361 // Open the output file
362 sprintf(outPhraseName, "%s/outPhrase.%d", collection, phrasePass);
363 outPhrase.open(outPhraseName, ios::out);
364 if (!outPhrase) {
365 cerr << "File " << outPhraseName << " could not be opened\n";
366 exit(1);
367 }
368 outPhraseCounter = 0;
369
370 // Process each phrase
371 while(inPhrase >> start >> length) {
372
373 // Ignore the phrase if we have expanded it before
374 if (isPhraseStored(start, length)) {
375 continue;
376 }
377
378 // Remember that we have examined this phrase
379 rememberThisPhrase(start, length);
380
381 // Find the phrase in the suffix array
382 p = Phrase(suffixArray[start], length, SUFFIX);
383 p.findFirstAndLastSuffix(start, inputLength-1);
384
385 // cout << "index " << start << ", length " << length << " - " << p.toString() << endl;
386
387
388 // Ignore the phrase if it only occurs once
389 if (p.suffixFrequency < 2) {
390 continue;
391 }
392
393
394 // Write the phrase text
395 tmpString = p.toString();
396 phraseData << start << "-" << length << ":" << tmpString << ":"
397 << p.suffixFrequency << ":";
398 delete [] tmpString;
399
400
401 // Expand the phrase, if it is no more than 8 words long
402 if (length <= 8) {
403
404 // Get the minimal expansions for this phrase
405 getMinimalExpansions(p, result);
406
407 // write the results
408 phraseData << result.size() << ":";
409
410 for (cellcount i = 0; i < result.size(); i++) {
411 if (i) {
412 phraseData << ",";
413 }
414 phraseData << result[i].firstSuffixIndex << "-" << result[i].length;
415 outPhrase << result[i].firstSuffixIndex << " " << result[i].length << endl;
416 outPhraseCounter++;
417 }
418 result.clear();
419
420 } else {
421 // phrase is too long to expand further
422 phraseData << "0:";
423 }
424
425
426 // Write the documents in which this phrase occurs
427 df = getDocumentOccurrances(p, documentFrequency);
428 phraseData << ":" << df << ":";
429
430 // write the documents
431 for (cellcount i = 0, first = 1; i < numberOfDocuments; i++) {
432 if (documentFrequency[i]) {
433 if (first) {
434 first = 0;
435 } else {
436 phraseData << ";";
437 }
438 // Output the document number. Note that here we've numbered the
439 // N documents from 0 to N-1, but later they'll be 1-N. Thus we
440 // add 1 to the document id when we output it.
441 phraseData << "d" << (i+1);
442 // Next, output the frequency with which the document occurs, but
443 // only if it is > 1.
444 if (documentFrequency[i] > 1) {
445 phraseData << "," << documentFrequency[i];
446 }
447 }
448 }
449
450 phraseData << endl;
451 phraseCounter++;
452
453 // feedback
454 if (verbosity) {
455 if (phraseCounter % 1000 == 0) {
456 tmpString = p.toString();
457 cout << "phrase " << phraseCounter << ": "<< "start " << start
458 << ", length " << length << " - " << tmpString << endl;
459 delete [] tmpString;
460 }
461 }
462
463 }
464
465 inPhrase.close();
466 outPhrase.close();
467 }
468
469 phraseData.close();
470 deletePhraseMemory();
471
472 delete [] symbols;
473 delete [] suffixArray;
474 delete [] prefixArray;
475 delete [] suffixCheck;
476 delete [] prefixCheck;
477 delete [] documentArray;
478
479
480
481 cout << endl << "Done: " << phraseCounter << " phrases in " << phraseDataName << endl;
482 return 0;
483}
484
485
486// Get Minimal Expansions
487//
488// Get the set of "minimal" expansions of a phrase p, using the
489// algorithm described in the documentation.
490//
491 // The expansions are appended to the results vector passed in by the caller.
492
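// In outline, the loop below keeps the candidate phrases in a heap ordered
// by length (using isShorter, defined in phrase.h). Each candidate popped
// from the heap is either (a) dropped because a phrase covering the same
// suffix/prefix range has already been accepted (recorded in suffixCheck and
// prefixCheck), (b) extended by one symbol while it still has a unique left
// or right extension and pushed back into the heap, or (c) accepted into
// results, at which point its suffix and prefix ranges are marked in the
// check arrays so that later candidates over the same occurrences are skipped.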
493void getMinimalExpansions(Phrase &p, vector<Phrase> &results) {
494
495 // 1. Initialise the result and candidate vectors
496 vector<Phrase> candidates;
497 for (cellcount j = 0; j < inputLength; j++) {
498 suffixCheck[j] = 0;
499 prefixCheck[j] = 0;
500 }
501
502 // 2. Expand the phrase p
503
504 // 2.1 Create the candidate set
505 p.initialSuffixCandidates(candidates);
506 p.initialPrefixCandidates(candidates);
507
508 // 2.2 Sort the candidates by phrase length
509 // sort(candidates.begin(), candidates.end(), isShorter);
510 make_heap(candidates.begin(), candidates.end(), isShorter);
511
512
513 // 3. While candidates is non-empty, confirm the phrases it
514 // contains, expanding them as required
515 while (!candidates.empty()) {
516
517 // 3.1 Get next candidate
518 //Phrase c = candidates.front();
519 //candidates.erase(candidates.begin());
520 pop_heap(candidates.begin(), candidates.end(), isShorter);
521 Phrase c = candidates.back();
522 candidates.pop_back();
523
524 // 3.2 If we know there are no unique right extensions
525 // (i.e. this is a phrase drawn from the suffix array)
526 if (!c.hasUniqueSuffixExtension()) {
527
528 c.ensurePrefixFound();
529
530 // 3.2.1 Ignore candidate if we have used a subphrase instead
531 if (suffixCheck[c.firstSuffixIndex] || prefixCheck[c.firstPrefixIndex]) {
532 // cout << "ignoring" << endl;
533 }
534
535 // 3.2.2 If candidate has a unique left (prefix) extension,
536 // Then extend it and add it back into Candidates.
537 else if (c.hasUniquePrefixExtension()) {
538 // cout << "expanding prefix " << c.toString() << "=> ";
539 c.expandUniquePrefixExtensionByOne();
540 //candidates.push_back(c);
541 //sort(candidates.begin(), candidates.end(), isShorter);
542 candidates.push_back(c);
543 push_heap(candidates.begin(), candidates.end(), isShorter);
544 }
545
546 // 3.2.3 If candidate has no unique left (prefix) extension,
547 // Then add it to the list of results.
548 else {
549 // cout << "no unique prefix, add to results" << endl;
550 results.push_back(c);
551 for (cellcount i = c.firstSuffixIndex; i <= c.lastSuffixIndex; i++) {
552 suffixCheck[i] = c.length;
553 }
554 for (cellcount i = c.firstPrefixIndex; i <= c.lastPrefixIndex; i++) {
555 prefixCheck[i] = c.length;
556 }
557 }
558 }
559
560 // 3.3 If we know there are no unique left extensions,
561 // Then do the same as for 3.2 but exchange suffix & prefix
562 else if (!c.hasUniquePrefixExtension()) {
563
564 c.ensureSuffixFound();
565
566 // 3.3.1
567 if (suffixCheck[c.firstSuffixIndex] || prefixCheck[c.firstPrefixIndex]) {
568
569 }
570
571 // 3.3.2
572 else if (c.hasUniqueSuffixExtension()) {
573 c.expandUniqueSuffixExtensionByOne();
574 //candidates.push_back(c);
575 //sort(candidates.begin(), candidates.end(), isShorter);
576 candidates.push_back(c);
577 push_heap(candidates.begin(), candidates.end(), isShorter);
578 }
579
580 // 3.3.3
581 else {
582 results.push_back(c);
583 for (cellcount i = c.firstSuffixIndex; i <= c.lastSuffixIndex; i++) {
584 suffixCheck[i] = c.length;
585 }
586 for (cellcount i = c.firstPrefixIndex; i <= c.lastPrefixIndex; i++) {
587 prefixCheck[i] = c.length;
588 }
589
590 }
591 }
592 }
593}
594
595
596// suffixCompare
597//
598// Compare two pointers into a suffix array. We use this in the
599 // qsort function, so the inputs are pointers to pointers.
600//
601 // Return -1 if (a < b); otherwise (a > b), so return +1.
602
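// Note on termination: because the COLLECTIONSTART and COLLECTIONEND symbols
// each occur exactly once (see readNumbers), two distinct suffixes (or
// prefixes) always differ at or before a collection delimiter, so the
// comparison loops below stop before walking off the ends of the array.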
603int suffixCompare(const void *cpa, const void *cpb) {
604
605 // Cast then dereference pointers to suffix array elements
606 symbol *pa = (symbol *) cpa;
607 symbol *pb = (symbol *) cpb;
608 pa = (symbol *) *pa;
609 pb = (symbol *) *pb;
610
611 // If the two elements are the same, examine the next one
612 while (*pa == *pb) {
613 pa++;
614 pb++;
615 }
616
617 // Make the comparison and return
618 if ( *pa < *pb) {
619 return -1;
620 } else {
621 return +1;
622 }
623}
624
625
626// prefixCompare
627//
628// Compare two pointers into a prefix array. We use this in the
629 // qsort function, so the inputs are pointers to pointers.
630//
631 // Return -1 if (a > b); otherwise (a < b), so return +1.
632
633int prefixCompare(const void *cpa, const void *cpb) {
634
635 // Cast then dereference pointers to prefix array elements
636 symbol *pa = (symbol *) cpa;
637 symbol *pb = (symbol *) cpb;
638 pa = (symbol *) *pa;
639 pb = (symbol *) *pb;
640
641 // If the two elements are the same, examine the next one
642 while (*pa == *pb) {
643 pa--;
644 pb--;
645 }
646
647 // Make the comparison and return
648 if ( *pa > *pb) {
649 return -1;
650 } else {
651 return +1;
652 }
653}
654
655// pointerCompare
656//
657// Compare two pointers based on the memory location they point to.
658
659int pointerCompare( const void *pa, const void *pb ) {
660
661 symbol **a = (symbol **) pa;
662 symbol **b = (symbol **) pb;
663
664 if (*a < *b) {
665 return -1;
666 } else if (*a > *b) {
667 return 1;
668 } else {
669 return 0;
670 }
671}
672
673
674// Read the numbers file into an array of symbols.
675//
676// Each number is a symbol number; it is essential that the first
677// symbol (and no others) be 0 and the last symbol (and no others)
678// be 1.
679//
680// Return the number of numbers in the array.
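// The file itself is a stream of whitespace-separated symbol numbers. A
// purely illustrative stream (using 3 as a stand-in for DOCUMENTSTART, whose
// real value comes from suffix.h) might look like:
//     0 3 57 902 13 3 44 902 13 1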
681
682int readNumbers(symbol *numbers) {
683
684 char filename[FILENAME_MAX];
685 sprintf(filename, "%s/clauses.numbers", collection);
686 if (verbosity) {
687 cout << "Reading numbers from: " << filename << endl;
688 }
689
690 // Open the numbers file
691 ifstream inFile(filename, ios::in);
692 if (!inFile) {
693 cerr << "File " << filename << " could not be opened\n";
694 exit(1);
695 }
696
697 // Read the numbers file into the numbers array
698 symbol word;
699 cellcount length = 0;
700 numberOfDocuments = 0;
701 while ((inFile >> word) && (length < symbol_array_length)){
702 numbers[length++] = word;
703 if (word == DOCUMENTSTART) {
704 numberOfDocuments++;
705 }
706 }
707
708 // Make sure we were able to read all the numbers
709 if (length >= symbol_array_length) {
710 cerr << "Error: the symbol array is too a short to hold " << filename
711 << endl << "It is currently set to " << symbol_array_length
712 << " and can be adjusted at the command line." << endl;
713 exit(1);
714 }
715
716 // Make sure the numbers file is intact
717 assert(numbers[0] == COLLECTIONSTART);
718 assert(numbers[length-1] == COLLECTIONEND);
719
720 // Record the length of the Input file
721 inputLength = length;
722
723 return length;
724}
725
726
727
728// Get Document Occurrence statistics
729//
730// Given a phrase, what documents does it occur in?
731
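// documentArray (filled in main) holds one pointer per document: the address
// of that document's DOCUMENTSTART symbol, sorted into ascending address
// order. A suffix therefore belongs to document d exactly when its address
// lies after documentArray[d] and before documentArray[d+1] (or after the
// last entry), which is the invariant the binary chop below relies on.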
732cellcount getDocumentOccurrances(Phrase &p, cellcount *frequency) {
733
734 // cout << "searching for \""<< p.toString() << "\" in documents "
735 // << 0 << "-" << numberOfDocuments - 1 << endl;
736
737 // The number of documents in which this phrase occurs
738 cellcount df = 0;
739
740 // Initialise the document frequency array
741 for (cellindex i = 0; i < numberOfDocuments; i++) {
742 frequency[i] = 0;
743 }
744
745 // variables used to facilitate the search
746 cellindex begin;
747 cellindex end;
748 cellindex d;
749 symbol *target;
750 bool found;
751
752 // search for the document in which each occurrence of the phrase is found
753 for (cellcount i = p.firstSuffixIndex; i <= p.lastSuffixIndex; i++) {
754
755 // cout << "looking for phrase at suffixArray[" << i << "]\n";
756
757 target = suffixArray[i];
758 begin = 0;
759 end = numberOfDocuments - 1;
760 found = false;
761
762 // Search for the occurrence of a document delimiter that target
763 // occurs immediately after.
764 // We do this by performing a binary chop search on documentArray.
765 while (!found) {
766
767 // cout << "searching for " << (cellindex) target << " in "
768 // << begin << " - " << end << endl;
769
770 assert (begin <= end);
771
772 // If the beginning and end of the interval are the same,
773 // then we've found the correct document
774 if (begin == end) {
775 if (frequency[begin] == 0) {
776 df++;
777 }
778 frequency[begin]++;
779 found = true;
780 }
781
782 // Otherwise, examine a new document midway through the begin-end
783 // interval and see if it is the one.
784 else {
785 d = (begin + end) / 2;
786 if (target > documentArray[d]) {
787 // If the target address is greater than this, but this is the last document,
788 // then this must be the one we want. Or, if target is greater than
789 // this one but less than the next, this must be the one we want.
790 if ((d == numberOfDocuments - 1) || (target < documentArray[d+1])) {
791 if (frequency[d] == 0) {
792 df++;
793 }
794 frequency[d]++;
795 found = true;
796 } else {
797 // otherwise we know to search later in the document set
798 begin = d + 1;
799 }
800 } else {
801 // search earlier in the document set
802 end = d - 1;
803 }
804 }
805 }
806 }
807 return df;
808}
809
810
811
812
813
814
815// phraseExpansionMemory : Which phrases have we expanded?
816//
817// A set of utilities for keeping track of which phrases we have expanded.
818// We don't want to expand a phrase more than once, after all.
819//
820// This REALLY ought to be in its own class, but it works so that's okay.
821//
822 // Phrases are identified by their first suffix-array index and their length.
823//
824// Functions provided are:
825// void initialisePhraseMemory()
826// void rememberThisPhrase(index, length)
827// bool isPhraseStored(index, length)
828// void deletePhraseMemory()
829//
830// Internally, we will have two separate cases:
831//
832// Phrases of length 1-8:
833// unsigned char phraseMemory[inputLength]
834// is an array where each cell "remembers" the corresponding index in the
835 // suffixArray, and each of the 8 bits of the cell corresponds to the phrases
836// of length 1, 2... 8.
837// Eventually, we will make this disk-based (i.e. store the array in a file).
838//
839// Phrases of length 9+:
840// file hashTableFile
841// file listOfEntries
842// The first file is a hash table; each phrase maps to one of its cells, which
843 // contains either 0 (empty, no occurrence) or a number which is an entry number
844// in the second file. This file contains a "list" of entries. Each consists of
845// three numbers: the suffixArray index of the phrase, the length of the phrase,
846// and the entry number of the next phrase with the same hash.
847//
848
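// Worked example of the short-phrase encoding: rememberThisPhrase(42, 3)
// sets bit 2 (the length-3 bit) of phraseMemory[42]; if the length-1 phrase
// at the same cell has also been remembered, the cell holds binary 00000101.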
849
850unsigned char *phraseMemory;
851
852void initialiseLongPhraseMemory();
853void rememberThisLongPhrase(cellindex index, cellcount length);
854bool isLongPhraseStored(cellindex index, cellcount length);
855void deleteLongPhraseMemory();
856
857
858void initialisePhraseMemory() {
859
860 phraseMemory = new (unsigned char)[inputLength];
861
862 // to begin with, everything is empty
863 for (cellcount i = 0; i < inputLength; i++) {
864 phraseMemory[i] = 0;
865 }
866
867 // initialise the hashTable of long phrases
868 initialiseLongPhraseMemory();
869
870}
871
872void rememberThisPhrase(cellindex index, cellcount length) {
873
874 // if the phrase is very long, use the file-based system
875 if (length > 8) {
876 rememberThisLongPhrase(index, length);
877 return;
878 }
879
880 // create a char with just the bit corresponding to length set
881 unsigned char newbit = 1;
882 for (cellcount i = 1; i < length; i++) {
883 newbit <<= 1;
884 }
885
886 // set that bit in the memory array at position index
887 phraseMemory[index] |= newbit;
888}
889
890
891bool isPhraseStored(cellindex index, cellcount length) {
892
893 // if the phrase is very long, use the file-based system
894 if (length > 8) {
895 return isLongPhraseStored(index, length);
896 }
897
898 // create a char with just the bit corresponding to length set
899 unsigned char newbit = 1;
900 for (cellcount i = 1; i < length; i++) {
901 newbit <<= 1;
902 }
903
904 // return true if that bit is set in the memory array at position index
905 return (phraseMemory[index] & newbit);
906}
907
908void deletePhraseMemory() {
909 delete [] phraseMemory;
910 deleteLongPhraseMemory();
911}
912
913
914
915// Files etc used to store "long" equavlents of the above
916
917fstream hashTableFile;
918char hashTableFileName[FILENAME_MAX];
919fstream listOfEntries;
920char listOfEntriesName[FILENAME_MAX];
921cellindex nextEntryNumber;
922
923const cellcount bigPrime = 7919;
924
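// On-disk layout: hashTableFile is an array of bigPrime cellindex slots,
// indexed by ((index + length) % bigPrime); a non-zero slot holds an entry
// number in listOfEntries. listOfEntries is an array of three-cellindex
// records -- next entry number, phrase index, phrase length -- and record 0
// is written as zeros so that 0 can serve as the null "pointer".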
925
926void initialiseLongPhraseMemory() {
927
928 cellindex example = 0;
929
930 sprintf(hashTableFileName, "%s/hashTable", collection);
931 sprintf(listOfEntriesName, "%s/hashLists", collection);
932
933
934 // create the new hashtable
935 if (verbosity > 1) {
936 cout << "Initialising hashTable: " << hashTableFileName << endl;
937 }
938 hashTableFile.open(hashTableFileName, ios::in | ios::out);
939 for (cellcount i = 0; i < bigPrime; i++) {
940 hashTableFile.write((char *) &example, sizeof(example));
941 }
942
943 // create the list of phrases
944 if (verbosity > 1) {
945 cout << "Initialising list of hashtable entries: " << listOfEntriesName << endl;
946 }
947 listOfEntries.open(listOfEntriesName, ios::in | ios::out);
948 listOfEntries.write((char *) &example, sizeof(example));
949 listOfEntries.write((char *) &example, sizeof(example));
950 listOfEntries.write((char *) &example, sizeof(example));
951 nextEntryNumber = 1;
952}
953
954
955void rememberThisLongPhrase(cellindex index, cellcount length) {
956
957 // cout << "rememberThisLongPhrase(" << index << ", " << length << ")\n";
958
959 cellindex hashOffset = ((index + length) % bigPrime) * sizeof(cellindex);
960 cellindex pointer;
961 cellindex zero = 0;
962 cellindex readp = 0;
963 cellindex readi = 0;
964 cellindex readl = 0;
965
966 hashTableFile.seekg(hashOffset);
967 hashTableFile.read((char *) &pointer, sizeof(cellindex));
968
969 if (pointer == 0) {
970 // There is no entry at all in the hash table for this phrase,
971 // so create one
972
973 pointer = nextEntryNumber++;
974 hashTableFile.seekg(hashOffset);
975 hashTableFile.write((char *) &pointer, sizeof(cellindex));
976
977 listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
978 listOfEntries.write((char *) &zero, sizeof(cellindex));
979 listOfEntries.write((char *) &index, sizeof(cellindex));
980 listOfEntries.write((char *) &length, sizeof(cellindex));
981
982 } else {
983 // There is a list starting at this hash value, so the phrase may
984 // be already remembered, or it might need to be appended
985
986 while (pointer != 0) {
987 // Read the entry pointed to by pointer
988 listOfEntries.seekg(pointer * sizeof(cellindex) * 3);
989 listOfEntries.read((char *) &readp, sizeof(cellindex));
990 listOfEntries.read((char *) &readi, sizeof(cellindex));
991 listOfEntries.read((char *) &readl, sizeof(cellindex));
992
993 // cout << "read " << pointer << ", " << readp << ", " << readi << ", " << readl << endl;
994
995 if ((readi == index) && (readl == length)) {
996 // we've found that we've already stored it
997 return;
998 } else if (readp == 0) {
999 // we've reached the end of the list. Add a new entry.
1000 listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
1001 listOfEntries.write((char *) &nextEntryNumber, sizeof(cellindex));
1002 pointer = nextEntryNumber++;
1003
1004 listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
1005 listOfEntries.write((char *) &zero, sizeof(cellindex));
1006 listOfEntries.write((char *) &index, sizeof(cellindex));
1007 listOfEntries.write((char *) &length, sizeof(cellindex));
1008 return;
1009 } else {
1010 // go on to the next node
1011 pointer = readp;
1012 }
1013 }
1014 }
1015
1016
1017}
1018
1019bool isLongPhraseStored(cellindex index, cellcount length) {
1020
1021 // cout << "isLongPhraseExpanded(" << index << ", " << length << ")\n";
1022
1023 cellindex hashOffset = ((index + length) % bigPrime) * sizeof(cellindex);
1024 cellindex pointer;
1025 cellindex readp = 0;
1026 cellindex readi = 0;
1027 cellindex readl = 0;
1028
1029 // Find the phrase in the hashFile
1030 hashTableFile.seekg(hashOffset);
1031 hashTableFile.read((char *) &pointer, sizeof(cellindex));
1032
1033 if (pointer == 0) {
1034 // There is no entry at all in the hash table for this phrase,
1035 // so nothing is stored
1036 return false;
1037
1038 } else {
1039 // There is a list starting at this hash value, so the phrase may
1040 // be already remembered in that list
1041 while (pointer != 0) {
1042 // Read the entry pointed to by pointer
1043 listOfEntries.seekg(pointer * sizeof(cellindex) * 3);
1044 listOfEntries.read((char *) &readp, sizeof(cellindex));
1045 listOfEntries.read((char *) &readi, sizeof(cellindex));
1046 listOfEntries.read((char *) &readl, sizeof(cellindex));
1047
1048 if ((readi == index) && (readl == length)) {
1049 // we've found the phrase stored here
1050 return true;
1051 } else {
1052 // go on to the next node
1053 pointer = readp;
1054 }
1055 }
1056 }
1057 return false;
1058}
1059
1060void deleteLongPhraseMemory() {
1061 // remove the hash & other files
1062
1063 hashTableFile.close();
1064 listOfEntries.close();
1065 remove(hashTableFileName);
1066 remove(listOfEntriesName);
1067
1068}
1069
1070
1071
1072
1073// Read the collection statistics file
1074void readStatistics() {
1075
1076 // open the statistics file
1077 char filename[FILENAME_MAX];
1078 sprintf(filename, "%s/clauses.stats", collection);
1079
1080 // Open the file
1081 ifstream inFile(filename, ios::in);
1082 if (!inFile) {
1083 cerr << "File " << filename << " could not be opened\n";
1084 exit(1);
1085 }
1086
1087 // Read the key-value pairs from the statistics file
1088 char key[1000];
1089 symbol value;
1090 while (inFile >> key >> value){
1091 if (strcmp(key, "first_stopword") == 0) {
1092 firstStopSymbol = value;
1093 } else if (strcmp(key, "last_stopword") == 0) {
1094 lastStopSymbol = value;
1095 } else if (strcmp(key, "first_contentword") == 0) {
1096 firstContentSymbol = value;
1097 } else if (strcmp(key, "last_contentword") == 0) {
1098 lastContentSymbol = value;
1099 }
1100 }
1101 inFile.close();
1102
1103 // Make sure we have the information we need
1104 if (!(firstStopSymbol && lastStopSymbol && firstContentSymbol && lastContentSymbol)) {
1105 cerr << "Statistics file incomplete" << endl;
1106 exit(1);
1107 }
1108}
1109
1110
1111
1112
1113