source: trunk/gsdl/src/phind/generate/suffix.cpp@ 1631

Last change on this file since 1631 was 1631, checked in by paynter, 24 years ago

Changed copyrights to include NZDLP.

  • Property svn:keywords set to Author Date Id Revision
File size: 30.4 KB
/**********************************************************************
 *
 * suffix.cpp -- Extract the repeated phrases in the input using
 *               suffix and prefix arrays.
 *
 * Copyright 2000 Gordon W. Paynter ([email protected])
 * Copyright 2000 The New Zealand Digital Library Project
 *
 * A component of the Greenstone digital library software
 * from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *********************************************************************/

#include <assert.h>
#include <fstream.h>
#include <iostream.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <algo.h>
#include <heap.h>
#include <vector.h>

#include "suffix.h"
#include "phrase.h"


// Global variables declared in suffix.h
cellcount symbol_array_length;
cellcount inputLength;

symbol *symbols;
symbol **suffixArray;
check *suffixCheck;
symbol **prefixArray;
check *prefixCheck;


// How many documents are in this collection?
cellcount numberOfDocuments;
symbol **documentArray;

// Do we accept any phrase, or do we eliminate those beginning or ending
// with stopwords?
int phraseMode = ANYPHRASE; //STOPWORDS;

// The filestem of the collection's phindex directory
char collection[FILENAME_MAX];

int suffixCompare(const void *, const void *);
int prefixCompare(const void *, const void *);
int pointerCompare(const void *, const void *);

int readNumbers(symbol *numbers);
void readStatistics();

void getMinimalExpansions(Phrase &p, vector<Phrase> &results);
cellcount getDocumentOccurrances(Phrase &p, cellcount *frequency);

// The ranges of the stopword and content-word symbols for the collection
symbol firstStopSymbol = 0;
symbol lastStopSymbol = 0;
symbol firstContentSymbol = 0;
symbol lastContentSymbol = 0;




// Phrase memory
// We have to "remember" each phrase that we've expanded
void initialisePhraseMemory();
void rememberThisPhrase(cellindex index, cellcount length);
bool isPhraseStored(cellindex index, cellcount length);
void deletePhraseMemory();


// How much output do we want?
int verbosity = 1;

int main (int argc, char * argv[]) {

  // Command-line arguments
  // argv[1] is the phindex directory
  // argv[2] is the maximum symbol array length (optional)
  // argv[3] is the mode, where 1 is stopword mode (optional)
  if (argc < 2) {
    cerr << "Usage: " << argv[0] << " collection-directory [max-array-size [mode]]" << endl;
    exit(1);
  }
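
  // As an illustration only (the path and values below are hypothetical,
  // not taken from any particular collection), an invocation might be:
  //
  //   suffix /greenstone/collect/demo/phindex 2000000 1
  //
  // which would read up to 2000000 symbols from the demo collection's
  // phindex directory and run in STOPWORDS mode.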

  // collection directory
  strcpy(collection, argv[1]);

  // Symbol length parameter
  if (argc >= 3) {
    symbol_array_length = atol(argv[2]);
    assert(symbol_array_length);
  } else {
    symbol_array_length = 1000;
  }

  // Stopword mode parameter
  if (argc == 4) {
    phraseMode = atoi(argv[3]);
    assert(phraseMode == STOPWORDS);
  }
  if (verbosity) {
    if (phraseMode == STOPWORDS) {
      cout << "STOPWORDS mode: no phrase may begin or end with a stopword" << endl;
    } else {
      cout << "ALLPHRASE mode: extract every phrase that occurs more than once" << endl;
    }
  }

  // Read the statistics file
  readStatistics();

  // Read the numbers file
  symbols = new symbol[symbol_array_length];
  readNumbers(symbols);


  // Create the suffix & prefix arrays
  if (verbosity) {
    cout << "Create suffix & prefix arrays for " << inputLength << " symbols" << endl;
  }
  suffixArray = new symbol *[inputLength];
  prefixArray = new symbol *[inputLength];
  suffixCheck = new check[inputLength];
  prefixCheck = new check[inputLength];
  for (cellcount j = 0; j < inputLength; j++) {
    suffixArray[j] = &symbols[j];
    prefixArray[j] = &symbols[j];
  }
  qsort(suffixArray, inputLength, sizeof(symbol *), suffixCompare);
  qsort(prefixArray, inputLength, sizeof(symbol *), prefixCompare);


  // Create the document arrays
  if (verbosity) {
    cout << "Create document arrays for " << numberOfDocuments << " documents" << endl;
  }
  if (numberOfDocuments == 0) {
    cerr << "There are no documents in this collection!" << endl;
    exit(1);
  }

  // The document frequency array is used to count the number of times
  // each phrase occurs in each document.  The number of documents in
  // which a phrase occurs is stored in df.
  frequency documentFrequency[numberOfDocuments];
  frequency df;

  // documentArray will be searched in order to discover which document
  // each phrase occurs in.
  documentArray = new symbol *[numberOfDocuments];

  // Discover all the DOCUMENTSTART symbols and store as a phrase
  cellindex d = 0;
  while (*suffixArray[d] != DOCUMENTSTART) {
    d++;
  }
  Phrase p(suffixArray[d], 1, SUFFIX);
  p.findFirstAndLastSuffix(d, inputLength-1);

  // Insert the document locations (as pointers) into documentArray
  for (cellcount i = 0; i < p.suffixFrequency; i++) {
    documentArray[i] = suffixArray[i + p.firstSuffixIndex];
  }

  // Sort the document array into ascending order of raw pointer value
  qsort(documentArray, numberOfDocuments, sizeof(symbol *), pointerCompare);


  // Extract phrases
  //
  // We will make several passes over the data, in each case considering
  // a set of input phrases and generating a set of output phrases, which
  // we will expand in later passes.
  //
  // The input phrases in the first pass will be the vocabulary.
  // In later passes, the input phrases will be the output phrases of the
  // previous pass.
  //
  // In each pass we will consider each input phrase in turn.  If we
  // have seen it before, we will ignore it.  Otherwise, we will expand
  // it and add its expansions to the set of output phrases.

  // Store the phrase data in the phrases file
  char phraseDataName[FILENAME_MAX];
  sprintf(phraseDataName, "%s/phrases", collection);
  ofstream phraseData(phraseDataName, ios::out);
  if (!phraseData) {
    cerr << "File " << phraseDataName << " could not be opened\n";
    exit(1);
  }

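  // Each record written to the phrases file below has the form
  //
  //   index-length:text:tf:expansionCount:e1-l1,e2-l2,...:df:docList
  //
  // where each ei-li expansion is a "firstSuffixIndex-length" pair and
  // docList is a ";"-separated list of "dN" document numbers, each with an
  // optional ",frequency".  A hypothetical record (all values invented for
  // illustration) might look like:
  //
  //   1500-1:forest:12:2:1510-2,1604-3:3:d1,4;d5;d9,2
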
  // Count the number of phrases output
  unsigned long int phraseCounter = 0;

  // Set up the phrase expansion memory.
  // We need this so that we don't expand a phrase more than once
  initialisePhraseMemory();

  // The current pass number
  int phrasePass = 1;


  // PASS NUMBER 1
  if (verbosity) {
    cout << endl << "suffix: starting pass " << phrasePass << endl;
  }

  // We need an input file, for phrases we are about to examine, and an
  // output file, for phrases still to come.
  ifstream inPhrase;
  char inPhraseName[FILENAME_MAX];
  ofstream outPhrase;
  char outPhraseName[FILENAME_MAX];
  unsigned long int outPhraseCounter = 0;

  // On the first pass, simply work through the vocabulary
  sprintf(outPhraseName, "%s/outPhrase.1", collection);
  outPhrase.open(outPhraseName, ios::out);
  if (!outPhrase) {
    cerr << "File " << outPhraseName << " could not be opened\n";
    exit(1);
  }

  // Iterate over the different symbols by working through the suffix array
  vector<Phrase> result;
  cellindex i = 0;
  char *tmpString;

  while (i < inputLength) {

    // make a new phrase of length 1
    p = Phrase(suffixArray[i], 1, SUFFIX);
    p.findFirstAndLastSuffix(i, inputLength-1);

    // cout << "cell " << i << " - " << p.toString() << endl;

    // We ignore this symbol if it occurs only once, if it is a delimiter,
    // or if we are in stopwords mode and it is a stopword.
    //
    // We could imagine a new mode/command-line option, which is like
    // STOPWORDS but without this restriction.  This would let you browse
    // from "the" to "the AGRIS" for example, but not from "AGRIS" to
    // "the AGRIS" (where "the" is a stopword and AGRIS a content word).
    // The system used to work like this; it is easy to implement, but
    // it explodes the size of the indexes.  So: would it be useful?
    if (!((p.suffixFrequency <= 1) ||
          // (*suffixArray[i] != 23054) ||
          (*suffixArray[i] <= LASTDELIMITER) ||
          ((phraseMode == STOPWORDS) && (*suffixArray[i] <= lastStopSymbol)))) {

      // Get minimal expansions of the phrase
      getMinimalExpansions(p, result);

      if (!result.empty()) {

        // Remember that we have expanded this phrase
        rememberThisPhrase(i, 1);

        // write the phrase text
        tmpString = p.toString();
        phraseData << i << "-1:" << tmpString << ":" << p.suffixFrequency << ":"
                   << result.size() << ":";
        delete [] tmpString;

        // write the results
        for (cellcount i = 0; i < result.size(); i++) {
          if (i) {
            phraseData << ",";
          }
          phraseData << result[i].firstSuffixIndex << "-" << result[i].length;
          outPhrase << result[i].firstSuffixIndex << " " << result[i].length << endl;
          outPhraseCounter++;
        }
        result.clear();

        // Write the documents in which this phrase occurs
        df = getDocumentOccurrances(p, documentFrequency);
        phraseData << ":" << df << ":";

        // write the documents
        for (cellcount i = 0, first = 1; i < numberOfDocuments; i++) {
          if (documentFrequency[i]) {
            if (first) {
              first = 0;
            } else {
              phraseData << ";";
            }
            // Output the document number.  Note that here we've numbered the
            // N documents from 0 to N-1, but later they'll be 1-N.  Thus we
            // add 1 to the document id when we output it.
            phraseData << "d" << (i+1);
            // Next, output the frequency with which the document occurs, but
            // only if it is > 1.
            if (documentFrequency[i] > 1) {
              phraseData << "," << documentFrequency[i];
            }
          }
        }

        phraseData << endl;
        phraseCounter++;

        // feedback
        if (verbosity) {
          if (phraseCounter % 1000 == 0) {
            tmpString = p.toString();
            cout << "phrase " << phraseCounter << ": "
                 << "cell " << p.firstSuffixIndex << " - " << tmpString << endl;
            delete [] tmpString;
          }
        }
      }
    }
    i = p.lastSuffixIndex + 1;
  }
  outPhrase.close();

  // REMAINING PASSES
  // The previous outPhrase file forms the input to each new pass
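  //
  // Each outPhrase.N file holds one "firstSuffixIndex length" pair per
  // line, as written by the previous pass.  For example (values invented
  // for illustration), a line such as
  //
  //   1510 2
  //
  // asks this pass to re-examine the length-2 phrase whose first suffix
  // is at suffixArray[1510].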
  cellcount start, length;
  while (outPhraseCounter > 0) {

    // Start a new pass
    phrasePass++;
    if (verbosity) {
      cout << endl << "Starting pass " << phrasePass << endl;
    }

    // Open the input file
    sprintf(inPhraseName, "%s/outPhrase.%d", collection, phrasePass - 1);
    inPhrase.open(inPhraseName, ios::in);
    if (!inPhrase) {
      cerr << "File " << inPhraseName << " could not be opened\n";
      exit(1);
    }

    // Open the output file
    sprintf(outPhraseName, "%s/outPhrase.%d", collection, phrasePass);
    outPhrase.open(outPhraseName, ios::out);
    if (!outPhrase) {
      cerr << "File " << outPhraseName << " could not be opened\n";
      exit(1);
    }
    outPhraseCounter = 0;

    // Process each phrase
    while (inPhrase >> start >> length) {

      // Ignore the phrase if we have expanded it before
      if (isPhraseStored(start, length)) {
        continue;
      }

      // Remember that we have examined this phrase
      rememberThisPhrase(start, length);

      // Find the phrase in the suffix array
      p = Phrase(suffixArray[start], length, SUFFIX);
      p.findFirstAndLastSuffix(start, inputLength-1);

      // cout << "index " << start << ", length " << length << " - " << p.toString() << endl;


      // Ignore the phrase if it only occurs once
      if (p.suffixFrequency < 2) {
        continue;
      }


      // Write the phrase text
      tmpString = p.toString();
      phraseData << start << "-" << length << ":" << tmpString << ":"
                 << p.suffixFrequency << ":";
      delete [] tmpString;


      // Expand the phrase, if it is at most 8 words long
      if (length <= 8) {

        // Get the minimal expansions for this phrase
        getMinimalExpansions(p, result);

        // write the results
        phraseData << result.size() << ":";

        for (cellcount i = 0; i < result.size(); i++) {
          if (i) {
            phraseData << ",";
          }
          phraseData << result[i].firstSuffixIndex << "-" << result[i].length;
          outPhrase << result[i].firstSuffixIndex << " " << result[i].length << endl;
          outPhraseCounter++;
        }
        result.clear();

      } else {
        // phrase is too long to expand further
        phraseData << "0:";
      }


      // Write the documents in which this phrase occurs
      df = getDocumentOccurrances(p, documentFrequency);
      phraseData << ":" << df << ":";

      // write the documents
      for (cellcount i = 0, first = 1; i < numberOfDocuments; i++) {
        if (documentFrequency[i]) {
          if (first) {
            first = 0;
          } else {
            phraseData << ";";
          }
          // As in the first pass, output the document id 1-based
          phraseData << "d" << (i+1);
          if (documentFrequency[i] > 1) {
            phraseData << "," << documentFrequency[i];
          }
        }
      }

      phraseData << endl;
      phraseCounter++;

      // feedback
      if (verbosity) {
        if (phraseCounter % 1000 == 0) {
          tmpString = p.toString();
          cout << "phrase " << phraseCounter << ": " << "start " << start
               << ", length " << length << " - " << tmpString << endl;
          delete [] tmpString;
        }
      }

    }

    inPhrase.close();
    outPhrase.close();
  }

  phraseData.close();
  deletePhraseMemory();

  delete [] symbols;
  delete [] suffixArray;
  delete [] prefixArray;
  delete [] suffixCheck;
  delete [] prefixCheck;
  delete [] documentArray;


  cout << endl << "Done: " << phraseCounter << " phrases in " << phraseDataName << endl;
  return 0;
}


// Get Minimal Expansions
//
// Get the set of "minimal" expansions of a phrase p, using the
// algorithm described in the documentation.
//
// The expansions are returned in the results vector.
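//
// As an informal illustration (the phrases below are invented, not drawn
// from any particular collection): if p is "forest" and every occurrence
// of "forest" is followed by either "products" or "service", then
// "forest products" and "forest service" become candidates; each is then
// grown to the left while it has a unique prefix extension (so "forest
// service" might become "the forest service") before being added to the
// results.
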
void getMinimalExpansions(Phrase &p, vector<Phrase> &results) {

  // 1. Initialise the result and candidate vectors
  vector<Phrase> candidates;
  for (cellcount j = 0; j < inputLength; j++) {
    suffixCheck[j] = 0;
    prefixCheck[j] = 0;
  }

  // 2. Expand the phrase p

  // 2.1 Create the candidate set
  p.initialSuffixCandidates(candidates);
  p.initialPrefixCandidates(candidates);

  // 2.2 Sort the candidates by phrase length
  // sort(candidates.begin(), candidates.end(), isShorter);
  make_heap(candidates.begin(), candidates.end(), isShorter);


  // 3. While candidates is non-empty, confirm the phrases it
  //    contains, expanding them as required
  while (!candidates.empty()) {

    // 3.1 Get next candidate
    // Phrase c = candidates.front();
    // candidates.erase(candidates.begin());
    pop_heap(candidates.begin(), candidates.end(), isShorter);
    Phrase c = candidates.back();
    candidates.pop_back();

    // 3.2 If we know there are no unique right extensions
    //     (i.e. this is a phrase drawn from the suffix array)
    if (!c.hasUniqueSuffixExtension()) {

      c.ensurePrefixFound();

      // 3.2.1 Ignore candidate if we have used a subphrase instead
      if (suffixCheck[c.firstSuffixIndex] || prefixCheck[c.firstPrefixIndex]) {
        // cout << "ignoring" << endl;
      }

      // 3.2.2 If candidate has a unique left (prefix) extension,
      //       then extend it and add it back into candidates.
      else if (c.hasUniquePrefixExtension()) {
        // cout << "expanding prefix " << c.toString() << "=> ";
        c.expandUniquePrefixExtensionByOne();
        // sort(candidates.begin(), candidates.end(), isShorter);
        candidates.push_back(c);
        push_heap(candidates.begin(), candidates.end(), isShorter);
      }

      // 3.2.3 If candidate has no unique left (prefix) extension,
      //       then add it to the list of results.
      else {
        // cout << "no unique prefix, add to results" << endl;
        results.push_back(c);
        for (cellcount i = c.firstSuffixIndex; i <= c.lastSuffixIndex; i++) {
          suffixCheck[i] = c.length;
        }
        for (cellcount i = c.firstPrefixIndex; i <= c.lastPrefixIndex; i++) {
          prefixCheck[i] = c.length;
        }
      }
    }

    // 3.3 If we know there are no unique left extensions,
    //     then do the same as for 3.2 but exchange suffix & prefix
    else if (!c.hasUniquePrefixExtension()) {

      c.ensureSuffixFound();

      // 3.3.1
      if (suffixCheck[c.firstSuffixIndex] || prefixCheck[c.firstPrefixIndex]) {

      }

      // 3.3.2
      else if (c.hasUniqueSuffixExtension()) {
        c.expandUniqueSuffixExtensionByOne();
        // sort(candidates.begin(), candidates.end(), isShorter);
        candidates.push_back(c);
        push_heap(candidates.begin(), candidates.end(), isShorter);
      }

      // 3.3.3
      else {
        results.push_back(c);
        for (cellcount i = c.firstSuffixIndex; i <= c.lastSuffixIndex; i++) {
          suffixCheck[i] = c.length;
        }
        for (cellcount i = c.firstPrefixIndex; i <= c.lastPrefixIndex; i++) {
          prefixCheck[i] = c.length;
        }

      }
    }
  }
}


// suffixCompare
//
// Compare two pointers into a suffix array.  We use this in the
// qsort function, so the inputs are pointers to pointers.
//
// Return -1 if (a < b); otherwise (a > b), so return +1.

int suffixCompare(const void *cpa, const void *cpb) {

  // The arguments are pointers to suffix array cells, which are
  // themselves pointers into the symbols array
  symbol *pa = *(symbol **) cpa;
  symbol *pb = *(symbol **) cpb;

  // If the two elements are the same, examine the next one
  while (*pa == *pb) {
    pa++;
    pb++;
  }

  // Make the comparison and return
  if (*pa < *pb) {
    return -1;
  } else {
    return +1;
  }
}


// prefixCompare
//
// Compare two pointers into a prefix array.  We use this in the
// qsort function, so the inputs are pointers to pointers.
//
// Return -1 if (a > b); otherwise (a < b), so return +1.

int prefixCompare(const void *cpa, const void *cpb) {

  // The arguments are pointers to prefix array cells, which are
  // themselves pointers into the symbols array
  symbol *pa = *(symbol **) cpa;
  symbol *pb = *(symbol **) cpb;

  // If the two elements are the same, examine the previous one
  while (*pa == *pb) {
    pa--;
    pb--;
  }

  // Make the comparison and return
  if (*pa > *pb) {
    return -1;
  } else {
    return +1;
  }
}

// pointerCompare
//
// Compare two pointers based on the memory location they point to.

int pointerCompare(const void *pa, const void *pb) {

  symbol **a = (symbol **) pa;
  symbol **b = (symbol **) pb;

  if (*a < *b) {
    return -1;
  } else if (*a > *b) {
    return 1;
  } else {
    return 0;
  }
}


// Read the numbers file into an array of symbols.
//
// Each number is a symbol number; it is essential that the first
// symbol (and no others) be 0 and the last symbol (and no others)
// be 1.
//
// Return the number of numbers in the array.

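// As an illustration (every value except the start and end markers is
// invented, not taken from any particular collection), the file might
// begin and end like this:
//
//   0 17 842 17 2041 913 ... 17 1022 6554 1
//
// where 0 is the COLLECTIONSTART symbol, 1 is the COLLECTIONEND symbol,
// and each document is introduced by a DOCUMENTSTART symbol (17 in this
// invented example).
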
int readNumbers(symbol *numbers) {

  char filename[FILENAME_MAX];
  sprintf(filename, "%s/clauses.numbers", collection);
  if (verbosity) {
    cout << "Reading numbers from: " << filename << endl;
  }

  // Open the numbers file
  ifstream inFile(filename, ios::in);
  if (!inFile) {
    cerr << "File " << filename << " could not be opened\n";
    exit(1);
  }

  // Read the numbers file into the numbers array
  symbol word;
  cellcount length = 0;
  numberOfDocuments = 0;
  while ((inFile >> word) && (length < symbol_array_length)) {
    numbers[length++] = word;
    if (word == DOCUMENTSTART) {
      numberOfDocuments++;
    }
  }

  // Make sure we were able to read all the numbers
  if (length >= symbol_array_length) {
    cerr << "Error: the symbol array is too short to hold " << filename
         << endl << "It is currently set to " << symbol_array_length
         << " and can be adjusted at the command line." << endl;
    exit(1);
  }

  // Make sure the numbers file is intact
  assert(numbers[0] == COLLECTIONSTART);
  assert(numbers[length-1] == COLLECTIONEND);

  // Record the length of the input file
  inputLength = length;

  return length;
}



// Get document occurrence statistics
//
// Given a phrase, what documents does it occur in?

cellcount getDocumentOccurrances(Phrase &p, cellcount *frequency) {

  // cout << "searching for \"" << p.toString() << "\" in documents "
  //      << 0 << "-" << numberOfDocuments - 1 << endl;

  // The number of documents in which this phrase occurs
  cellcount df = 0;

  // Initialise the document frequency array
  for (cellindex i = 0; i < numberOfDocuments; i++) {
    frequency[i] = 0;
  }

  // variables used to facilitate the search
  cellindex begin;
  cellindex end;
  cellindex d;
  symbol *target;
  bool found;

  // search for the document in which each occurrence of the phrase is found
  for (cellcount i = p.firstSuffixIndex; i <= p.lastSuffixIndex; i++) {

    // cout << "looking for phrase at suffixArray[" << i << "]\n";

    target = suffixArray[i];
    begin = 0;
    end = numberOfDocuments - 1;
    found = false;

    // Search for the occurrence of a document delimiter that target
    // occurs immediately after.
    // We do this by performing a binary chop search on documentArray.
    while (!found) {

      // cout << "searching for " << (cellindex) target << " in "
      //      << begin << " - " << end << endl;

      assert(begin <= end);

      // If the beginning and end of the interval are the same,
      // then we've found the correct document
      if (begin == end) {
        if (frequency[begin] == 0) {
          df++;
        }
        frequency[begin]++;
        found = true;
      }

      // Otherwise, examine a new document midway through the begin-end
      // interval and see if it is the one.
      else {
        d = (begin + end) / 2;
        if (target > documentArray[d]) {
          // If the target address is greater than this document's but this is
          // the last document, then this must be the one we want.  Or, if the
          // target is greater than this one but less than the next, this must
          // be the one we want.
          if ((d == numberOfDocuments - 1) || (target < documentArray[d+1])) {
            if (frequency[d] == 0) {
              df++;
            }
            frequency[d]++;
            found = true;
          } else {
            // otherwise we know to search later in the document set
            begin = d + 1;
          }
        } else {
          // search earlier in the document set
          end = d - 1;
        }
      }
    }
  }
  return df;
}



// phraseExpansionMemory : Which phrases have we expanded?
//
// A set of utilities for keeping track of which phrases we have expanded.
// We don't want to expand a phrase more than once, after all.
//
// This REALLY ought to be in its own class, but it works so that's okay.
//
// Phrases are identified by their firstSuffixPosition and length.
//
// Functions provided are:
//     void initialisePhraseMemory()
//     void rememberThisPhrase(index, length)
//     bool isPhraseStored(index, length)
//     void deletePhraseMemory()
//
// Internally, we will have two separate cases:
//
// Phrases of length 1-8:
//     unsigned char phraseMemory[inputLength]
// is an array where each cell "remembers" the corresponding index in the
// suffixArray, and each of the 8 bits of the cell corresponds to the phrases
// of length 1, 2 ... 8.
// Eventually, we will make this disk-based (i.e. store the array in a file).
//
// Phrases of length 9+:
//     file hashTableFile
//     file listOfEntries
// The first file is a hash table; each phrase maps to one of its cells, which
// contains either 0 (empty, no occurrence) or a number which is an entry number
// in the second file.  This file contains a "list" of entries.  Each consists of
// three numbers: the entry number of the next phrase with the same hash, the
// suffixArray index of the phrase, and the length of the phrase.
//
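// Worked example (all values invented for illustration): remembering a
// length-3 phrase at suffixArray cell 42 sets bit (1 << 2), i.e. 0x04, in
// phraseMemory[42].  Remembering a length-10 phrase at the same cell
// instead goes to the file-based store: its hash cell is
// ((42 + 10) % bigPrime) and its listOfEntries record is the triple
// (next-entry-number, 42, 10).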

unsigned char *phraseMemory;

void initialiseLongPhraseMemory();
void rememberThisLongPhrase(cellindex index, cellcount length);
bool isLongPhraseStored(cellindex index, cellcount length);
void deleteLongPhraseMemory();


void initialisePhraseMemory() {

  phraseMemory = new unsigned char[inputLength];

  // to begin with, everything is empty
  for (cellcount i = 0; i < inputLength; i++) {
    phraseMemory[i] = 0;
  }

  // initialise the hashTable of long phrases
  initialiseLongPhraseMemory();

}

void rememberThisPhrase(cellindex index, cellcount length) {

  // if the phrase is very long, use the file-based system
  if (length > 8) {
    rememberThisLongPhrase(index, length);
    return;
  }

  // create a char with just the bit corresponding to length set
  unsigned char newbit = 1;
  for (cellcount i = 1; i < length; i++) {
    newbit <<= 1;
  }

  // set that bit in the memory array at position index
  phraseMemory[index] |= newbit;
}


bool isPhraseStored(cellindex index, cellcount length) {

  // if the phrase is very long, use the file-based system
  if (length > 8) {
    return isLongPhraseStored(index, length);
  }

  // create a char with just the bit corresponding to length set
  unsigned char newbit = 1;
  for (cellcount i = 1; i < length; i++) {
    newbit <<= 1;
  }

  // return true if that bit is set in the memory array at position index
  return (phraseMemory[index] & newbit);
}

void deletePhraseMemory() {
  delete [] phraseMemory;
  deleteLongPhraseMemory();
}



// Files etc used to store "long" equivalents of the above

fstream hashTableFile;
char hashTableFileName[FILENAME_MAX];
fstream listOfEntries;
char listOfEntriesName[FILENAME_MAX];
cellindex nextEntryNumber;

const cellcount bigPrime = 7919;


void initialiseLongPhraseMemory() {

  cellindex example = 0;

  sprintf(hashTableFileName, "%s/hashTable", collection);
  sprintf(listOfEntriesName, "%s/hashLists", collection);


  // create the new hashtable
  if (verbosity > 1) {
    cout << "Initialising hashTable: " << hashTableFileName << endl;
  }
  hashTableFile.open(hashTableFileName, ios::in | ios::out);
  for (cellcount i = 0; i < bigPrime; i++) {
    hashTableFile.write((char *) &example, sizeof(example));
  }

  // create the list of phrases
  if (verbosity > 1) {
    cout << "Initialising list of hashtable entries: " << listOfEntriesName << endl;
  }
  listOfEntries.open(listOfEntriesName, ios::in | ios::out);
  listOfEntries.write((char *) &example, sizeof(example));
  listOfEntries.write((char *) &example, sizeof(example));
  listOfEntries.write((char *) &example, sizeof(example));
  nextEntryNumber = 1;
}


void rememberThisLongPhrase(cellindex index, cellcount length) {

  // cout << "rememberThisLongPhrase(" << index << ", " << length << ")\n";

  cellindex hashOffset = ((index + length) % bigPrime) * sizeof(cellindex);
  cellindex pointer;
  cellindex zero = 0;
  cellindex readp = 0;
  cellindex readi = 0;
  cellindex readl = 0;

  hashTableFile.seekg(hashOffset);
  hashTableFile.read((char *) &pointer, sizeof(cellindex));

  if (pointer == 0) {
    // There is no entry at all in the hash table for this hash value,
    // so create one

    pointer = nextEntryNumber++;
    hashTableFile.seekg(hashOffset);
    hashTableFile.write((char *) &pointer, sizeof(cellindex));

    listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
    listOfEntries.write((char *) &zero, sizeof(cellindex));
    listOfEntries.write((char *) &index, sizeof(cellindex));
    listOfEntries.write((char *) &length, sizeof(cellindex));

  } else {
    // There is a list starting at this hash value, so the phrase may
    // be already remembered, or it might need to be appended

    while (pointer != 0) {
      // Read the entry pointed to by pointer
      listOfEntries.seekg(pointer * sizeof(cellindex) * 3);
      listOfEntries.read((char *) &readp, sizeof(cellindex));
      listOfEntries.read((char *) &readi, sizeof(cellindex));
      listOfEntries.read((char *) &readl, sizeof(cellindex));

      // cout << "read " << pointer << ", " << readp << ", " << readi << ", " << readl << endl;

      if ((readi == index) && (readl == length)) {
        // we've found that we've already stored it
        return;
      } else if (readp == 0) {
        // we've reached the end of the list.  Add a new entry.
        listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
        listOfEntries.write((char *) &nextEntryNumber, sizeof(cellindex));
        pointer = nextEntryNumber++;

        listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
        listOfEntries.write((char *) &zero, sizeof(cellindex));
        listOfEntries.write((char *) &index, sizeof(cellindex));
        listOfEntries.write((char *) &length, sizeof(cellindex));
        return;
      } else {
        // go on to the next node
        pointer = readp;
      }
    }
  }

}

bool isLongPhraseStored(cellindex index, cellcount length) {

  // cout << "isLongPhraseExpanded(" << index << ", " << length << ")\n";

  cellindex hashOffset = ((index + length) % bigPrime) * sizeof(cellindex);
  cellindex pointer;
  cellindex readp = 0;
  cellindex readi = 0;
  cellindex readl = 0;

  // Find the phrase in the hashFile
  hashTableFile.seekg(hashOffset);
  hashTableFile.read((char *) &pointer, sizeof(cellindex));

  if (pointer == 0) {
    // There is no entry at all in the hash table for this hash value,
    // so nothing is stored
    return false;

  } else {
    // There is a list starting at this hash value, so the phrase may
    // be already remembered in that list
    while (pointer != 0) {
      // Read the entry pointed to by pointer
      listOfEntries.seekg(pointer * sizeof(cellindex) * 3);
      listOfEntries.read((char *) &readp, sizeof(cellindex));
      listOfEntries.read((char *) &readi, sizeof(cellindex));
      listOfEntries.read((char *) &readl, sizeof(cellindex));

      if ((readi == index) && (readl == length)) {
        // we've found the phrase stored here
        return true;
      } else {
        // go on to the next node
        pointer = readp;
      }
    }
  }
  return false;
}

void deleteLongPhraseMemory() {
  // remove the hash & other files

  hashTableFile.close();
  listOfEntries.close();
  remove(hashTableFileName);
  remove(listOfEntriesName);

}




// Read the collection statistics file
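//
// The statistics file is a sequence of whitespace-separated "key value"
// pairs.  An illustrative example (the symbol numbers are invented, not
// taken from any particular collection):
//
//   first_stopword 2
//   last_stopword 462
//   first_contentword 463
//   last_contentword 19034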
void readStatistics() {

  // Open the statistics file
  char filename[FILENAME_MAX];
  sprintf(filename, "%s/clauses.stats", collection);

  ifstream inFile(filename, ios::in);
  if (!inFile) {
    cerr << "File " << filename << " could not be opened\n";
    exit(1);
  }

  // Read the key-value pairs into the global symbol ranges
  char key[1000];
  symbol value;
  while (inFile >> key >> value) {
    if (strcmp(key, "first_stopword") == 0) {
      firstStopSymbol = value;
    } else if (strcmp(key, "last_stopword") == 0) {
      lastStopSymbol = value;
    } else if (strcmp(key, "first_contentword") == 0) {
      firstContentSymbol = value;
    } else if (strcmp(key, "last_contentword") == 0) {
      lastContentSymbol = value;
    }
  }
  inFile.close();

  // Make sure we have the information we need
  if (!(firstStopSymbol && lastStopSymbol && firstContentSymbol && lastContentSymbol)) {
    cerr << "Statistics file incomplete" << endl;
    exit(1);
  }
}