source: trunk/gsdl/src/phind/generate/suffix.cpp@ 1873

Last change on this file since 1873 was 1873, checked in by paynter, 23 years ago

Fixed a bug in the Phrase extraction algorithm that had the
"candidates" in the GetMinimalExpansions function sorted backwards.

  • Property svn:keywords set to Author Date Id Revision
File size: 30.4 KB
1/**********************************************************************
2 *
3 * suffix.cpp -- Extract the repeated phrases in the input using
4 * suffix and prefix arrays.
5 *
6 * Copyright 2000 Gordon W. Paynter
7 * Copyright 2000 The New Zealand Digital Library Project
8 *
9 * A component of the Greenstone digital library software
10 * from the New Zealand Digital Library Project at the
11 * University of Waikato, New Zealand.
12 *
13 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version 2 of the License, or
16 * (at your option) any later version.
17 *
18 * This program is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU General Public License for more details.
22 *
23 * You should have received a copy of the GNU General Public License
24 * along with this program; if not, write to the Free Software
25 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26 *
27 *********************************************************************/
28
29#include <assert.h>
30#include <fstream.h>
31#include <iostream.h>
32#include <math.h>
33#include <stdio.h>
34#include <stdlib.h>
35#include <string.h>
36
37#include <algo.h>
38#include <heap.h>
39#include <vector.h>
40
41#include "suffix.h"
42#include "phrase.h"
43
44
45
46// Global variables declared in suffix.h
47cellcount symbol_array_length;
48cellcount inputLength;
49
50symbol *symbols;
51symbol **suffixArray;
52check *suffixCheck;
53symbol **prefixArray;
54check *prefixCheck;
55
56
57// How many documents are in this collection?
58cellcount numberOfDocuments;
59symbol **documentArray;
60
61// Do we accept any phrase, or do we eliminate those that begin or end with stopwords?
62int phraseMode = ANYPHRASE; //STOPWORDS;
63
64// The filestem of the collection's phindex directory
65char collection[FILENAME_MAX];
66
67int suffixCompare(const void *, const void *);
68int prefixCompare(const void *, const void *);
69int pointerCompare(const void *, const void *);
70
71int readNumbers(symbol *numbers);
72void readStatistics();
73
74void getMinimalExpansions(Phrase &p, vector<Phrase> &results);
75cellcount getDocumentOccurrances(Phrase &p, cellcount *frequency);
76
77// The ranges of the stopword and content-word symbols for the collection
78symbol firstStopSymbol = 0;
79symbol lastStopSymbol = 0;
80symbol firstContentSymbol = 0;
81symbol lastContentSymbol = 0;
82
83
84
85
86// Phrase memory
87// We have to "remember" each phrase that we've expanded
88void initialisePhraseMemory();
89void rememberThisPhrase(cellindex index, cellcount length);
90bool isPhraseStored(cellindex index, cellcount length);
91void deletePhraseMemory();
92
93
94// how much output do we want?
95int verbosity = 1;
96
97
98int main (int argc, char * argv[]) {
99
100 // Command-line arguments
101 // argv[1] is the phindex directory
102 // argv[2] is the maximum array symbol length (optional)
103 // argv[3] is the mode, where 1 is stopword mode (optional)
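 // For illustration only (the path and sizes below are hypothetical), a
 // typical invocation might look like:
 //
 //   suffix /greenstone/collect/demo/phindex 10000000 1
 //
 // which reads that phindex directory, allows up to 10000000 symbols, and
 // selects stopword mode.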
104 if (argc < 2) {
105 cerr << "Usage: " << argv[0] << " collection-directory [max-array-size [mode]]" << endl;
106 exit(1);
107 }
108
109 // collection directory
110 strcpy(collection, argv[1]);
111
112 // Symbol length parameter
113 if (argc >= 3) {
114 symbol_array_length = atol(argv[2]);
115 assert(symbol_array_length);
116 } else {
117 symbol_array_length = 1000;
118 }
119
120 // Stopword mode parameter
121 if (argc == 4) {
122 phraseMode = atoi(argv[3]);
123 assert(phraseMode == STOPWORDS);
124 }
125 if (verbosity) {
126 if (phraseMode == STOPWORDS) {
127 cout << "STOPWORDS mode: no phrase may begin or end with a stopword" << endl;
128 } else {
129 cout << "ALLPHRASE mode: extract every phrase that occurs more than once" << endl;
130 }
131 }
132
133 // Read the statistics file
134 readStatistics();
135
136 // Read the numbers file
137 symbols = new symbol[symbol_array_length];
138 readNumbers(symbols);
139
140
141 // Create the suffix & prefix arrays
142 if (verbosity) {
143 cout << "Create suffix & prefix arrays for " << inputLength << " symbols" << endl;
144 }
145 suffixArray = new symbol *[inputLength];
146 prefixArray = new symbol *[inputLength];
147 suffixCheck = new check[inputLength];
148 prefixCheck = new check[inputLength];
149 for (cellcount j = 0; j < inputLength; j++) {
150 suffixArray[j] = &symbols[j];
151 prefixArray[j] = &symbols[j];
152 }
153 qsort(suffixArray, inputLength, sizeof(symbol *), suffixCompare);
154 qsort(prefixArray, inputLength, sizeof(symbol *), prefixCompare);
155
156
157 // Create the document arrays
158 if (verbosity) {
159 cout << "Create document arrays for " << numberOfDocuments << " documents" << endl;
160 }
161 if (numberOfDocuments == 0) {
162 cerr << "There are no documents in this collection!" << endl;
163 exit(1);
164 }
165
166 // The document frequency array is used to count the number of times
167 // each phrase occurs in each document. The number of documents in
168 // which a phrase occurs is stored in df.
169 frequency documentFrequency[numberOfDocuments];
170 frequency df;
171
172 // documentArray will be searched in order to discover which document
173 // each phrase occurs in.
174 documentArray = new symbol *[numberOfDocuments];
175
176 // Discover all the DOCUMENTSTART symbols and store as a phrase
177 cellindex d = 0;
178 while (*suffixArray[d] != DOCUMENTSTART) {
179 d++;
180 }
181 Phrase p(suffixArray[d], 1, SUFFIX);
182 p.findFirstAndLastSuffix(d, inputLength-1);
183
184 // Insert the document locations (as pointers) into documentArray
185 for (cellcount i = 0; i < p.suffixFrequency; i++) {
186 documentArray[i] = suffixArray[i + p.firstSuffixIndex];
187 }
188
189 // Sort the document array into ascending order of raw pointer value
190 qsort(documentArray, numberOfDocuments, sizeof(symbol *), pointerCompare);
191
192
193 // Extract phrases
194 //
195 // We will make several passes over the data, in each case considering
196 // a set of input phrases and generating a set of output phrases, which
197 // we will expand in later passes.
198 //
199 // The input phrases in the first pass will be the vocabulary.
200 // In later passes, the input phrases will be the output phrases of the
201 // previous pass.
202 //
203 // In each pass we will consider each input phrase in turn. If we
204 // have seen it before, we will ignore it. Otherwise, we will expand
205 // it and add its expansions to the set of output phrases.
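 // As a hypothetical illustration: pass 1 might expand the single word
 // "forest" into the output phrases "forest products" and "rain forest";
 // pass 2 then reads those phrases back in and expands each of them in
 // turn, and so on until a pass produces no further output phrases.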
206
207 // Store the phrase data in the phrases file
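 // Each record written to the phrases file below has the form
 //
 //   index-length:text:tf:n:e1-l1,...,en-ln:df:d1,f1;d2;...
 //
 // where index-length identifies the phrase in the suffix array, tf is the
 // phrase's frequency, n is the number of minimal expansions (each given as
 // its own index-length pair), df is its document frequency, and each dN
 // entry names a document (with its within-document frequency appended when
 // it is greater than 1). A record might therefore look like this, with all
 // values hypothetical:
 //
 //   4577-1:forest:102:2:4571-2,9023-2:37:d1,14;d2;d9,3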
208 char phraseDataName[FILENAME_MAX];
209 sprintf(phraseDataName, "%s/phrases", collection);
210 ofstream phraseData(phraseDataName, ios::out);
211 if (!phraseData) {
212 cout << "File " << phraseDataName << " could not be opened\n";
213 exit(1);
214 }
215
216 // Count the number of phrases output
217 unsigned long int phraseCounter = 0;
218
219 // Set up the phrase expansion memory.
220 // We need this so that we don't expand a phrase more than once
221 initialisePhraseMemory();
222
223 // The current pass number
224 int phrasePass = 1;
225
226
227 // PASS NUMBER 1
228 if (verbosity) {
229 cout << endl<< "suffix: starting pass " << phrasePass << endl;
230 }
231
232 // We need an input file, for phrases we are about to examine, and an
233 // output file, for phrases still to come.
234 ifstream inPhrase;
235 char inPhraseName[FILENAME_MAX];
236 ofstream outPhrase;
237 char outPhraseName[FILENAME_MAX];
238 unsigned long int outPhraseCounter = 0;
239
240 // On the first pass, simply work through the vocabulary
241 sprintf(outPhraseName, "%s/outPhrase.1", collection);
242 outPhrase.open(outPhraseName, ios::out);
243 if (!outPhrase) {
244 cerr << "File " << outPhraseName << " could not be opened\n";
245 exit(1);
246 }
247
248 // Iterate over the different symbols by working through the suffix array
249 vector<Phrase> result;
250 cellindex i = 0;
251 char *tmpString;
252
253 while (i < inputLength) {
254
255 // make a new phrase of length 1
256 p = Phrase(suffixArray[i], 1, SUFFIX);
257 p.findFirstAndLastSuffix(i, inputLength-1);
258
259 // cout << "cell " << i << " - " << p.toString() << endl;
260
261 // We ignore this symbol if it occurs only once, if it is a delimiter,
262 // or if we are in stopwords mode and it is a stopword
263 //
264 // We could imagine a new mode/command-line option, which is like
265 // STOPWORDS but without this restriction. This would let you browse
266 // from "the" to "the AGRIS" for example, but not from "AGRIS" to
267 // "the AGRIS" (where the is a stopword and AGRIS a content word).
268 // The system used to work like this; it is easy to implement, but
269 // it explodes the size of the indexes. So: would it be useful?
270 if (!((p.suffixFrequency <= 1) ||
271 // (*suffixArray[i] != 23054) ||
272 (*suffixArray[i] <= LASTDELIMITER) ||
273 ((phraseMode == STOPWORDS) && (*suffixArray[i] <= lastStopSymbol)))) {
274
275 // Get minimal expansions of the phrase
276 getMinimalExpansions(p, result);
277
278 if (!result.empty()) {
279
280 // Remember that we have expanded this phrase
281 rememberThisPhrase(i, 1);
282
283 // write the phrase text
284 tmpString = p.toString();
285 phraseData << i << "-1:" << tmpString << ":" << p.suffixFrequency << ":"
286 << result.size() << ":";
287 delete [] tmpString;
288
289 // write the results
290 for (cellcount i = 0; i < result.size(); i++) {
291 if (i) {
292 phraseData << ",";
293 }
294 phraseData << result[i].firstSuffixIndex << "-" << result[i].length;
295 outPhrase << result[i].firstSuffixIndex << " " << result[i].length << endl;
296 outPhraseCounter++;
297 }
298 result.clear();
299
300 // Write the documents in which this phrase occurs
301 df = getDocumentOccurrances(p, documentFrequency);
302 phraseData << ":" << df << ":";
303
304 // write the documents
305 for (cellcount i = 0, first = 1; i < numberOfDocuments; i++) {
306 if (documentFrequency[i]) {
307 if (first) {
308 first = 0;
309 } else {
310 phraseData << ";";
311 }
312 // Output the document number. Note that here we've numbered the
313 // N documents from 0 to N-1, but later they'll be 1-N. Thus we
314 // add 1 to the document id when we output it.
315 phraseData << "d" << (i+1);
316 // Next, output the frequency with which the document occurs, but
317 // only if it is > 1.
318 if (documentFrequency[i] > 1) {
319 phraseData << "," << documentFrequency[i];
320 }
321 }
322 }
323
324 phraseData << endl;
325 phraseCounter++;
326
327 // feedback
328 if (verbosity) {
329 if (phraseCounter % 1000 == 0) {
330 tmpString = p.toString();
331 cout << "phrase " << phraseCounter << ": "
332 << "cell " << p.firstSuffixIndex << " - " << tmpString << endl;
333 delete [] tmpString;
334 }
335 }
336 }
337 }
338 i = p.lastSuffixIndex + 1;
339 }
340 outPhrase.close();
341
342 // REMAINING PASSES
343 // The previous outPhrase file forms the input to each new pass
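 // Each line of an outPhrase.N file holds two numbers: the phrase's first
 // suffix-array index and its length, e.g. "4577 2" (values hypothetical).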
344 cellcount start, length;
345 while (outPhraseCounter > 0) {
346
347 // Start a new pass
348 phrasePass++;
349 if (verbosity) {
350 cout << endl << "Starting pass " << phrasePass << endl;
351 }
352
353 // Open the input file
354 sprintf(inPhraseName, "%s/outPhrase.%d", collection, phrasePass - 1);
355 inPhrase.open(inPhraseName, ios::in);
356 if (!inPhrase) {
357 cerr << "File " << inPhraseName << " could not be opened\n";
358 exit(1);
359 }
360
361 // Open the output file
362 sprintf(outPhraseName, "%s/outPhrase.%d", collection, phrasePass);
363 outPhrase.open(outPhraseName, ios::out);
364 if (!outPhrase) {
365 cerr << "File " << outPhraseName << " could not be opened\n";
366 exit(1);
367 }
368 outPhraseCounter = 0;
369
370 // Process each phrase
371 while(inPhrase >> start >> length) {
372
373 // Ignore the phrase if we have expanded it before
374 if (isPhraseStored(start, length)) {
375 continue;
376 }
377
378 // Remember that we have examined this phrase
379 rememberThisPhrase(start, length);
380
381 // Find the phrase in the suffixarray
382 p = Phrase(suffixArray[start], length, SUFFIX);
383 p.findFirstAndLastSuffix(start, inputLength-1);
384
385 // cout << "index " << start << ", length " << length << " - " << p.toString() << endl;
386
387
388 // Ignore the phrase if it only occurs once
389 if (p.suffixFrequency < 2) {
390 continue;
391 }
392
393
394 // Write the phrase text
395 tmpString = p.toString();
396 phraseData << start << "-" << length << ":" << tmpString << ":"
397 << p.suffixFrequency << ":";
398 delete [] tmpString;
399
400
401 // Expand the phrase, if it is no more than 8 words long
402 if (length <= 8) {
403
404 // Get the minimal expansions for this phrase
405 getMinimalExpansions(p, result);
406
407 // write the results
408 phraseData << result.size() << ":";
409
410 for (cellcount i = 0; i < result.size(); i++) {
411 if (i) {
412 phraseData << ",";
413 }
414 phraseData << result[i].firstSuffixIndex << "-" << result[i].length;
415 outPhrase << result[i].firstSuffixIndex << " " << result[i].length << endl;
416 outPhraseCounter++;
417 }
418 result.clear();
419
420 } else {
421 // phrase is too long to expand further
422 phraseData << "0:";
423 }
424
425
426 // Write the documents in which this phrase occurs
427 df = getDocumentOccurrances(p, documentFrequency);
428 phraseData << ":" << df << ":";
429
430 // write the documents
431 for (cellcount i = 0, first = 1; i < numberOfDocuments; i++) {
432 if (documentFrequency[i]) {
433 if (first) {
434 first = 0;
435 } else {
436 phraseData << ";";
437 }
438 // Output the document number. Note that here we've numbered the
439 // N documents from 0 to N-1, but later they'll be 1-N. Thus we
440 // add 1 to the document id when we output it.
441 phraseData << "d" << (i+1);
442 // Next, output the frequency with which the document occurs, but
443 // only if it is > 1.
444 if (documentFrequency[i] > 1) {
445 phraseData << "," << documentFrequency[i];
446 }
447 }
448 }
449
450 phraseData << endl;
451 phraseCounter++;
452
453 // feedback
454 if (verbosity) {
455 if (phraseCounter % 1000 == 0) {
456 tmpString = p.toString();
457 cout << "phrase " << phraseCounter << ": "<< "start " << start
458 << ", length " << length << " - " << tmpString << endl;
459 delete [] tmpString;
460 }
461 }
462
463 }
464
465 inPhrase.close();
466 outPhrase.close();
467 }
468
469 phraseData.close();
470 deletePhraseMemory();
471
472 delete [] symbols;
473 delete [] suffixArray;
474 delete [] prefixArray;
475 delete [] suffixCheck;
476 delete [] prefixCheck;
477 delete [] documentArray;
478
479
480
481 cout << endl << "Done: " << phraseCounter << " phrases in " << phraseDataName << endl;
482 return 0;
483}
484
485
486// Get Minimal Expansions
487//
488// Get the set of "minimal" expansions of a phrase p, using the
489// algorithm described in the documentation.
490//
491 // The minimal expansions are appended to the results vector.
492
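// As a hypothetical illustration of what "minimal" means here: when the
// one-word phrase "forest" is expanded, a right-hand candidate such as
// "forest products" is added to the results only once it can no longer be
// extended uniquely in either direction; if every occurrence of "forest
// products" happened to be preceded by the same word, say "tropical", the
// candidate would first be grown to "tropical forest products" and
// reconsidered before anything was added to the results.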
493void getMinimalExpansions(Phrase &p, vector<Phrase> &results) {
494
495 // 1. Initialise the result and candidate vectors
496 vector<Phrase> candidates;
497 for (cellcount j = 0; j < inputLength; j++) {
498 suffixCheck[j] = 0;
499 prefixCheck[j] = 0;
500 }
501
502 // 2. Expand the phrase p
503
504 // 2.1 Create the candidate set
505 p.initialSuffixCandidates(candidates);
506 p.initialPrefixCandidates(candidates);
507
508 // 2.2 Sort the candidates by phrase length
509 make_heap(candidates.begin(), candidates.end(), isLonger);
510
511 // 3. While candidates is non-empty, confirm the phrases it
512 // contains, expanding them as required
513 while (!candidates.empty()) {
514
515 // 3.1 Get next candidate
516 pop_heap(candidates.begin(), candidates.end(), isLonger);
517 Phrase c = candidates.back();
518 candidates.pop_back();
519
520 // 3.2 If we know there are no unique right extensions
521 // (i.e. this is a phrase drawn from the suffix array)
522 if (!c.hasUniqueSuffixExtension()) {
523
524 c.ensurePrefixFound();
525
526 // 3.2.1 Ignore candidate if we have used a subphrase instead
527 if (suffixCheck[c.firstSuffixIndex] || prefixCheck[c.firstPrefixIndex]) {
528 // cout << "ignoring" << endl;
529 }
530
531 // 3.2.2 If candidate has a unique left (prefix) extension,
532 // Then extend it and add it back into Candidates.
533 else if (c.hasUniquePrefixExtension()) {
534 // cout << "expanding prefix " << c.toString() << "=> ";
535 c.expandUniquePrefixExtensionByOne();
536 candidates.push_back(c);
537 push_heap(candidates.begin(), candidates.end(), isLonger);
538 }
539
540 // 3.2.3 If candidate has no unique left (prefix) extension,
541 // Then add it to the list of results.
542 else {
543 // cout << "no unique prefix, add to results" << endl;
544 results.push_back(c);
545 for (cellcount i = c.firstSuffixIndex; i <= c.lastSuffixIndex; i++) {
546 suffixCheck[i] = c.length;
547 }
548 for (cellcount i = c.firstPrefixIndex; i <= c.lastPrefixIndex; i++) {
549 prefixCheck[i] = c.length;
550 }
551 }
552 }
553
554 // 3.3 If we know there are no unique left extensions,
555 // Then do the same as for 3.2 but exchange suffix & prefix
556 else if (!c.hasUniquePrefixExtension()) {
557
558 c.ensureSuffixFound();
559
560 // 3.3.1
561 if (suffixCheck[c.firstSuffixIndex] || prefixCheck[c.firstPrefixIndex]) {
562
563 }
564
565 // 3.3.2
566 else if (c.hasUniqueSuffixExtension()) {
567 c.expandUniqueSuffixExtensionByOne();
568 candidates.push_back(c);
569 push_heap(candidates.begin(), candidates.end(), isLonger);
570 }
571
572 // 3.3.3
573 else {
574 results.push_back(c);
575 for (cellcount i = c.firstSuffixIndex; i <= c.lastSuffixIndex; i++) {
576 suffixCheck[i] = c.length;
577 }
578 for (cellcount i = c.firstPrefixIndex; i <= c.lastPrefixIndex; i++) {
579 prefixCheck[i] = c.length;
580 }
581
582 }
583 }
584 }
585}
586
587
588// suffixCompare
589//
590// Compare two pointers into a suffix array. We use this in the
591 // qsort function, so the inputs are pointers to pointers.
592 //
593 // Return -1 if (a < b); otherwise (a > b), so return +1.
594
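// As a hypothetical illustration: given two suffixes whose symbol sequences
// begin 57 98 44 ... and 57 98 12 ..., the first two symbols tie and the
// third decides, so the first suffix sorts after the second (return +1).
// prefixCompare below mirrors this, scanning leftwards from the shared symbol.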
595int suffixCompare(const void *cpa, const void *cpb) {
596
597 // Cast then dereference pointers to suffix array elements
598 symbol *pa = *(symbol **) cpa;
599 symbol *pb = *(symbol **) cpb;
602
603 // If the two elements are the same, examine the next one
604 while (*pa == *pb) {
605 pa++;
606 pb++;
607 }
608
609 // Make the comparison and return
610 if ( *pa < *pb) {
611 return -1;
612 } else {
613 return +1;
614 }
615}
616
617
618// prefixCompare
619//
620// Compare two pointers into a prefix array. We use this in the
621 // qsort function, so the inputs are pointers to pointers.
622 //
623 // Return -1 if (a > b); otherwise (a < b), so return +1.
624
625int prefixCompare(const void *cpa, const void *cpb) {
626
627 // Cast then dereference pointers to prefix array elements
628 symbol *pa = *(symbol **) cpa;
629 symbol *pb = *(symbol **) cpb;
632
633 // If the two elements are the same, examine the next one
634 while (*pa == *pb) {
635 pa--;
636 pb--;
637 }
638
639 // Make the comparison and return
640 if ( *pa > *pb) {
641 return -1;
642 } else {
643 return +1;
644 }
645}
646
647// pointerCompare
648//
649// Compare two pointers based on the memory location they point to.
650
651int pointerCompare( const void *pa, const void *pb ) {
652
653 symbol **a = (symbol **) pa;
654 symbol **b = (symbol **) pb;
655
656 if (*a < *b) {
657 return -1;
658 } else if (*a > *b) {
659 return 1;
660 } else {
661 return 0;
662 }
663}
664
665
666// Read the numbers file into an array of symbols.
667//
668// Each number is a symbol number; it is essential that the first
669// symbol (and no others) be 0 and the last symbol (and no others)
670// be 1.
671//
672// Return the number of numbers in the array.
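// As a sketch of the expected layout (symbol values other than the leading
// 0 and trailing 1 are hypothetical), clauses.numbers is simply a
// whitespace-separated stream of integers such as
//
//   0 <DOCUMENTSTART> 57 98 44 ... <DOCUMENTSTART> 61 13 ... 1
//
// with one DOCUMENTSTART code (whatever value suffix.h assigns it) marking
// the beginning of each document.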
673
674int readNumbers(symbol *numbers) {
675
676 char filename[FILENAME_MAX];
677 sprintf(filename, "%s/clauses.numbers", collection);
678 if (verbosity) {
679 cout << "Reading numbers from: " << filename << endl;
680 }
681
682 // Open the numbers file
683 ifstream inFile(filename, ios::in);
684 if (!inFile) {
685 cerr << "File " << filename << " could not be opened\n";
686 exit(1);
687 }
688
689 // Read the numbers file into the numbers array
690 symbol word;
691 cellcount length = 0;
692 numberOfDocuments = 0;
693 while ((inFile >> word) && (length < symbol_array_length)){
694 numbers[length++] = word;
695 if (word == DOCUMENTSTART) {
696 numberOfDocuments++;
697 }
698 }
699
700 // Make sure we were able to read all the numbers
701 if (length >= symbol_array_length) {
702 cerr << "Error: the symbol array is too short to hold " << filename
703 << endl << "It is currently set to " << symbol_array_length
704 << " and can be adjusted at the command line." << endl;
705 exit(1);
706 }
707
708 // Make sure the numbers file is intact
709 assert(numbers[0] == COLLECTIONSTART);
710 assert(numbers[length-1] == COLLECTIONEND);
711
712 // Record the length of the Input file
713 inputLength = length;
714
715 return length;
716}
717
718
719
720// Get Document Occurrence statistics
721//
722// Given a phrase, what documents does it occur in?
723
724cellcount getDocumentOccurrances(Phrase &p, cellcount *frequency) {
725
726 // cout << "searching for \""<< p.toString() << "\" in documents "
727 // << 0 << "-" << numberOfDocuments - 1 << endl;
728
729 // The number of documents in which this phrase occurs
730 cellcount df = 0;
731
732 // Initialise the document frequency array
733 for (cellindex i = 0; i < numberOfDocuments; i++) {
734 frequency[i] = 0;
735 }
736
737 // variables used to facilitate the search
738 cellindex begin;
739 cellindex end;
740 cellindex d;
741 symbol *target;
742 bool found;
743
744 // search for the document in which each occurrence of the phrase is found
745 for (cellcount i = p.firstSuffixIndex; i <= p.lastSuffixIndex; i++) {
746
747 // cout << "looking for phrase at suffixArray[" << i << "]\n";
748
749 target = suffixArray[i];
750 begin = 0;
751 end = numberOfDocuments - 1;
752 found = false;
753
754 // Search for the occurrence of a document delimiter that target
755 // occurs immediately after.
756 // We do this by performing a binary chop search on documentArray.
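 // For example (offsets hypothetical): if documentArray held pointers to
 // symbols[1], symbols[500] and symbols[1200], a target pointing at
 // symbols[730] would satisfy documentArray[1] < target < documentArray[2],
 // so it belongs to document 1 (written out later as "d2").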
757 while (!found) {
758
759 // cout << "searching for " << (cellindex) target << " in "
760 // << begin << " - " << end << endl;
761
762 assert (begin <= end);
763
764 // If the beginning and end of the interval are the same,
765 // then we've found the correct document
766 if (begin == end) {
767 if (frequency[begin] == 0) {
768 df++;
769 }
770 frequency[begin]++;
771 found = true;
772 }
773
774 // Otherwise, examine a new document midway through the begin-end
775 // interval and see if it is the one.
776 else {
777 d = (begin + end) / 2;
778 if (target > documentArray[d]) {
779 // If target address is greater than this, but this is the last document,
780 // then this must be the one we want. Or, if target is greater than
781 // this one but less than the next, this must be the one we want.
782 if ((d == numberOfDocuments - 1) || (target < documentArray[d+1])) {
783 if (frequency[d] == 0) {
784 df++;
785 }
786 frequency[d]++;
787 found = true;
788 } else {
789 // otherwise we know to search later in the document set
790 begin = d + 1;
791 }
792 } else {
793 // search earlier in the document set
794 end = d - 1;
795 }
796 }
797 }
798 }
799 return df;
800}
801
802
803
804
805
806
807// phraseExpansionMemory : Which phrases have we expanded?
808//
809// A set of utilities for keeping track of which phrases we have expanded.
810// We don't want to expand a phrase more than once, after all.
811//
812// This REALLY ought to be in its own class, but it works so that's okay.
813//
814// Phrases are identified by their firstSuffixIndex and length.
815//
816// Functions provided are:
817// void initialisePhraseMemory()
818// void rememberThisPhrase(index, length)
819// bool isPhraseStored(index, length)
820// void deletePhraseMemory()
821//
822// Internally, we will have two separate cases:
823//
824// Phrases of length 1-8:
825// unsigned char phraseMemory[inputLength]
826// is an array where each cell "remembers" the corresponding index in the
827// suffixArray, and each of the 8 bits of the cell correspond to the phrases
828// of length 1, 2... 8.
829// Eventually, we will make this disk-based (i.e. store the array in a file).
830//
831// Phrases of length 9+:
832// file hashTableFile
833// file listOfEntries
834// The first file is a hash table; each phrase maps to one of its cells, which
835// contains either 0 (empty, no occurrence) or a number which is an entry number
836// in the second file. This file contains a "list" of entries. Each consists of
837// three numbers: the suffixArray index of the phrase, the length of the phrase,
838// and the entry number of the next phrase with the same hash.
839//
840
841
842unsigned char *phraseMemory;
843
844void initialiseLongPhraseMemory();
845void rememberThisLongPhrase(cellindex index, cellcount length);
846bool isLongPhraseStored(cellindex index, cellcount length);
847void deleteLongPhraseMemory();
848
849
850void initialisePhraseMemory() {
851
852 phraseMemory = new unsigned char[inputLength];
853
854 // to begin with, everything is empty
855 for (cellcount i = 0; i < inputLength; i++) {
856 phraseMemory[i] = 0;
857 }
858
859 // initialise the hashTable of long phrases
860 initialiseLongPhraseMemory();
861
862}
863
864void rememberThisPhrase(cellindex index, cellcount length) {
865
866 // if the phrase is very long, use the file-based system
867 if (length > 8) {
868 rememberThisLongPhrase(index, length);
869 return;
870 }
871
872 // create a char with just the bit corresponding to length set
873 unsigned char newbit = 1;
874 for (cellcount i = 1; i < length; i++) {
875 newbit <<= 1;
876 }
877
878 // set that bit in the memory array at position index
879 phraseMemory[index] |= newbit;
880}
881
882
883bool isPhraseStored(cellindex index, cellcount length) {
884
885 // if the phrase is very long, use the file-based system
886 if (length > 8) {
887 return isLongPhraseStored(index, length);
888 }
889
890 // create a char with just the bit corresponding to length set
891 unsigned char newbit = 1;
892 for (cellcount i = 1; i < length; i++) {
893 newbit <<= 1;
894 }
895
896 // return true if that bit is set in the memory array at position index
897 return (phraseMemory[index] & newbit);
898}
899
900void deletePhraseMemory() {
901 delete [] phraseMemory;
902 deleteLongPhraseMemory();
903}
904
905
906
907// Files etc used to store "long" equivalents of the above
908
909fstream hashTableFile;
910char hashTableFileName[FILENAME_MAX];
911fstream listOfEntries;
912char listOfEntriesName[FILENAME_MAX];
913cellindex nextEntryNumber;
914
915const cellcount bigPrime = 7919;
916
917
918void initialiseLongPhraseMemory() {
919
920 cellindex example = 0;
921
922 sprintf(hashTableFileName, "%s/hashTable", collection);
923 sprintf(listOfEntriesName, "%s/hashLists", collection);
924
925
926 // create the new hashtable
927 if (verbosity > 1) {
928 cout << "Initialising hashTable: " << hashTableFileName << endl;
929 }
930 hashTableFile.open(hashTableFileName, ios::in | ios::out);
931 for (cellcount i = 0; i < bigPrime; i++) {
932 hashTableFile.write((char *) &example, sizeof(example));
933 }
934
935 // create the list of phrases
936 if (verbosity > 1) {
937 cout << "Initialising list of hashtable entries: " << listOfEntriesName << endl;
938 }
939 listOfEntries.open(listOfEntriesName, ios::in | ios::out);
940 listOfEntries.write((char *) &example, sizeof(example));
941 listOfEntries.write((char *) &example, sizeof(example));
942 listOfEntries.write((char *) &example, sizeof(example));
943 nextEntryNumber = 1;
944}
945
946
947void rememberThisLongPhrase(cellindex index, cellcount length) {
948
949 // cout << "rememberThisLongPhrase(" << index << ", " << length << ")\n";
950
951 cellindex hashOffset = ((index + length) % bigPrime) * sizeof(cellindex);
952 cellindex pointer;
953 cellindex zero = 0;
954 cellindex readp = 0;
955 cellindex readi = 0;
956 cellindex readl = 0;
957
958 hashTableFile.seekg(hashOffset);
959 hashTableFile.read((char *) &pointer, sizeof(cellindex));
960
961 if (pointer == 0) {
962 // There is no entry at all in the hash table for this entry
963 // so create one
964
965 pointer = nextEntryNumber++;
966 hashTableFile.seekg(hashOffset);
967 hashTableFile.write((char *) &pointer, sizeof(cellindex));
968
969 listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
970 listOfEntries.write((char *) &zero, sizeof(cellindex));
971 listOfEntries.write((char *) &index, sizeof(cellindex));
972 listOfEntries.write((char *) &length, sizeof(cellindex));
973
974 } else {
975 // There is a list starting at this hash value, so the phrase may
976 // be already remembered, or it might need to be appended
977
978 while (pointer != 0) {
979 // Read the entry pointed to by pointer
980 listOfEntries.seekg(pointer * sizeof(cellindex) * 3);
981 listOfEntries.read((char *) &readp, sizeof(cellindex));
982 listOfEntries.read((char *) &readi, sizeof(cellindex));
983 listOfEntries.read((char *) &readl, sizeof(cellindex));
984
985 // cout << "read " << pointer << ", " << readp << ", " << readi << ", " << readl << endl;
986
987 if ((readi == index) && (readl == length)) {
988 // we've found that we've already stored it
989 return;
990 } else if (readp == 0) {
991 // we've reached the end of the list. Add a new entry.
992 listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
993 listOfEntries.write((char *) &nextEntryNumber, sizeof(cellindex));
994 pointer = nextEntryNumber++;
995
996 listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
997 listOfEntries.write((char *) &zero, sizeof(cellindex));
998 listOfEntries.write((char *) &index, sizeof(cellindex));
999 listOfEntries.write((char *) &length, sizeof(cellindex));
1000 return;
1001 } else {
1002 // go on to the next node
1003 pointer = readp;
1004 }
1005 }
1006 }
1007
1008
1009}
1010
1011bool isLongPhraseStored(cellindex index, cellcount length) {
1012
1013 // cout << "isLongPhraseExpanded(" << index << ", " << length << ")\n";
1014
1015 cellindex hashOffset = ((index + length) % bigPrime) * sizeof(cellindex);
1016 cellindex pointer;
1017 cellindex readp = 0;
1018 cellindex readi = 0;
1019 cellindex readl = 0;
1020
1021 // Find the phrase in the hashFile
1022 hashTableFile.seekg(hashOffset);
1023 hashTableFile.read((char *) &pointer, sizeof(cellindex));
1024
1025 if (pointer == 0) {
1026 // There is no entry at all in the hash table for this entry
1027 // so nothing is stored
1028 return false;
1029
1030 } else {
1031 // There is a list starting at this hash value, so the phrase may
1032 // be already remembered in that list
1033 while (pointer != 0) {
1034 // Read the entry pointed to by pointer
1035 listOfEntries.seekg(pointer * sizeof(cellindex) * 3);
1036 listOfEntries.read((char *) &readp, sizeof(cellindex));
1037 listOfEntries.read((char *) &readi, sizeof(cellindex));
1038 listOfEntries.read((char *) &readl, sizeof(cellindex));
1039
1040 if ((readi == index) && (readl == length)) {
1041 // we've found the phrase stored here
1042 return true;
1043 } else {
1044 // go on to the next node
1045 pointer = readp;
1046 }
1047 }
1048 }
1049 return false;
1050}
1051
1052void deleteLongPhraseMemory() {
1053 // remove the hash & other files
1054
1055 hashTableFile.close();
1056 listOfEntries.close();
1057 remove(hashTableFileName);
1058 remove(listOfEntriesName);
1059
1060}
1061
1062
1063
1064
1065// Read the collection statistics file
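// The statistics file is expected to hold whitespace-separated key/value
// pairs, one per recognised key; for example (values hypothetical):
//
//   first_stopword 2
//   last_stopword 412
//   first_contentword 413
//   last_contentword 28791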
1066void readStatistics() {
1067
1068 // open the statistics file
1069 char filename[FILENAME_MAX];
1070 sprintf(filename, "%s/clauses.stats", collection);
1071
1072 // Open the file
1073 ifstream inFile(filename, ios::in);
1074 if (!inFile) {
1075 cerr << "File " << filename << " could not be opened\n";
1076 exit(1);
1077 }
1078
1079 // Read the statistics file one key-value pair at a time
1080 char key[1000];
1081 symbol value;
1082 while (inFile >> key >> value){
1083 if (strcmp(key, "first_stopword") == 0) {
1084 firstStopSymbol = value;
1085 } else if (strcmp(key, "last_stopword") == 0) {
1086 lastStopSymbol = value;
1087 } else if (strcmp(key, "first_contentword") == 0) {
1088 firstContentSymbol = value;
1089 } else if (strcmp(key, "last_contentword") == 0) {
1090 lastContentSymbol = value;
1091 }
1092 }
1093 inFile.close();
1094
1095 // Make sure we have the information we need
1096 if (!(firstStopSymbol && lastStopSymbol && firstContentSymbol && lastContentSymbol)) {
1097 cerr << "Statistics file incomplete" << endl;
1098 exit(1);
1099 }
1100}
1101
1102
1103
1104
1105