source: trunk/gsdl/src/phind/generate/suffix.cpp@ 2487

Last change on this file since 2487 was 2487, checked in by sjboddie, 23 years ago

Changes to get phind working under windows

1/**********************************************************************
2 *
3 * suffix.cpp -- Extract the repeated phrases in the input using
4 * suffix and prefix arrays.
5 *
6 * Copyright 2000 Gordon W. Paynter
7 * Copyright 2000 The New Zealand Digital Library Project
8 *
9 * A component of the Greenstone digital library software
10 * from the New Zealand Digital Library Project at the
11 * University of Waikato, New Zealand.
12 *
13 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version 2 of the License, or
16 * (at your option) any later version.
17 *
18 * This program is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU General Public License for more details.
22 *
23 * You should have received a copy of the GNU General Public License
24 * along with this program; if not, write to the Free Software
25 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26 *
27 *********************************************************************/
28
29#include <assert.h>
30#include <math.h>
31#include <stdio.h>
32#include <stdlib.h>
33#include <string.h>
34
35#if defined(GSDL_USE_IOS_H)
36# include <fstream.h>
37# include <iostream.h>
38#else
39# include <fstream>
40# include <iostream>
41#endif
42
43#if defined(GSDL_USE_STL_H)
44# if defined(GSDL_USE_ALGO_H)
45# include <algo.h>
46# else
47# include <algorithm.h>
48# endif
49# include <vector.h>
50#else
51# include <algorithm>
52# include <vector>
53#endif
54#include <stl_heap.h>
55
56
57#include "suffix.h"
58#include "phrase.h"
59
60// Global variables declared in suffix.h
61cellcount inputLength;
62
63symbol *symbols;
64symbol **suffixArray;
65check *suffixCheck;
66symbol **prefixArray;
67check *prefixCheck;
68
69
70// How many documents are in this collection?
71cellcount numberOfDocuments;
72symbol **documentArray;
73
74 // Do we accept any phrase, or do we eliminate those that begin or end with a stopword?
75int phraseMode = ANYPHRASE; //STOPWORDS;
76
77// The filestem of the collection's phindex directory
78char collection[FILENAME_MAX];
79
80int suffixCompare(const void *, const void *);
81int prefixCompare(const void *, const void *);
82int pointerCompare(const void *, const void *);
83
84int readNumbers();
85void readStatistics();
86
87void getMinimalExpansions(Phrase &p, vector<Phrase> &results);
88cellcount getDocumentOccurrances(Phrase &p, cellcount *frequency);
89
90// The ranges of the stopword and content-word symbols for the collection
91symbol firstStopSymbol = 0;
92symbol lastStopSymbol = 0;
93symbol firstContentSymbol = 0;
94symbol lastContentSymbol = 0;
95
96
97
98
99// Phrase memory
100// We have to "remember" each phrase that we've expanded
101void initialisePhraseMemory();
102void rememberThisPhrase(cellindex index, cellcount length);
103bool isPhraseStored(cellindex index, cellcount length);
104void deletePhraseMemory();
105
106
107// how much output do we want?
108int verbosity = 1;
109
110
111int main (int argc, char * argv[]) {
112
113 // Command-line arguments
114 // argv[1] is the phindex directory
115 // argv[2] is the mode, where 1 is stopword mode
116 // argv[3] is the verbosity (optional)
117 if (argc < 3) {
118 cerr << "Usage: " << argv[0] << " phind-directory mode [verbosity]" << endl;
119 exit(1);
120 }
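 // For example (the directory path below is invented), the program might be
 // run as
 //
 //   suffix /greenstone/collect/demo/phindex 1 2
 //
 // to extract phrases in stopword mode with verbosity 2.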
121
122 // collection directory
123 strcpy(collection, argv[1]);
124
125 // mode parameter
126 phraseMode = atoi(argv[2]);
127 assert((phraseMode == STOPWORDS) || (phraseMode == ANYPHRASE));
128
129 // optional verbosity parameter
130 if (argc == 4) {
131 verbosity = atoi(argv[3]);
132 assert (verbosity >= 0);
133 }
134
135 if (verbosity) {
136 cout << "Suffix phrase extraction program" << endl;
137 }
138
139 if (verbosity > 1) {
140 if (phraseMode == STOPWORDS) {
141 cout << "Stopwords mode: no phrase may begin or end with a stopword" << endl;
142 } else {
143 cout << "AllPhrase mode: extract every phrase that occurs more than once" << endl;
144 }
145 }
146
147 // Read the statistics file
148 readStatistics();
149
150 // Read the numbers file
151 readNumbers();
152
153 // Create the suffix & prefix arrays
154 suffixArray = new symbol *[inputLength];
155 prefixArray = new symbol *[inputLength];
156 suffixCheck = new check[inputLength];
157 prefixCheck = new check[inputLength];
158 if (prefixCheck == NULL) {
159 cerr << "Suffix error: not enough memory to hold " << inputLength
160 << " symbols." << endl;
161 exit(2);
162 }
163
164 // Initialise prefix and suffix arrays
165 for (cellcount j = 0; j < inputLength; j++) {
166 suffixArray[j] = &symbols[j];
167 prefixArray[j] = &symbols[j];
168 }
169 qsort(suffixArray, inputLength, sizeof(symbol *), suffixCompare);
170 qsort(prefixArray, inputLength, sizeof(symbol *), prefixCompare);
171
172
173 // Create the document arrays
174 if (numberOfDocuments == 0) {
175 cerr << "There are no documents in this collection!" << endl;
176 exit(1);
177 }
178 if (verbosity > 1) {
179 cout << "Allocating document arrays for " << numberOfDocuments << " documents" << endl;
180 }
181
182 // The document frequency array is used to count the number of times
183 // each phrase occurs in each document. The number of documents in
184 // which a phrase occurs is stored in df.
185 frequency *documentFrequency = new frequency[numberOfDocuments];
186 frequency df;
187
188 // documentArray will be searched in order to discover which document
189 // each phrase occurs in.
190 documentArray = new symbol *[numberOfDocuments];
191
192 // Discover all the DOCUMENTSTART symbols and store as a phrase
193 cellindex d = 0;
194 while (*suffixArray[d] != DOCUMENTSTART) {
195 d++;
196 }
197 Phrase p(suffixArray[d], 1, SUFFIX);
198 p.findFirstAndLastSuffix(d, inputLength-1);
199
200 // Insert the document locations (as pointers) into documentArray
201 for (cellcount i = 0; i < p.suffixFrequency; i++) {
202 documentArray[i] = suffixArray[i + p.firstSuffixIndex];
203 }
204
205 // Sort the document array into ascending order of raw pointer value
206 qsort(documentArray, numberOfDocuments, sizeof(symbol *), pointerCompare);
207
208
209 // Extract phrases
210 //
211 // We will make several passes over the data, in each case considering
212 // a set of input phrases and generating a set of output phrases, which
213 // we will expand in later passes.
214 //
215 // The input phrases in the first pass will be the vocabulary.
216 // In later passes, the input phrases will be the output phrases of the
217 // previous pass.
218 //
219 // In each pass we will consider each input phrase in turn. If we
220 // have seen it before, we will ignore it. Otherwise, we will expand
221 // it and add its expansions to the set of output phrases.
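 //
 // As a made-up illustration: pass 1 might expand the single word "forest"
 // into the two-word phrases "rain forest" and "forest products"; pass 2
 // reads those back from outPhrase.1 and expands them further (say, to
 // "tropical rain forest"), and so on until a pass emits no new phrases.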
222
223 // Store the phrase data in the phrases file
224 char phraseDataName[FILENAME_MAX];
225 sprintf(phraseDataName, "%s/phrases", collection);
226 ofstream phraseData(phraseDataName, ios::out);
227 if (!phraseData) {
228 cout << "File " << phraseDataName << " could not be opened\n";
229 exit(1);
230 }
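 // Each record written to this file has the same colon-separated shape in
 // every pass (the values below are invented):
 //
 //   index-length:text:tf:n:exp1,exp2,...:df:d2,12;d7;d9,3
 //
 // where index-length identifies the phrase in the suffix array, tf is its
 // total frequency, n is the number of minimal expansions that follow (each
 // written as its own index-length pair), df is the document frequency, and
 // the last field lists the documents, with per-document counts when > 1.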
231
232 // Count the number of phrases output
233 unsigned long int phraseCounter = 0;
234
235 // Set up the phrase expansion memory.
236 // We need this so that we don't expand a phrase more than once
237 initialisePhraseMemory();
238
239 // The current pass number
240 int phrasePass = 1;
241
242
243 // PASS NUMBER 1
244 if (verbosity > 1) {
245 cout << "Starting pass " << phrasePass << endl;
246 }
247
248 // We need an input file, for phrases we are about to examine, and an
249 // output file, for phrases still to come.
250 ifstream inPhrase;
251 char inPhraseName[FILENAME_MAX];
252 ofstream outPhrase;
253 char outPhraseName[FILENAME_MAX];
254 unsigned long int outPhraseCounter = 0;
255
256 // On the first pass, simply work through the vocabulary
257 sprintf(outPhraseName, "%s/outPhrase.1", collection);
258 outPhrase.open(outPhraseName, ios::out);
259 if (!outPhrase) {
260 cerr << "File " << outPhraseName << " could not be opened\n";
261 exit(1);
262 }
263
264 // Iterate over the different symbols by working through the suffix array
265 vector<Phrase> result;
266 cellindex ij = 0;
267 char *tmpString;
268
269 while (ij < inputLength) {
270
271 // make a new phrase of length 1
272 p = Phrase(suffixArray[ij], 1, SUFFIX);
273 p.findFirstAndLastSuffix(ij, inputLength-1);
274
275 // cout << "cell " << ij << " - " << p.toString() << endl;
276
277 // We ignore this symbol if it occurs only once, if it is a delimiter,
278 // or if we are in stopwords mode and it is a stopword
279 //
280 // We could imagine a new mode/command-line option, which is like
281 // STOPWORDS but without this restriction. This would let you browse
282 // from "the" to "the AGRIS" for example, but not from "AGRIS" to
283 // "the AGRIS" (where the is a stopword and AGRIS a content word).
284 // The system used to work like this; it is easy to implement, but
285 // it explodes the size of the indexes. So: would it be useful?
286 if (!((p.suffixFrequency <= 1) ||
287 // (*suffixArray[ij] != 23054) ||
288 (*suffixArray[ij] <= LASTDELIMITER) ||
289 ((phraseMode == STOPWORDS) && (*suffixArray[ij] <= lastStopSymbol)))) {
290
291 // Get minimal expansions of the phrase
292 getMinimalExpansions(p, result);
293
294 if (!result.empty()) {
295
296 // Remember that we have expanded this phrase
297 rememberThisPhrase(ij, 1);
298
299 // write the phrase text
300 tmpString = p.toString();
301 phraseData << ij << "-1:" << tmpString << ":" << p.suffixFrequency << ":"
302 << result.size() << ":";
303 delete [] tmpString;
304
305 // write the results
306 for (cellcount k = 0; k < result.size(); k++) {
307 if (k) {
308 phraseData << ",";
309 }
310 phraseData << result[k].firstSuffixIndex << "-" << result[k].length;
311 outPhrase << result[k].firstSuffixIndex << " " << result[k].length << endl;
312 outPhraseCounter++;
313 }
314 result.clear();
315
316 // Write the documents in which this phrase occurs
317 df = getDocumentOccurrances(p, documentFrequency);
318 phraseData << ":" << df << ":";
319
320 // write the documents
321 for (cellcount m = 0, first = 1; m < numberOfDocuments; m++) {
322 if (documentFrequency[m]) {
323 if (first) {
324 first = 0;
325 } else {
326 phraseData << ";";
327 }
328 // Output the document number. Note that here we've numbered the
329 // N documents from 0 to N-1, but later they'll be 1-N. Thus we
330 // add 1 to the document id when we output it.
331 phraseData << "d" << (m+1);
332 // Next, output the frequency with which the document occurs, but
333 // only if it is > 1.
334 if (documentFrequency[m] > 1) {
335 phraseData << "," << documentFrequency[m];
336 }
337 }
338 }
339
340 phraseData << endl;
341 phraseCounter++;
342
343 // feedback
344 if (verbosity) {
345 if (phraseCounter % 1000 == 0) {
346 tmpString = p.toString();
347 cout << "phrase " << phraseCounter << ": "
348 << "cell " << p.firstSuffixIndex << " - " << tmpString << endl;
349 delete [] tmpString;
350 }
351 }
352 }
353 }
354 ij = p.lastSuffixIndex + 1;
355 }
356 outPhrase.close();
357
358 // REMAINING PASSES
359 // The previous outPhrase file forms the input to each new pass
360 cellcount start, length;
361 while (outPhraseCounter > 0) {
362
363 // Start a new pass
364 phrasePass++;
365 if (verbosity) {
366 cout << "Starting pass " << phrasePass << endl;
367 }
368
369 // Open the input file
370 sprintf(inPhraseName, "%s/outPhrase.%d", collection, phrasePass - 1);
371 inPhrase.open(inPhraseName, ios::in);
372 if (!inPhrase) {
373 cerr << "File " << inPhraseName << " could not be opened\n";
374 exit(1);
375 }
376
377 // Open the output file
378 sprintf(outPhraseName, "%s/outPhrase.%d", collection, phrasePass);
379 outPhrase.open(outPhraseName, ios::out);
380 if (!outPhrase) {
381 cerr << "File " << outPhraseName << " could not be opened\n";
382 exit(1);
383 }
384 outPhraseCounter = 0;
385
386 // Process each phrase
387 while(inPhrase >> start >> length) {
388
389 // Ignore the phrase if we have expanded it before
390 if (isPhraseStored(start, length)) {
391 continue;
392 }
393
394 // Remember that we have examined this phrase
395 rememberThisPhrase(start, length);
396
397 // Find the phrase in the suffixarray
398 p = Phrase(suffixArray[start], length, SUFFIX);
399 p.findFirstAndLastSuffix(start, inputLength-1);
400
401 // cout << "index " << start << ", length " << length << " - " << p.toString() << endl;
402
403
404 // Ignore the phrase if it only occurs once
405 if (p.suffixFrequency < 2) {
406 continue;
407 }
408
409
410 // Write the phrase text
411 tmpString = p.toString();
412 phraseData << start << "-" << length << ":" << tmpString << ":"
413 << p.suffixFrequency << ":";
414 delete [] tmpString;
415
416
417 // Expand the phrase, if it is at most 8 words long
418 if (length <= 8) {
419
420 // Get the minimal expansions for this phrase
421 getMinimalExpansions(p, result);
422
423 // write the results
424 phraseData << result.size() << ":";
425
426 for (cellcount i = 0; i < result.size(); i++) {
427 if (i) {
428 phraseData << ",";
429 }
430 phraseData << result[i].firstSuffixIndex << "-" << result[i].length;
431 outPhrase << result[i].firstSuffixIndex << " " << result[i].length << endl;
432 outPhraseCounter++;
433 }
434 result.clear();
435
436 } else {
437 // phrase is too long to expand further
438 phraseData << "0:";
439 }
440
441
442 // Write the documents in which this phrase occurs
443 df = getDocumentOccurrances(p, documentFrequency);
444 phraseData << ":" << df << ":";
445
446 // write the documents
447 for (cellcount i = 0, first = 1; i < numberOfDocuments; i++) {
448 if (documentFrequency[i]) {
449 if (first) {
450 first = 0;
451 } else {
452 phraseData << ";";
453 }
454 // Output the document number. Note that here we've numbered the
455 // N documents from 0 to N-1, but later they'll be 1-N. Thus we
456 // add 1 to the document id when we output it.
457 phraseData << "d" << (i+1);
458 // Next, output the frequency with which the document occurs, but
459 // only if it is > 1.
460 if (documentFrequency[i] > 1) {
461 phraseData << "," << documentFrequency[i];
462 }
463 }
464 }
465
466 phraseData << endl;
467 phraseCounter++;
468
469 // feedback
470 if (verbosity) {
471 if (phraseCounter % 1000 == 0) {
472 tmpString = p.toString();
473 cout << "phrase " << phraseCounter << ": "<< "start " << start
474 << ", length " << length << " - " << tmpString << endl;
475 delete [] tmpString;
476 }
477 }
478
479 }
480
481 inPhrase.close();
482 outPhrase.close();
483 }
484
485 phraseData.close();
486 deletePhraseMemory();
487
488 delete [] documentFrequency;
489 delete [] symbols;
490 delete [] suffixArray;
491 delete [] prefixArray;
492 delete [] suffixCheck;
493 delete [] prefixCheck;
494 delete [] documentArray;
495
496
497
498 cout << endl << "Done: " << phraseCounter << " phrases in " << phraseDataName << endl;
499 return 0;
500}
501
502
503// Get Minimal Expansions
504//
505// Get the set of "minimal" expansions of a phrase p, using the
506// algorithm described in the documentation.
507//
508 // The expansions are appended to the results vector.
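//
// A note on the bookkeeping, inferred from the code below: suffixCheck[i]
// and prefixCheck[i] record the length of an already-accepted expansion
// covering position i of the suffix or prefix array, so a candidate whose
// first index falls inside a covered range is ignored (steps 3.2.1 and
// 3.3.1) because a shorter phrase covering the same occurrences has
// already been kept.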
509
510void getMinimalExpansions(Phrase &p, vector<Phrase> &results) {
511
512 // 1. Initialise the candidate vector and the check arrays
513 vector<Phrase> candidates;
514 for (cellcount j = 0; j < inputLength; j++) {
515 suffixCheck[j] = 0;
516 prefixCheck[j] = 0;
517 }
518
519 // 2. Expand the phrase p
520
521 // 2.1 Create the candidate set
522 p.initialSuffixCandidates(candidates);
523 p.initialPrefixCandidates(candidates);
524
525 // 2.2 Sort the candidates by phrase length
526 make_heap(candidates.begin(), candidates.end(), isLonger);
527
528 // 3. While candidates is non-empty, confirm the phrases it
529 // contains, expanding them as required
530 while (!candidates.empty()) {
531
532 // 3.1 Get next candidate
533 pop_heap(candidates.begin(), candidates.end(), isLonger);
534 Phrase c = candidates.back();
535 candidates.pop_back();
536
537 // 3.2 If we know there are no unique right extensions
538 // (i.e. this is a phrase drawn from the suffix array)
539 if (!c.hasUniqueSuffixExtension()) {
540
541 c.ensurePrefixFound();
542
543 // 3.2.1 Ignore candidate if we have used a subphrase instead
544 if (suffixCheck[c.firstSuffixIndex] || prefixCheck[c.firstPrefixIndex]) {
545 // cout << "ignoring" << endl;
546 }
547
548 // 3.2.2 If candidate has a unique left (prefix) extension,
549 // Then extend it and add it back into Candidates.
550 else if (c.hasUniquePrefixExtension()) {
551 // cout << "expanding prefix " << c.toString() << "=> ";
552 c.expandUniquePrefixExtensionByOne();
553 candidates.push_back(c);
554 push_heap(candidates.begin(), candidates.end(), isLonger);
555 }
556
557 // 3.2.3 If candidate has no unique left (prefix) extension,
558 // Then add it to the list of results.
559 else {
560 // cout << "no unique prefix, add to results" << endl;
561 results.push_back(c);
562 for (cellcount i = c.firstSuffixIndex; i <= c.lastSuffixIndex; i++) {
563 suffixCheck[i] = c.length;
564 }
565 for (cellcount ik = c.firstPrefixIndex; ik <= c.lastPrefixIndex; ik++) {
566 prefixCheck[ik] = c.length;
567 }
568 }
569 }
570
571 // 3.3 If we know there are no unique left extensions,
572 // Then do the same as in 3.2, but exchange suffix & prefix
573 else if (!c.hasUniquePrefixExtension()) {
574
575 c.ensureSuffixFound();
576
577 // 3.3.1
578 if (suffixCheck[c.firstSuffixIndex] || prefixCheck[c.firstPrefixIndex]) {
579
580 }
581
582 // 3.3.2
583 else if (c.hasUniqueSuffixExtension()) {
584 c.expandUniqueSuffixExtensionByOne();
585 candidates.push_back(c);
586 push_heap(candidates.begin(), candidates.end(), isLonger);
587 }
588
589 // 3.3.3
590 else {
591 results.push_back(c);
592 for (cellcount i = c.firstSuffixIndex; i <= c.lastSuffixIndex; i++) {
593 suffixCheck[i] = c.length;
594 }
595 for (cellcount ijk = c.firstPrefixIndex; ijk <= c.lastPrefixIndex; ijk++) {
596 prefixCheck[ijk] = c.length;
597 }
598
599 }
600 }
601 }
602}
603
604
605// suffixCompare
606//
607// Compare two pointers into a suffix array. We use this in the
608 // qsort function, so the inputs are pointers to pointers.
609 //
610 // Return -1 if (a < b), otherwise (a > b) so return +1.
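//
// Illustrative example (the symbol numbers are invented): if one suffix
// begins 45 102 7 ... and the other begins 45 102 9 ..., the loop below
// steps past the equal symbols 45 and 102 and returns -1 because 7 < 9,
// so the suffix array ends up in lexicographic order of symbol sequences.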
611
612int suffixCompare(const void *cpa, const void *cpb) {
613
614 // Cast then dereference pointers to suffix array elements
615 symbol **ppa = (symbol **) cpa;
616 symbol **ppb = (symbol **) cpb;
617 symbol *pa = *ppa;
618 symbol *pb = *ppb;
619
620 // If the two elements are the same, examine the next one
621 while (*pa == *pb) {
622 pa++;
623 pb++;
624 }
625
626 // Make the comparison and return
627 if ( *pa < *pb) {
628 return -1;
629 } else {
630 return +1;
631 }
632}
633
634
635// prefixCompare
636//
637// Compare two pointers into a prefix array. We use this in the
638 // qsort function, so the inputs are pointers to pointers.
639 //
640 // Return -1 if (a > b), otherwise (a < b) so return +1.
641
642int prefixCompare(const void *cpa, const void *cpb) {
643
644 // Cast then dereference pointers to prefix array elements
645 symbol **ppa = (symbol **) cpa;
646 symbol **ppb = (symbol **) cpb;
647 symbol *pa = *ppa;
648 symbol *pb = *ppb;
649
650 // If the two elements are the same, examine the next one
651 while (*pa == *pb) {
652 pa--;
653 pb--;
654 }
655
656 // Make the comparison and return
657 if ( *pa > *pb) {
658 return -1;
659 } else {
660 return +1;
661 }
662}
663
664// simpleCompare
665//
666// Compare two pointers based on the memory location they point to.
667
668int pointerCompare( const void *pa, const void *pb ) {
669
670 symbol **a = (symbol **) pa;
671 symbol **b = (symbol **) pb;
672
673 if (*a < *b) {
674 return -1;
675 } else if (*a > *b) {
676 return 1;
677 } else {
678 return 0;
679 }
680}
681
682
683// Read the clauses.numbers file into the "symbols" array.
684//
685// Each number in the file is a symbol number; it is essential that
686// the first symbol (and no others) be COLLECTIONSTART and the last
687// symbol (and no others) be COLLECTIONEND.
688//
689// Return the number of numbers in the array.
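//
// Schematically the file looks like this (the symbolic names stand for the
// numeric codes defined in suffix.h; the word numbers are invented):
//
//   COLLECTIONSTART DOCUMENTSTART 53 120 87 ... DOCUMENTSTART 91 53 ... COLLECTIONEND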
690
691int readNumbers() {
692
693 char filename[FILENAME_MAX];
694 sprintf(filename, "%s/clauses.numbers", collection);
695 if (verbosity) {
696 cout << "Reading numbers file: " << filename << endl;
697 }
698
699 // Open the numbers file
700 ifstream inFile(filename, ios::in);
701 if (!inFile) {
702 cerr << "File " << filename << " could not be opened\n";
703 exit(1);
704 }
705
706 // Count the number of symbols
707 inputLength = 0;
708 symbol word;
709 while (inFile >> word) {
710 inputLength++;
711 }
712 inFile.close();
713
714 // Allocate the symbols array
715 if (verbosity > 1) {
716 cout << "Allocating symbol arrays for " << inputLength << " symbols" << endl;
717 }
718 symbols = new symbol[inputLength];
719 if (symbols == NULL) {
720 cerr << "Suffix error: not enough memory to hold " << inputLength
721 << " symbols." << endl;
722 exit(2);
723 }
724
725 // Read the numbers file into the symbols array
726 if (verbosity > 2) {
727 cout << "Reading the numbers" << endl;
728 }
729 inFile.open(filename, ios::in);
730 cellcount next = 0;
731 numberOfDocuments = 0;
732 while (inFile >> word) {
733 symbols[next++] = word;
734 if (word == DOCUMENTSTART) {
735 numberOfDocuments++;
736 }
737 }
738 inFile.close();
739
740 // Make sure the numbers file is intact
741 assert(symbols[0] == COLLECTIONSTART);
742 assert(symbols[next-1] == COLLECTIONEND);
743
744 return inputLength;
745}
746
747
748
749 // Get Document Occurrence statistics
750//
751// Given a phrase, what documents does it occur in?
752
753cellcount getDocumentOccurrances(Phrase &p, cellcount *frequency) {
754
755 // cout << "searching for \""<< p.toString() << "\" in documents "
756 // << 0 << "-" << numberOfDocuments - 1 << endl;
757
758 // The number of documents in which this phrase occurs
759 cellcount df = 0;
760
761 // Initialise the document frequency array
762 for (cellindex i = 0; i < numberOfDocuments; i++) {
763 frequency[i] = 0;
764 }
765
766 // variables used to facilitate the search
767 cellindex begin;
768 cellindex end;
769 cellindex d;
770 symbol *target;
771 bool found;
772
773 // search for the document in which each occurrence of the phrase is found
774 for (cellcount j = p.firstSuffixIndex; j <= p.lastSuffixIndex; j++) {
775
776 // cout << "looking for phrase at suffixArray[" << j << "]\n";
777
778 target = suffixArray[j];
779 begin = 0;
780 end = numberOfDocuments - 1;
781 found = false;
782
783 // Search for the occurrence of a document delimiter that target
784 // occurs immediately after.
785 // We do this by performing a binary chop search on documentArray.
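 //
 // (Inferred from the code: documentArray holds the addresses of the
 // DOCUMENTSTART symbols in ascending order, so target belongs to
 // document d exactly when documentArray[d] < target and either d is the
 // last document or target < documentArray[d+1].)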
786 while (!found) {
787
788 // cout << "searching for " << (cellindex) target << " in "
789 // << begin << " - " << end << endl;
790
791 assert (begin <= end);
792
793 // If the beginning and end of the interval are the same,
794 // then we've found the correct document
795 if (begin == end) {
796 if (frequency[begin] == 0) {
797 df++;
798 }
799 frequency[begin]++;
800 found = true;
801 }
802
803 // Otherwise, examine a new document midway through the begin-end
804 // interval and see if it is the one.
805 else {
806 d = (begin + end) / 2;
807 if (target > documentArray[d]) {
808 // If the target address is greater than this one, but this is the last document,
809 // then this must be the one we want. Or, if target is greater than
810 // this one but less than the next, this must be the one we want.
811 if ((d == numberOfDocuments - 1) || (target < documentArray[d+1])) {
812 if (frequency[d] == 0) {
813 df++;
814 }
815 frequency[d]++;
816 found = true;
817 } else {
818 // otherwise we know to search later in the document set
819 begin = d + 1;
820 }
821 } else {
822 // search earlier in the document set
823 end = d - 1;
824 }
825 }
826 }
827 }
828 return df;
829}
830
831
832
833
834
835
836// phraseExpansionMemory : Which phrases have we expanded?
837//
838// A set of utilities for keeping track of which phrases we have expanded.
839// We don't want to expand a phrase more than once, after all.
840//
841// This REALLY ought to be in its own class, but it works so that's okay.
842//
843// Phrases are identified by their firstSuffixPosition and length.
844//
845// Functions provided are:
846// void initialisePhraseMemory()
847// void rememberThisPhrase(index, length)
848// bool isPhraseStored(index, length)
849// void deletePhraseMemory()
850//
851// Internally, we will have two separate cases:
852//
853// Phrases of length 1-8:
854// unsigned char phraseMemory[inputLength]
855// is an array where each cell "remembers" the corresponding index in the
856// suffixArray, and each of the 8 bits of the cell correspond to the phrases
857// of length 1, 2... 8.
858// Eventually, we will make this disk-based (i.e. store the array in a file).
859//
860// Phrases of length 9+:
861// file hashTableFile
862// file listOfEntries
863// The first file is a hash table; each phrase maps to one of its cells, which
864 // contains either 0 (empty, no occurrence) or a number which is an entry number
865// in the second file. This file contains a "list" of entries. Each consists of
866// three numbers: the suffixArray index of the phrase, the length of the phrase,
867// and the entry number of the next phrase with the same hash.
868//
869
870
871unsigned char *phraseMemory;
872
873void initialiseLongPhraseMemory();
874void rememberThisLongPhrase(cellindex index, cellcount length);
875bool isLongPhraseStored(cellindex index, cellcount length);
876void deleteLongPhraseMemory();
877
878
879void initialisePhraseMemory() {
880
881 phraseMemory = new unsigned char[inputLength];
882
883 // to begin with, everything is empty
884 for (cellcount i = 0; i < inputLength; i++) {
885 phraseMemory[i] = 0;
886 }
887
888 // initialise the hashTable of long phrases
889 initialiseLongPhraseMemory();
890
891}
892
893void rememberThisPhrase(cellindex index, cellcount length) {
894
895 // if the phrase is very long, use the file-based system
896 if (length > 8) {
897 rememberThisLongPhrase(index, length);
898 return;
899 }
900
901 // create a char with just the bit corresponding to length set
902 unsigned char newbit = 1;
903 for (cellcount i = 1; i < length; i++) {
904 newbit <<= 1;
905 }
906
907 // set that bit in the memory array at position index
908 phraseMemory[index] |= newbit;
909}
910
911
912bool isPhraseStored(cellindex index, cellcount length) {
913
914 // if the phrase is very long, use the file-based system
915 if (length > 8) {
916 return isLongPhraseStored(index, length);
917 }
918
919 // create a char with just the bit corresponding to length set
920 unsigned char newbit = 1;
921 for (cellcount i = 1; i < length; i++) {
922 newbit <<= 1;
923 }
924
925 // return true if that bit is set in the memory array at position index
926 return (phraseMemory[index] & newbit);
927}
928
929void deletePhraseMemory() {
930 delete [] phraseMemory;
931 deleteLongPhraseMemory();
932}
933
934
935
936 // Files etc used to store "long" equivalents of the above
937
938fstream hashTableFile;
939char hashTableFileName[FILENAME_MAX];
940fstream listOfEntries;
941char listOfEntriesName[FILENAME_MAX];
942cellindex nextEntryNumber;
943
944const cellcount bigPrime = 7919;
945
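// A rough picture of the two files, inferred from the code below:
//
//   hashTable - bigPrime cells of sizeof(cellindex) bytes; the cell at
//               offset ((index + length) % bigPrime) * sizeof(cellindex)
//               holds 0 (empty) or an entry number into hashLists.
//   hashLists - a sequence of three-cellindex entries (next, index, length);
//               entry 0 is a dummy so that 0 can mean "no entry".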
946
947void initialiseLongPhraseMemory() {
948
949 cellindex example = 0;
950
951 sprintf(hashTableFileName, "%s/hashTable", collection);
952 sprintf(listOfEntriesName, "%s/hashLists", collection);
953
954
955 // create the new hashtable
956 if (verbosity > 1) {
957 cout << "Initialising hashTable: " << hashTableFileName << endl;
958 }
959 hashTableFile.open(hashTableFileName, ios::in | ios::out);
960 for (cellcount i = 0; i < bigPrime; i++) {
961 hashTableFile.write((char *) &example, sizeof(example));
962 }
963
964 // create the list of phrases
965 if (verbosity > 1) {
966 cout << "Initialising list of hashtable entries: " << listOfEntriesName << endl;
967 }
968 listOfEntries.open(listOfEntriesName, ios::in | ios::out);
969 listOfEntries.write((char *) &example, sizeof(example));
970 listOfEntries.write((char *) &example, sizeof(example));
971 listOfEntries.write((char *) &example, sizeof(example));
972 nextEntryNumber = 1;
973}
974
975
976void rememberThisLongPhrase(cellindex index, cellcount length) {
977
978 // cout << "rememberThisLongPhrase(" << index << ", " << length << ")\n";
979
980 cellindex hashOffset = ((index + length) % bigPrime) * sizeof(cellindex);
981 cellindex pointer;
982 cellindex zero = 0;
983 cellindex readp = 0;
984 cellindex readi = 0;
985 cellindex readl = 0;
986
987 hashTableFile.seekg(hashOffset);
988 hashTableFile.read((char *) &pointer, sizeof(cellindex));
989
990 if (pointer == 0) {
991 // There is no entry at all in the hash table for this entry
992 // so create one
993
994 pointer = nextEntryNumber++;
995 hashTableFile.seekg(hashOffset);
996 hashTableFile.write((char *) &pointer, sizeof(cellindex));
997
998 listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
999 listOfEntries.write((char *) &zero, sizeof(cellindex));
1000 listOfEntries.write((char *) &index, sizeof(cellindex));
1001 listOfEntries.write((char *) &length, sizeof(cellindex));
1002
1003 } else {
1004 // There is a list starting at this hash value, so the phrase may
1005 // be already remembered, or it might need to be appended
1006
1007 while (pointer != 0) {
1008 // Read the entry pointed to by pointer
1009 listOfEntries.seekg(pointer * sizeof(cellindex) * 3);
1010 listOfEntries.read((char *) &readp, sizeof(cellindex));
1011 listOfEntries.read((char *) &readi, sizeof(cellindex));
1012 listOfEntries.read((char *) &readl, sizeof(cellindex));
1013
1014 // cout << "read " << pointer << ", " << readp << ", " << readi << ", " << readl << endl;
1015
1016 if ((readi == index) && (readl == length)) {
1017 // we've found that we've already stored it
1018 return;
1019 } else if (readp == 0) {
1020 // we've reached the end of the list. Add a new entry.
1021 listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
1022 listOfEntries.write((char *) &nextEntryNumber, sizeof(cellindex));
1023 pointer = nextEntryNumber++;
1024
1025 listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
1026 listOfEntries.write((char *) &zero, sizeof(cellindex));
1027 listOfEntries.write((char *) &index, sizeof(cellindex));
1028 listOfEntries.write((char *) &length, sizeof(cellindex));
1029 return;
1030 } else {
1031 // go on to the next node
1032 pointer = readp;
1033 }
1034 }
1035 }
1036
1037
1038}
1039
1040bool isLongPhraseStored(cellindex index, cellcount length) {
1041
1042 // cout << "isLongPhraseExpanded(" << index << ", " << length << ")\n";
1043
1044 cellindex hashOffset = ((index + length) % bigPrime) * sizeof(cellindex);
1045 cellindex pointer;
1046 cellindex readp = 0;
1047 cellindex readi = 0;
1048 cellindex readl = 0;
1049
1050 // Find the phrase in the hashFile
1051 hashTableFile.seekg(hashOffset);
1052 hashTableFile.read((char *) &pointer, sizeof(cellindex));
1053
1054 if (pointer == 0) {
1055 // There is no entry at all in the hash table for this entry
1056 // so nothing is stored
1057 return false;
1058
1059 } else {
1060 // There is a list starting at this hash value, so the phrase may
1061 // be already remembered in that list
1062 while (pointer != 0) {
1063 // Read the entry pointed to by pointer
1064 listOfEntries.seekg(pointer * sizeof(cellindex) * 3);
1065 listOfEntries.read((char *) &readp, sizeof(cellindex));
1066 listOfEntries.read((char *) &readi, sizeof(cellindex));
1067 listOfEntries.read((char *) &readl, sizeof(cellindex));
1068
1069 if ((readi == index) && (readl == length)) {
1070 // we've found the phrase stored here
1071 return true;
1072 } else {
1073 // go on to the next node
1074 pointer = readp;
1075 }
1076 }
1077 }
1078 return false;
1079}
1080
1081void deleteLongPhraseMemory() {
1082 // remove the hash & other files
1083
1084 hashTableFile.close();
1085 listOfEntries.close();
1086 remove(hashTableFileName);
1087 remove(listOfEntriesName);
1088
1089}
1090
1091
1092
1093
1094// Read the collection statistics file
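// The file holds whitespace-separated key/value pairs; the keys below are
// the ones parsed by this function, the values are invented:
//
//   first_stopword 2
//   last_stopword 1201
//   first_contentword 1202
//   last_contentword 58840
//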
1095void readStatistics() {
1096
1097 // open the statistics file
1098 char filename[FILENAME_MAX];
1099 sprintf(filename, "%s/clauses.stats", collection);
1100
1101 // Open the file
1102 ifstream inFile(filename, ios::in);
1103 if (!inFile) {
1104 cerr << "File " << filename << " could not be opened\n";
1105 exit(1);
1106 }
1107
1108 // Read the key/value pairs into the symbol-range variables
1109 char key[1000];
1110 symbol value;
1111 while (inFile >> key >> value){
1112 if (strcmp(key, "first_stopword") == 0) {
1113 firstStopSymbol = value;
1114 } else if (strcmp(key, "last_stopword") == 0) {
1115 lastStopSymbol = value;
1116 } else if (strcmp(key, "first_contentword") == 0) {
1117 firstContentSymbol = value;
1118 } else if (strcmp(key, "last_contentword") == 0) {
1119 lastContentSymbol = value;
1120 }
1121 }
1122 inFile.close();
1123
1124 // Make sure we have the information we need
1125 if (!(firstStopSymbol && lastStopSymbol && firstContentSymbol && lastContentSymbol)) {
1126 cerr << "Statistics file incomplete" << endl;
1127 exit(1);
1128 }
1129}
1130
1131
1132
1133
1134