source: trunk/gsdl/src/phind/generate/suffix.cpp@2498

Last change on this file since 2498 was 2498, checked in by sjboddie, 23 years ago

A couple of small changes to phind's generate code to get it working
under windows 95

1/**********************************************************************
2 *
3 * suffix.cpp -- Extract the repeated phrases in the input using
4 * suffix and prefix arrays.
5 *
6 * Copyright 2000 Gordon W. Paynter
7 * Copyright 2000 The New Zealand Digital Library Project
8 *
9 * A component of the Greenstone digital library software
10 * from the New Zealand Digital Library Project at the
11 * University of Waikato, New Zealand.
12 *
13 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version 2 of the License, or
16 * (at your option) any later version.
17 *
18 * This program is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU General Public License for more details.
22 *
23 * You should have received a copy of the GNU General Public License
24 * along with this program; if not, write to the Free Software
25 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26 *
27 *********************************************************************/
28
29#include <assert.h>
30#include <math.h>
31#include <stdio.h>
32#include <stdlib.h>
33#include <string.h>
34
35#if defined(GSDL_USE_IOS_H)
36# include <fstream.h>
37# include <iostream.h>
38#else
39# include <fstream>
40# include <iostream>
41#endif
42
43#if defined(GSDL_USE_STL_H)
44# if defined(GSDL_USE_ALGO_H)
45# include <algo.h>
46# else
47# include <algorithm.h>
48# endif
49# include <vector.h>
50#else
51# include <algorithm>
52# include <vector>
53#endif
54
55#include "suffix.h"
56#include "phrase.h"
57
58// Global variables declared in suffix.h
59cellcount inputLength;
60
61symbol *symbols;
62symbol **suffixArray;
63check *suffixCheck;
64symbol **prefixArray;
65check *prefixCheck;
66
67
68// How many documents are in this collection?
69cellcount numberOfDocuments;
70symbol **documentArray;
71
72// Do we accept any phrase, or do we eliminate those that begin or end with a stopword?
73int phraseMode = ANYPHRASE; //STOPWORDS;
74
75// The filestem of the collection's phindex directory
76char collection[FILENAME_MAX];
77
78int suffixCompare(const void *, const void *);
79int prefixCompare(const void *, const void *);
80int pointerCompare(const void *, const void *);
81
82int readNumbers();
83void readStatistics();
84
85void getMinimalExpansions(Phrase &p, vector<Phrase> &results);
86cellcount getDocumentOccurrances(Phrase &p, cellcount *frequency);
87
88// The ranges of the stopword and content-word symbols for the collection
89symbol firstStopSymbol = 0;
90symbol lastStopSymbol = 0;
91symbol firstContentSymbol = 0;
92symbol lastContentSymbol = 0;
93
94
95
96
97// Phrase memory
98// We have to "remember" each phrase that we've expanded
99void initialisePhraseMemory();
100void rememberThisPhrase(cellindex index, cellcount length);
101bool isPhraseStored(cellindex index, cellcount length);
102void deletePhraseMemory();
103
104
105// how much output do we want?
106int verbosity = 1;
107
108
109int main (int argc, char * argv[]) {
110
111 // Command-line arguments
112 // argv[1] is the phindex directory
113 // argv[2] is the mode, where 1 is stopwords mode
114 // argv[3] is the verbosity (optional)
115 if (argc < 3) {
116 cerr << "Usage: " << argv[0] << " phind-directory mode [verbosity]" << endl;
117 exit(1);
118 }
119
120 // collection directory
121 strcpy(collection, argv[1]);
122
123 // mode parameter
124 phraseMode = atoi(argv[2]);
125 assert((phraseMode == STOPWORDS) || (phraseMode == ANYPHRASE));
126
127 // optional verbosity parameter
128 if (argc == 4) {
129 verbosity = atoi(argv[3]);
130 assert (verbosity >= 0);
131 }
132
133 if (verbosity) {
134 cout << "Suffix phrase extraction program" << endl;
135 }
136
137 if (verbosity > 1) {
138 if (phraseMode == STOPWORDS) {
139 cout << "Stopwords mode: no phrase may begin or end with a stopword" << endl;
140 } else {
141 cout << "AllPhrase mode: extract every phrase that occurs more than once" << endl;
142 }
143 }
144
145 // Read the statistics file
146 readStatistics();
147
148 // Read the numbers file
149 readNumbers();
150
151 // Create the suffix & prefix arrays
152 suffixArray = new symbol *[inputLength];
153 prefixArray = new symbol *[inputLength];
154 suffixCheck = new check[inputLength];
155 prefixCheck = new check[inputLength];
156 if (prefixCheck == NULL) {
157 cerr << "Suffix error: not enough memory to hold " << inputLength
158 << " symbols." << endl;
159 exit(2);
160 }
161
162 // Initialise prefix and suffix arrays
163 for (cellcount j = 0; j < inputLength; j++) {
164 suffixArray[j] = &symbols[j];
165 prefixArray[j] = &symbols[j];
166 }
167 qsort(suffixArray, inputLength, sizeof(symbol *), suffixCompare);
168 qsort(prefixArray, inputLength, sizeof(symbol *), prefixCompare);
169
170
171 // Create the document arrays
172 if (numberOfDocuments == 0) {
173 cerr << "There are no documents in this collection!" << endl;
174 exit(1);
175 }
176 if (verbosity > 1) {
177 cout << "Allocating document arrays for " << numberOfDocuments << " documents" << endl;
178 }
179
180 // The document frequency array is used to count the number of times
181 // each phrase occurs in each document. The number of documents in
182 // which a phrase occurs is stored in df.
183 frequency *documentFrequency = new frequency[numberOfDocuments];
184 frequency df;
185
186 // documentArray will be searched in order to discover which document
187 // each phrase occurs in.
188 documentArray = new symbol *[numberOfDocuments];
189
190 // Discover all the DOCUMENTSTART symbols and store as a phrase
191 cellindex d = 0;
192 while (*suffixArray[d] != DOCUMENTSTART) {
193 d++;
194 }
195 Phrase p(suffixArray[d], 1, SUFFIX);
196 p.findFirstAndLastSuffix(d, inputLength-1);
197
198 // Insert the document locations (as pointers) into documentArray
199 for (cellcount i = 0; i < p.suffixFrequency; i++) {
200 documentArray[i] = suffixArray[i + p.firstSuffixIndex];
201 }
202
203 // Sort the document array into ascending order of raw pointer value
204 qsort(documentArray, numberOfDocuments, sizeof(symbol *), pointerCompare);
205
206
207 // Extract phrases
208 //
209 // We will make several passes over the data, in each case considering
210 // a set of input phrases and generating a set of output phrases, which
211 // we will expand in later passes.
212 //
213 // The input phrases in the first pass will be the vocabulary.
214 // In later passes, the input phrases will be the output phrases of the
215 // previous pass.
216 //
217 // In each pass we will consider each input phrase in turn. If we
218 // have seen it before, we will ignore it. Otherwise, we will expand
219 // it and add its expansions to the set of output phrases.
220
221 // Store the phrase data in the phrases file
222 char phraseDataName[FILENAME_MAX];
223 sprintf(phraseDataName, "%s/phrases", collection);
224 ofstream phraseData(phraseDataName, ios::out);
225 if (!phraseData) {
226 cout << "File " << phraseDataName << " could not be opened\n";
227 exit(1);
228 }
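  // For reference, each record written to the phrases file by the loops below
  // has the shape (the values here are purely illustrative, not from any real
  // collection):
  //
  //   index-length:phrase text:frequency:expansion count:expansion list:df:document list
  //
  //   e.g.  4059-1:forest:12:2:4060-2,7121-2:3:d1,6;d4;d7,2
  //
  // where each expansion is "firstSuffixIndex-length" and each document is
  // "d<docnum>", with ",frequency" appended only when the frequency exceeds one.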
229
230 // Count the number of phrases output
231 unsigned long int phraseCounter = 0;
232
233 // Set up the phrase expansion memory.
234 // We need this so that we don't expand a phrase more than once
235 initialisePhraseMemory();
236
237 // The current pass number
238 int phrasePass = 1;
239
240
241 // PASS NUMBER 1
242 if (verbosity > 1) {
243 cout << "Starting pass " << phrasePass << endl;
244 }
245
246 ofstream outPhrase;
247 char outPhraseName[FILENAME_MAX];
248 unsigned long int outPhraseCounter = 0;
249
250 // On the first pass, simply work through the vocabulary
251 sprintf(outPhraseName, "%s/outPhrase.1", collection);
252 outPhrase.open(outPhraseName, ios::out);
253 if (!outPhrase) {
254 cerr << "File " << outPhraseName << " could not be opened\n";
255 exit(1);
256 }
257
258 // Iterate over the different symbols by working through the suffix array
259 vector<Phrase> result;
260 cellindex ij = 0;
261 char *tmpString;
262
263 while (ij < inputLength) {
264
265 // make a new phrase of length 1
266 p = Phrase(suffixArray[ij], 1, SUFFIX);
267 p.findFirstAndLastSuffix(ij, inputLength-1);
268
269 // cout << "cell " << ij << " - " << p.toString() << endl;
270
271 // We ignore this symbol if it occurs only once, if it is a delimiter,
272 // or if we are in stopwords mode and it is a stopword.
273 //
274 // We could imagine a new mode/command-line option, which is like
275 // STOPWORDS but without this restriction. This would let you browse
276 // from "the" to "the AGRIS" for example, but not from "AGRIS" to
277 // "the AGRIS" (where "the" is a stopword and AGRIS a content word).
278 // The system used to work like this; it is easy to implement, but
279 // it explodes the size of the indexes. So: would it be useful?
280 if (!((p.suffixFrequency <= 1) ||
281 // (*suffixArray[ij] != 23054) ||
282 (*suffixArray[ij] <= LASTDELIMITER) ||
283 ((phraseMode == STOPWORDS) && (*suffixArray[ij] <= lastStopSymbol)))) {
284
285 // Get minimal expansions of the phrase
286 getMinimalExpansions(p, result);
287
288 if (!result.empty()) {
289
290 // Remember that we have expanded this phrase
291 rememberThisPhrase(ij, 1);
292
293 // write the phrase text
294 tmpString = p.toString();
295 phraseData << ij << "-1:" << tmpString << ":" << p.suffixFrequency << ":"
296 << result.size() << ":";
297 delete [] tmpString;
298
299 // write the results
300 for (cellcount k = 0; k < result.size(); k++) {
301 if (k) {
302 phraseData << ",";
303 }
304 phraseData << result[k].firstSuffixIndex << "-" << result[k].length;
305 outPhrase << result[k].firstSuffixIndex << " " << result[k].length << endl;
306 outPhraseCounter++;
307 }
308 result.clear();
309
310 // Write the documents in which this phrase occurs
311 df = getDocumentOccurrances(p, documentFrequency);
312 phraseData << ":" << df << ":";
313
314 // write the documents
315 for (cellcount m = 0, first = 1; m < numberOfDocuments; m++) {
316 if (documentFrequency[m]) {
317 if (first) {
318 first = 0;
319 } else {
320 phraseData << ";";
321 }
322 // Output the document number. Note that here we've numbered the
323 // N documents from 0 to N-1, but later they'll be 1-N. Thus we
324 // add 1 to the document id when we output it.
325 phraseData << "d" << (m+1);
326 // Next, output the frequency with which the document occurs, but
327 // only if it is > 1.
328 if (documentFrequency[m] > 1) {
329 phraseData << "," << documentFrequency[m];
330 }
331 }
332 }
333
334 phraseData << endl;
335 phraseCounter++;
336
337 // feedback
338 if (verbosity) {
339 if (phraseCounter % 1000 == 0) {
340 tmpString = p.toString();
341 cout << "phrase " << phraseCounter << ": "
342 << "cell " << p.firstSuffixIndex << " - " << tmpString << endl;
343 delete [] tmpString;
344 }
345 }
346 }
347 }
348 ij = p.lastSuffixIndex + 1;
349 }
350 outPhrase.close();
351
352 // REMAINING PASSES
353 // The previous outPhrase file forms the input to each new pass
354 cellcount start, length;
355 while (outPhraseCounter > 0) {
356
357 // Start a new pass
358 phrasePass++;
359 if (verbosity) {
360 cout << "Starting pass " << phrasePass << endl;
361 }
362
363 // Open the input file
364 char inPhraseName[FILENAME_MAX];
365 sprintf(inPhraseName, "%s/outPhrase.%d", collection, phrasePass - 1);
366 ifstream inPhrase (inPhraseName, ios::in);
367 if (!inPhrase) {
368 cerr << "File " << inPhraseName << " could not be opened\n";
369 exit(1);
370 }
371
372 // Open the output file
373 sprintf(outPhraseName, "%s/outPhrase.%d", collection, phrasePass);
374 outPhrase.open(outPhraseName, ios::out);
375 if (!outPhrase) {
376 cerr << "File " << outPhraseName << " could not be opened\n";
377 exit(1);
378 }
379 outPhraseCounter = 0;
380
381 // Process each phrase
382 while(inPhrase >> start >> length) {
383
384 // Ignore the phrase if we have expanded it before
385 if (isPhraseStored(start, length)) {
386 continue;
387 }
388
389 // Remember that we have examined this phrase
390 rememberThisPhrase(start, length);
391
392 // Find the phrase in the suffix array
393 p = Phrase(suffixArray[start], length, SUFFIX);
394 p.findFirstAndLastSuffix(start, inputLength-1);
395
396 // cout << "index " << start << ", length " << length << " - " << p.toString() << endl;
397
398
399 // Ignore the phrase if it only occurs once
400 if (p.suffixFrequency < 2) {
401 continue;
402 }
403
404
405 // Write the phrase text
406 tmpString = p.toString();
407 phraseData << start << "-" << length << ":" << tmpString << ":"
408 << p.suffixFrequency << ":";
409 delete [] tmpString;
410
411
412 // Expand the phrase, if it is no more than 8 words long
413 if (length <= 8) {
414
415 // Get the minimal expansions for this phrase
416 getMinimalExpansions(p, result);
417
418 // write the results
419 phraseData << result.size() << ":";
420
421 for (cellcount i = 0; i < result.size(); i++) {
422 if (i) {
423 phraseData << ",";
424 }
425 phraseData << result[i].firstSuffixIndex << "-" << result[i].length;
426 outPhrase << result[i].firstSuffixIndex << " " << result[i].length << endl;
427 outPhraseCounter++;
428 }
429 result.clear();
430
431 } else {
432 // phrase is too long to expand further
433 phraseData << "0:";
434 }
435
436
437 // Write the documents in which this phrase occurs
438 df = getDocumentOccurrances(p, documentFrequency);
439 phraseData << ":" << df << ":";
440
441 // write the documents
442 for (cellcount i = 0, first = 1; i < numberOfDocuments; i++) {
443 if (documentFrequency[i]) {
444 if (first) {
445 first = 0;
446 } else {
447 phraseData << ";";
448 }
449 // Output the document number. Note that here we've numbered the
450 // N documents from 0 to N-1, but later they'll be 1-N. Thus we
451 // add 1 to the document id when we output it.
452 phraseData << "d" << (i+1);
453 // Next, output the frequency with which the document occurs, but
454 // only if it is > 1.
455 if (documentFrequency[i] > 1) {
456 phraseData << "," << documentFrequency[i];
457 }
458 }
459 }
460
461 phraseData << endl;
462 phraseCounter++;
463
464 // feedback
465 if (verbosity) {
466 if (phraseCounter % 1000 == 0) {
467 tmpString = p.toString();
468 cout << "phrase " << phraseCounter << ": "<< "start " << start
469 << ", length " << length << " - " << tmpString << endl;
470 delete [] tmpString;
471 }
472 }
473
474 }
475
476 inPhrase.close();
477 outPhrase.close();
478 }
479
480 phraseData.close();
481 deletePhraseMemory();
482
483 delete [] documentFrequency;
484 delete [] symbols;
485 delete [] suffixArray;
486 delete [] prefixArray;
487 delete [] suffixCheck;
488 delete [] prefixCheck;
489 delete [] documentArray;
490
491
492
493 cout << endl << "Done: " << phraseCounter << " phrases in " << phraseDataName << endl;
494 return 0;
495}
496
497
498// Get Minimal Expansions
499//
500// Get the set of "minimal" expansions of a phrase p, using the
501// algorithm described in the documentation.
502//
503// The expansions are appended to the results vector.
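//
// A minimal usage sketch (mirroring the calls made in main above):
//
//   vector<Phrase> expansions;
//   Phrase p(suffixArray[start], length, SUFFIX);
//   p.findFirstAndLastSuffix(start, inputLength - 1);
//   getMinimalExpansions(p, expansions);
//   // ... use expansions, then ...
//   expansions.clear();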
504
505void getMinimalExpansions(Phrase &p, vector<Phrase> &results) {
506
507 // 1. Initialise the result and candidate vectors
508 vector<Phrase> candidates;
509 for (cellcount j = 0; j < inputLength; j++) {
510 suffixCheck[j] = 0;
511 prefixCheck[j] = 0;
512 }
513
514 // 2. Expand the phrase p
515
516 // 2.1 Create the candidate set
517 p.initialSuffixCandidates(candidates);
518 p.initialPrefixCandidates(candidates);
519
520 // 2.2 Sort the candidates by phrase length
521 make_heap(candidates.begin(), candidates.end(), isLonger);
522
523 // 3. While candidates is non-empty, confirm the phrases it
524 // contains, expanding them as required
525 while (!candidates.empty()) {
526
527 // 3.1 Get next candidate
528 pop_heap(candidates.begin(), candidates.end(), isLonger);
529 Phrase c = candidates.back();
530 candidates.pop_back();
531
532 // 3.2 If we know there are no unique right extensions
533 // (i.e. this is a phrase drawn from the suffix array)
534 if (!c.hasUniqueSuffixExtension()) {
535
536 c.ensurePrefixFound();
537
538 // 3.2.1 Ignore candidate if we have used a subphrase instead
539 if (suffixCheck[c.firstSuffixIndex] || prefixCheck[c.firstPrefixIndex]) {
540 // cout << "ignoring" << endl;
541 }
542
543 // 3.2.2 If candidate has a unique left (prefix) extension,
544 // Then extend it and add it back into Candidates.
545 else if (c.hasUniquePrefixExtension()) {
546 // cout << "expanding prefix " << c.toString() << "=> ";
547 c.expandUniquePrefixExtensionByOne();
548 candidates.push_back(c);
549 push_heap(candidates.begin(), candidates.end(), isLonger);
550 }
551
552 // 3.2.3 If candidate has no unique left (prefix) extension,
553 // Then add it to the list of results.
554 else {
555 // cout << "no unique prefix, add to results" << endl;
556 results.push_back(c);
557 for (cellcount i = c.firstSuffixIndex; i <= c.lastSuffixIndex; i++) {
558 suffixCheck[i] = c.length;
559 }
560 for (cellcount ik = c.firstPrefixIndex; ik <= c.lastPrefixIndex; ik++) {
561 prefixCheck[ik] = c.length;
562 }
563 }
564 }
565
566 // 3.3 If we know there are no unique left extensions,
567 // Then do the same as for 3.2 but exchange suffix & prefix
568 else if (!c.hasUniquePrefixExtension()) {
569
570 c.ensureSuffixFound();
571
572 // 3.3.1
573 if (suffixCheck[c.firstSuffixIndex] || prefixCheck[c.firstPrefixIndex]) {
574
575 }
576
577 // 3.3.2
578 else if (c.hasUniqueSuffixExtension()) {
579 c.expandUniqueSuffixExtensionByOne();
580 candidates.push_back(c);
581 push_heap(candidates.begin(), candidates.end(), isLonger);
582 }
583
584 // 3.3.3
585 else {
586 results.push_back(c);
587 for (cellcount i = c.firstSuffixIndex; i <= c.lastSuffixIndex; i++) {
588 suffixCheck[i] = c.length;
589 }
590 for (cellcount ijk = c.firstPrefixIndex; ijk <= c.lastPrefixIndex; ijk++) {
591 prefixCheck[ijk] = c.length;
592 }
593
594 }
595 }
596 }
597}
598
599
600// suffixCompare
601//
602// Compare two pointers into a suffix array. We use this in the
603// qsort function, so the inputs are pointers to pointers.
604//
605// Return -1 if (a < b); otherwise (a > b), so return +1.
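//
// For example, with purely hypothetical symbol numbers, a suffix beginning
// 52 817 12 ... sorts before one beginning 52 817 30 ...: the comparison
// walks past the equal symbols (52, 817) and returns -1 at the third.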
606
607int suffixCompare(const void *cpa, const void *cpb) {
608
609 // Cast then dereference pointers to suffix array elements
610 symbol *pa = (symbol *) cpa;
611 symbol *pb = (symbol *) cpb;
612 pa = (symbol *) *pa;
613 pb = (symbol *) *pb;
614
615 // If the two elements are the same, examine the next one
616 while (*pa == *pb) {
617 pa++;
618 pb++;
619 }
620
621 // Make the comparison and return
622 if ( *pa < *pb) {
623 return -1;
624 } else {
625 return +1;
626 }
627}
628
629
630// prefixCompare
631//
632// Compare two pointers into a prefix array. We use this in the
633// qsort function, so the inputs are pointers to pointers.
634//
635// Return -1 if (a > b); otherwise (a < b), so return +1.
636
637int prefixCompare(const void *cpa, const void *cpb) {
638
639 // Cast then dereference pointers to prefix array elements
640 symbol *pa = (symbol *) cpa;
641 symbol *pb = (symbol *) cpb;
642 pa = (symbol *) *pa;
643 pb = (symbol *) *pb;
644
645 // If the two elements are the same, examine the next one
646 while (*pa == *pb) {
647 pa--;
648 pb--;
649 }
650
651 // Make the comparison and return
652 if ( *pa > *pb) {
653 return -1;
654 } else {
655 return +1;
656 }
657}
658
659// pointerCompare
660//
661// Compare two pointers based on the memory location they point to.
662
663int pointerCompare( const void *pa, const void *pb ) {
664
665 symbol **a = (symbol **) pa;
666 symbol **b = (symbol **) pb;
667
668 if (*a < *b) {
669 return -1;
670 } else if (*a > *b) {
671 return 1;
672 } else {
673 return 0;
674 }
675}
676
677
678// Read the clauses.numbers file into the "symbols" array.
679//
680// Each number in the file is a symbol number; it is essential that
681// the first symbol (and no others) be COLLECTIONSTART and the last
682// symbol (and no others) be COLLECTIONEND.
683//
684// Return the number of numbers in the array.
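//
// Roughly speaking (a sketch of the expected layout, not an exact dump),
// the stream of numbers looks like:
//   COLLECTIONSTART  DOCUMENTSTART <symbols of document 1> ...
//   DOCUMENTSTART <symbols of document 2> ...  COLLECTIONEND
// and numberOfDocuments is counted from the DOCUMENTSTART symbols.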
685
686int readNumbers() {
687
688 char filename[FILENAME_MAX];
689 sprintf(filename, "%s/clauses.numbers", collection);
690 if (verbosity) {
691 cout << "Reading numbers file: " << filename << endl;
692 }
693
694 // Open the numbers file
695 ifstream inFile1(filename, ios::in);
696 if (!inFile1) {
697 cerr << "File " << filename << " could not be opened\n";
698 exit(1);
699 }
700
701 // Count the number of symbols
702 inputLength = 0;
703 symbol word;
704 while (inFile1 >> word) {
705 inputLength++;
706 }
707 inFile1.close();
708
709 // Allocate the symbols array
710 if (verbosity > 1) {
711 cout << "Allocating symbol arrays for " << inputLength << " symbols" << endl;
712 }
713 symbols = new symbol[inputLength];
714 if (symbols == NULL) {
715 cerr << "Suffix error: not enough memory to hold " << inputLength
716 << " symbols." << endl;
717 exit(2);
718 }
719
720 // Read the numbers file into the numbers array
721 if (verbosity > 2) {
722 cout << "Reading the numbers" << endl;
723 }
724
725 ifstream inFile2(filename, ios::in);
726 if (!inFile2) {
727 cerr << "File " << filename << " could not be opened\n";
728 exit(1);
729 }
730 cellcount next = 0;
731 numberOfDocuments = 0;
732 while (inFile2 >> word) {
733 symbols[next++] = word;
734 if (word == DOCUMENTSTART) {
735 numberOfDocuments++;
736 }
737 }
738 inFile2.close();
739
740 // Make sure the numbers file is intact
741 assert(symbols[0] == COLLECTIONSTART);
742 assert(symbols[next-1] == COLLECTIONEND);
743
744 return inputLength;
745}
746
747
748
749// Get Document Occurrence statistics
750//
751// Given a phrase, what documents does it occur in?
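//
// Sketch of the approach (see the code below for details): for every
// occurrence of the phrase, i.e. every pointer in
// suffixArray[p.firstSuffixIndex .. p.lastSuffixIndex], a binary chop over
// the address-sorted documentArray finds the last DOCUMENTSTART whose
// address precedes the occurrence. frequency[] accumulates the per-document
// counts, and the return value is the number of distinct documents touched.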
752
753cellcount getDocumentOccurrances(Phrase &p, cellcount *frequency) {
754
755 // cout << "searching for \""<< p.toString() << "\" in documents "
756 // << 0 << "-" << numberOfDocuments - 1 << endl;
757
758 // The number of documents in which this phrase occurs
759 cellcount df = 0;
760
761 // Initialise the document frequency array
762 for (cellindex i = 0; i < numberOfDocuments; i++) {
763 frequency[i] = 0;
764 }
765
766 // variables used to facilitate the search
767 cellindex begin;
768 cellindex end;
769 cellindex d;
770 symbol *target;
771 bool found;
772
773 // search for the document in which each occurrence of the phrase is found
774 for (cellcount j = p.firstSuffixIndex; j <= p.lastSuffixIndex; j++) {
775
776 // cout << "looking for phrase at suffixArray[" << j << "]\n";
777
778 target = suffixArray[j];
779 begin = 0;
780 end = numberOfDocuments - 1;
781 found = false;
782
783 // Search for the occurrence of a document delimiter that target
784 // occurs immediately after.
785 // We do this by performing a binary chop search on documentArray.
786 while (!found) {
787
788 // cout << "searching for " << (cellindex) target << " in "
789 // << begin << " - " << end << endl;
790
791 assert (begin <= end);
792
793 // If the beginning and end of the interval are the same,
794 // then we've found the correct document
795 if (begin == end) {
796 if (frequency[begin] == 0) {
797 df++;
798 }
799 frequency[begin]++;
800 found = true;
801 }
802
803 // Otherwise, examine a new document midway through the begin-end
804 // interval and see if it is the one.
805 else {
806 d = (begin + end) / 2;
807 if (target > documentArray[d]) {
808 // If the target address is greater than this, but this is the last document,
809 // then this must be the one we want. Or, if target is greater than
810 // this one but less than the next, this must be the one we want.
811 if ((d == numberOfDocuments - 1) || (target < documentArray[d+1])) {
812 if (frequency[d] == 0) {
813 df++;
814 }
815 frequency[d]++;
816 found = true;
817 } else {
818 // otherwise we know to search later in the document set
819 begin = d + 1;
820 }
821 } else {
822 // search earlier in the document set
823 end = d - 1;
824 }
825 }
826 }
827 }
828 return df;
829}
830
831
832
833
834
835
836// phraseExpansionMemory : Which phrases have we expanded?
837//
838// A set of utilities for keeping track of which phrases we have expanded.
839// We don't want to expand a phrase more than once, after all.
840//
841// This REALLY ought to be in its own class, but it works so that's okay.
842//
843// Phrases are identified by their suffixArray index (firstSuffixIndex) and length.
844//
845// Functions provided are:
846// void initialisePhraseMemory()
847// void rememberThisPhrase(index, length)
848// bool isPhraseStored(index, length)
849// void deletePhraseMemory()
850//
851// Internally, we will have two separate cases:
852//
853// Phrases of length 1-8:
854// unsigned char phraseMemory[inputLength]
855// is an array where each cell "remembers" the corresponding index in the
856// suffixArray, and each of the 8 bits of the cell correspond to the phrases
857// of length 1, 2... 8.
858// Eventually, we will make this disk-based (i.e. store the array in a file).
859//
860// Phrases of length 9+:
861// file hashTableFile
862// file listOfEntries
863// The first file is a hash table; each phrase maps to one of its cells, which
864// contains either 0 (empty, no occurrence) or a number which is an entry number
865// in the second file. This file contains a "list" of entries. Each consists of
866// three numbers: the suffixArray index of the phrase, the length of the phrase,
867// and the entry number of the next phrase with the same hash.
868//
869
870
871unsigned char *phraseMemory;
872
873void initialiseLongPhraseMemory();
874void rememberThisLongPhrase(cellindex index, cellcount length);
875bool isLongPhraseStored(cellindex index, cellcount length);
876void deleteLongPhraseMemory();
877
878
879void initialisePhraseMemory() {
880
881 phraseMemory = new unsigned char[inputLength];
882
883 // to begin with, everything is empty
884 for (cellcount i = 0; i < inputLength; i++) {
885 phraseMemory[i] = 0;
886 }
887
888 // initialise the hashTable of long phrases
889 initialiseLongPhraseMemory();
890
891}
892
893void rememberThisPhrase(cellindex index, cellcount length) {
894
895 // if the phrase is very long, use the file-based system
896 if (length > 8) {
897 rememberThisLongPhrase(index, length);
898 return;
899 }
900
901 // create a char with just the bit corresponding to length set
902 unsigned char newbit = 1;
903 for (cellcount i = 1; i < length; i++) {
904 newbit <<= 1;
905 }
906
907 // set that bit in the memory array at position index
908 phraseMemory[index] |= newbit;
909}
910
911
912bool isPhraseStored(cellindex index, cellcount length) {
913
914 // if the phrase is very long, use the file-based system
915 if (length > 8) {
916 return isLongPhraseStored(index, length);
917 }
918
919 // create a char with just the bit corresponding to length set
920 unsigned char newbit = 1;
921 for (cellcount i = 1; i < length; i++) {
922 newbit <<= 1;
923 }
924
925 // return true if that bit is set in the memory array at position index
926 return (phraseMemory[index] & newbit);
927}
928
929void deletePhraseMemory() {
930 delete [] phraseMemory;
931 deleteLongPhraseMemory();
932}
933
934
935
936// Files etc. used to store "long" equivalents of the above
937
938fstream hashTableFile;
939char hashTableFileName[FILENAME_MAX];
940fstream listOfEntries;
941char listOfEntriesName[FILENAME_MAX];
942cellindex nextEntryNumber;
943
944const cellcount bigPrime = 7919;
945
946
947void initialiseLongPhraseMemory() {
948
949 cellindex example = 0;
950
951 sprintf(hashTableFileName, "%s/hashTable", collection);
952 sprintf(listOfEntriesName, "%s/hashLists", collection);
953
954
955 // create the new hashtable
956 if (verbosity > 1) {
957 cout << "Initialising hashTable: " << hashTableFileName << endl;
958 }
959 hashTableFile.open(hashTableFileName, ios::in | ios::out);
960 for (cellcount i = 0; i < bigPrime; i++) {
961 hashTableFile.write((char *) &example, sizeof(example));
962 }
963
964 // create the list of phrases
965 if (verbosity > 1) {
966 cout << "Initialising list of hashtable entries: " << listOfEntriesName << endl;
967 }
968 listOfEntries.open(listOfEntriesName, ios::in | ios::out);
969 listOfEntries.write((char *) &example, sizeof(example));
970 listOfEntries.write((char *) &example, sizeof(example));
971 listOfEntries.write((char *) &example, sizeof(example));
972 nextEntryNumber = 1;
973}
974
975
976void rememberThisLongPhrase(cellindex index, cellcount length) {
977
978 // cout << "rememberThisLongPhrase(" << index << ", " << length << ")\n";
979
980 cellindex hashOffset = ((index + length) % bigPrime) * sizeof(cellindex);
981 cellindex pointer;
982 cellindex zero = 0;
983 cellindex readp = 0;
984 cellindex readi = 0;
985 cellindex readl = 0;
986
987 hashTableFile.seekg(hashOffset);
988 hashTableFile.read((char *) &pointer, sizeof(cellindex));
989
990 if (pointer == 0) {
991 // There is no entry at all in the hash table for this phrase,
992 // so create one
993
994 pointer = nextEntryNumber++;
995 hashTableFile.seekp(hashOffset);
996 hashTableFile.write((char *) &pointer, sizeof(cellindex));
997
998 listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
999 listOfEntries.write((char *) &zero, sizeof(cellindex));
1000 listOfEntries.write((char *) &index, sizeof(cellindex));
1001 listOfEntries.write((char *) &length, sizeof(cellindex));
1002
1003 } else {
1004 // There is a list starting at this hash value, so the phrase may
1005 // be already remembered, or it might need to be appended
1006
1007 while (pointer != 0) {
1008 // Read the entry pointed to by pointer
1009 listOfEntries.seekg(pointer * sizeof(cellindex) * 3);
1010 listOfEntries.read((char *) &readp, sizeof(cellindex));
1011 listOfEntries.read((char *) &readi, sizeof(cellindex));
1012 listOfEntries.read((char *) &readl, sizeof(cellindex));
1013
1014 // cout << "read " << pointer << ", " << readp << ", " << readi << ", " << readl << endl;
1015
1016 if ((readi == index) && (readl == length)) {
1017 // we've found that we've already stored it
1018 return;
1019 } else if (readp == 0) {
1020 // we've reached the end of the list. Add a new entry.
1021 listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
1022 listOfEntries.write((char *) &nextEntryNumber, sizeof(cellindex));
1023 pointer = nextEntryNumber++;
1024
1025 listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
1026 listOfEntries.write((char *) &zero, sizeof(cellindex));
1027 listOfEntries.write((char *) &index, sizeof(cellindex));
1028 listOfEntries.write((char *) &length, sizeof(cellindex));
1029 return;
1030 } else {
1031 // go on to the next node
1032 pointer = readp;
1033 }
1034 }
1035 }
1036
1037
1038}
1039
1040bool isLongPhraseStored(cellindex index, cellcount length) {
1041
1042 // cout << "isLongPhraseExpanded(" << index << ", " << length << ")\n";
1043
1044 cellindex hashOffset = ((index + length) % bigPrime) * sizeof(cellindex);
1045 cellindex pointer;
1046 cellindex readp = 0;
1047 cellindex readi = 0;
1048 cellindex readl = 0;
1049
1050 // Find the phrase in the hashFile
1051 hashTableFile.seekg(hashOffset);
1052 hashTableFile.read((char *) &pointer, sizeof(cellindex));
1053
1054 if (pointer == 0) {
1055 // There is no entry at all in the hash table for this phrase,
1056 // so nothing is stored
1057 return false;
1058
1059 } else {
1060 // There is a list starting at this hash value, so the phrase may
1061 // be already remembered in that list
1062 while (pointer != 0) {
1063 // Read the entry pointed to by pointer
1064 listOfEntries.seekg(pointer * sizeof(cellindex) * 3);
1065 listOfEntries.read((char *) &readp, sizeof(cellindex));
1066 listOfEntries.read((char *) &readi, sizeof(cellindex));
1067 listOfEntries.read((char *) &readl, sizeof(cellindex));
1068
1069 if ((readi == index) && (readl == length)) {
1070 // we've found the phrase stored here
1071 return true;
1072 } else {
1073 // go on to the next node
1074 pointer = readp;
1075 }
1076 }
1077 }
1078 return false;
1079}
1080
1081void deleteLongPhraseMemory() {
1082 // remove the hash & other files
1083
1084 hashTableFile.close();
1085 listOfEntries.close();
1086 remove(hashTableFileName);
1087 remove(listOfEntriesName);
1088
1089}
1090
1091
1092
1093
1094// Read the collection statistics file
1095void readStatistics() {
1096
1097 // Build the statistics file name
1098 char filename[FILENAME_MAX];
1099 sprintf(filename, "%s/clauses.stats", collection);
1100
1101 // Open the file
1102 ifstream inFile(filename, ios::in);
1103 if (!inFile) {
1104 cerr << "File " << filename << " could not be opened\n";
1105 exit(1);
1106 }
1107
1108 // Read the key-value pairs from the statistics file
1109 char key[1000];
1110 symbol value;
1111 while (inFile >> key >> value){
1112 if (strcmp(key, "first_stopword") == 0) {
1113 firstStopSymbol = value;
1114 } else if (strcmp(key, "last_stopword") == 0) {
1115 lastStopSymbol = value;
1116 } else if (strcmp(key, "first_contentword") == 0) {
1117 firstContentSymbol = value;
1118 } else if (strcmp(key, "last_contentword") == 0) {
1119 lastContentSymbol = value;
1120 }
1121 }
1122 inFile.close();
1123
1124 // Make sure we have the information we need
1125 if (!(firstStopSymbol && lastStopSymbol && firstContentSymbol && lastContentSymbol)) {
1126 cerr << "Statistics file incomplete" << endl;
1127 exit(1);
1128 }
1129}
1130
1131
1132
1133
1134