source: trunk/gsdl/src/phind/generate/suffix.cpp@ 2867

Last change on this file since 2867 was 2867, checked in by paynter, 22 years ago

Moved all the suffixCheck functionality into the check.h header and
inlined it.

  • Property svn:keywords set to Author Date Id Revision
File size: 30.0 KB
1/**********************************************************************
2 *
3 * Suffix.cpp -- Extract the repeated phrases in the input with suffix
4 * and prefix arrays (cgn & gwp's simpler algorithm,
5 * and kjm's improvements).
6 *
7 * Copyright 2000 Gordon W. Paynter
8 * Copyright 2000 The New Zealand Digital Library Project
9 *
10 * A component of the Greenstone digital library software from the
11 * New Zealand Digital Library Project at the University of Waikato,
12 * New Zealand.
13 *
14 * This program is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU General Public License as
16 * published by the Free Software Foundation; either version 2 of
17 * the License, or (at your option) any later version.
18 *
19 * This program is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 * GNU General Public License for more details.
23 *
24 * You should have received a copy of the GNU General Public License
25 * along with this program; if not, write to the Free Software
26 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27 *
28 *********************************************************************/
29
30#include <assert.h>
31#include <math.h>
32#include <stdio.h>
33#include <stdlib.h>
34#include <string.h>
35
36#if defined(GSDL_USE_IOS_H)
37# include <fstream.h>
38# include <iostream.h>
39#else
40# include <fstream>
41# include <iostream>
42#endif
43
44#if defined(GSDL_USE_STL_H)
45# if defined(GSDL_USE_ALGO_H)
46# include <algo.h>
47# else
48# include <algorithm.h>
49# endif
50# include <vector.h>
51#else
52# include <algorithm>
53# include <vector>
54#endif
55
56#include "suffix.h"
57#include "phrase.h"
58#include "check.h"
59
60// Global variables declared in suffix.h
61cellcount inputLength;
62
63symbol *symbols;
64symbol **suffixArray;
65symbol **prefixArray;
66
67// How many documents are in this collection?
68cellcount numberOfDocuments;
69symbol **documentArray;
70
71
72// Do we accept any phrase, or do we eliminate those ending with stopwords ?
73int phraseMode = ANYPHRASE; //STOPWORDS;
74
75
76// The filestem of the collection's phindex directory
77char collection[FILENAME_MAX];
78
79
80// The ranges of the stopword and content-word symbols for the collection
81symbol firstStopSymbol = 0;
82symbol lastStopSymbol = 0;
83symbol firstContentSymbol = 0;
84symbol lastContentSymbol = 0;
85
86
87// Some useful comparison functions, defined below.
88int suffixCompare(const void *, const void *);
89int prefixCompare(const void *, const void *);
90int pointerCompare(const void *, const void *);
91
92
93// Functions for implementing "phrase memory". These let us "remember"
94// each phrase that we've expanded without using too much memory.
95void initialisePhraseMemory();
96void rememberThisPhrase(cellindex index, cellcount length);
97bool isPhraseStored(cellindex index, cellcount length);
98void deletePhraseMemory();
99
100
101// how much output do we want?
102int verbosity = 1;
103
104
105// Get a phrase's expansions
106//
107// Get the set of "minimal", "maximal", non-unique expansions of a
108// phrase p, using the simpler algorithm that Craig and Gordon came up
109// with at Google.
110//
111// The expansions are returned in the results vector.
112
113void getExpansions(Phrase &p, vector<Phrase> &results) {
114
115 // 1. Get the initial candidates
116 vector<Phrase> candidates;
117 p.initialSuffixCandidates(candidates);
118 int suffcands = candidates.size();
119 p.initialPrefixCandidates(candidates);
120
121 if (candidates.size() == 0)
122 return;
123 // 2. Expand each candidate phrase
124 vector<Phrase>::iterator i;
125 for (i = candidates.begin(); (i != candidates.end()) && (suffcands > 0); ++i, --suffcands) {
126 i->expandWhileUniquePrefixExtension();
127 i->ensureSuffixFound();
128 }
129 for ( ; i != candidates.end(); ++i) {
130 i->expandWhileUniqueSuffixExtension();
131 }
132
133 // 3. Ensure minimality: ignore phrases whose subphrases are also found
134 results.clear();
135
136 // Initialise the candidates, check array, and various variables.
137 sort(candidates.begin(), candidates.end(), isShorter);
138 unsigned minimum_length = candidates.begin()->length;
139 clearSuffixCheck();
140
141 // Try to add each candidate to the results set, ignoring the non-minimal
142 for (vector<Phrase>::iterator candidate = candidates.begin();
143 candidate != candidates.end(); candidate++) {
144
145 // Make a copy of candidate to mutilate while performing sub-phrase checks
146 Phrase temp_phrase(*candidate);
147 bool shorter_found = false;
148
149 // Check for shorter and shorter versions of the temporary phrase
150 while (temp_phrase.length >= minimum_length && !shorter_found) {
151 temp_phrase.ensureSuffixFound();
152 if (getSuffixCheck(temp_phrase.firstSuffixIndex)==0)
153 temp_phrase.shortenByOneAtPrefix();
154 else
155 shorter_found = true;
156
157 // Possible efficiency here: we can finish if the prefix of c
158 // and temp_phrase are the same for candidate->length symbols.
159 }
160
161 // If no shorter phrase is found, use this one
162 if (!shorter_found) {
163 results.push_back(*candidate);
164 candidate->ensureSuffixFound();
165 setSuffixCheck(candidate->firstSuffixIndex, candidate->lastSuffixIndex);
166 }
167 }
168}
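// Illustrative sketch, excluded from the build: the minimality loop above keeps
// a candidate only if none of its tails -- obtained by repeatedly dropping the
// first symbol, as shortenByOneAtPrefix() does -- has already been accepted.
// The same rule over plain strings, with a std::set standing in for the
// suffixCheck array; keepMinimal and shorterString are hypothetical names.
#if 0
#include <algorithm>
#include <set>
#include <string>
#include <vector>

static bool shorterString(const std::string &a, const std::string &b) {
  return a.size() < b.size();
}

static std::vector<std::string> keepMinimal(std::vector<std::string> candidates) {
  std::vector<std::string> results;
  std::set<std::string> accepted;

  // Process shorter candidates first so they can block longer ones.
  std::sort(candidates.begin(), candidates.end(), shorterString);

  for (size_t c = 0; c < candidates.size(); ++c) {
    bool shorter_found = false;
    // Check every tail of the candidate, shortening at the front each time.
    for (std::string tail = candidates[c]; !tail.empty() && !shorter_found;
         tail.erase(0, 1)) {
      if (accepted.count(tail)) shorter_found = true;
    }
    if (!shorter_found) {
      results.push_back(candidates[c]);
      accepted.insert(candidates[c]);
    }
  }
  return results;
}
#endif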
169
170
171// suffixCompare
172//
173// Compare two pointers into a suffix array. We use this in the
174// qsort function, so the input are pointers to pointers.
175//
176// Return -1 if (a < b), otherwise (a > b) so return +1,
177
178int suffixCompare(const void *cpa, const void *cpb) {
179
180 // Cast then dereference pointers to suffix array elements
181 symbol **ppa = (symbol **) cpa;
182 symbol **ppb = (symbol **) cpb;
183 symbol *pa = *ppa;
184 symbol *pb = *ppb;
185
186 // If the two elements are the same, examine the next one
187 while (*pa == *pb) {
188 pa++;
189 pb++;
190 }
191
192 // Make the comparison and return
193 if ( *pa < *pb) {
194 return -1;
195 } else {
196 return +1;
197 }
198}
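// Illustrative sketch, excluded from the build: qsort() hands the comparator
// the addresses of two array elements, and because the elements of suffixArray
// are themselves pointers (symbol *), the comparator receives pointers to
// pointers. A self-contained example of the same calling convention over int
// pointers; intPtrCompare and sortPointersByValue are hypothetical names.
#if 0
#include <stdlib.h>

static int intPtrCompare(const void *cpa, const void *cpb) {
  const int *pa = *(const int **) cpa;   // each array element is an int *
  const int *pb = *(const int **) cpb;
  if (*pa < *pb) return -1;
  if (*pa > *pb) return 1;
  return 0;
}

static void sortPointersByValue() {
  int data[4] = {3, 1, 4, 1};
  int *ptrs[4] = {&data[0], &data[1], &data[2], &data[3]};
  // After this call ptrs[] is ordered by the values the pointers refer to.
  qsort(ptrs, 4, sizeof(int *), intPtrCompare);
}
#endif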
199
200
201// prefixCompare
202//
203// Compare two pointers into a prefix array. We use this in the
204// qsort function, so the input are pointers to pointers.
205//
206// Return -1 if (a > b), otherwise (a < b) so return +1.
207
208int prefixCompare(const void *cpa, const void *cpb) {
209
210 // Cast then dereference pointers to prefix array elements
211 symbol **ppa = (symbol **) cpa;
212 symbol **ppb = (symbol **) cpb;
213 symbol *pa = *ppa;
214 symbol *pb = *ppb;
215
216 // If the two elements are the same, examine the next one
217 while (*pa == *pb) {
218 pa--;
219 pb--;
220 }
221
222 // Make the comparison and return
223 if ( *pa > *pb) {
224 return -1;
225 } else {
226 return +1;
227 }
228}
229
230// simpleCompare
231//
232// Compare two pointers based on the memory location they point to.
233
234int pointerCompare( const void *pa, const void *pb ) {
235
236 symbol **a = (symbol **) pa;
237 symbol **b = (symbol **) pb;
238
239 if (*a < *b) {
240 return -1;
241 } else if (*a > *b) {
242 return 1;
243 } else {
244 return 0;
245 }
246}
247
248
249// Read the clauses.numbers file into the "symbols" array.
250//
251// Each number in the file is a symbol number; it is essential that
252// the first symbol (and no others) be COLLECTIONSTART and the last
253// symbol (and no others) be COLLECTIONEND.
254//
255// Return the number of numbers in the array.
256
257int readNumbers() {
258
259 char filename[FILENAME_MAX];
260 sprintf(filename, "%s/clauses.numbers", collection);
261 if (verbosity) {
262 cout << "Reading numbers file: " << filename << endl;
263 }
264
265 // Open the numbers file
266 ifstream inFile1(filename, ios::in);
267 if (!inFile1) {
268 cerr << "File " << filename << " could not be opened\n";
269 exit(1);
270 }
271
272 // Count the number of symbols
273 inputLength = 0;
274 symbol word;
275 while (inFile1 >> word) {
276 inputLength++;
277 }
278 inFile1.close();
279
280 // Allocate the symbols array
281 if (verbosity > 1) {
282 cout << "Allocating symbol arrays for " << inputLength << " symbols" << endl;
283 }
284 symbols = new symbol[inputLength];
285 if (symbols == NULL) {
286 cerr << "Suffix error: not enough memory to hold " << inputLength
287 << " symbols." << endl;
288 exit(2);
289 }
290
291 // Read the numbers file into the numbers array
292 if (verbosity > 2) {
293 cout << "Reading the numbers" << endl;
294 }
295
296 ifstream inFile2(filename, ios::in);
297 if (!inFile2) {
298 cerr << "File " << filename << " could not be opened\n";
299 exit(1);
300 }
301 cellcount next = 0;
302 numberOfDocuments = 0;
303 while (inFile2 >> word) {
304 symbols[next++] = word;
305 if (word == DOCUMENTSTART) {
306 numberOfDocuments++;
307 }
308 }
309 inFile2.close();
310
311 // Make sure the numbers file is intact
312 assert(symbols[0] == COLLECTIONSTART);
313 assert(symbols[next-1] == COLLECTIONEND);
314
315 return inputLength;
316}
317
318
319
320// Get Document Occurrence statistics
321//
322// Given a phrase, what documents does it occur in?
323
324cellcount getDocumentOccurrances(const Phrase &p, cellcount *frequency) {
325
326 // cout << "searching for \""<< p << "\" in documents "
327 // << 0 << "-" << numberOfDocuments - 1 << endl;
328
329 // The number of documents in which this phrase occurs
330 cellcount df = 0;
331
332 // Initialise the document frequency array
333 // for (cellindex i = 0; i < numberOfDocuments; i++) {
334 // frequency[i] = 0;
335 //}
336 memset(frequency, 0, sizeof(cellcount)*numberOfDocuments);
337
338 // variables used to facilitate the search
339 cellindex begin;
340 cellindex end;
341 cellindex d;
342 symbol *target;
343 bool found;
344
345 // search for the document in which each occurrence of the phrase is found
346 for (cellcount j = p.firstSuffixIndex; j <= p.lastSuffixIndex; j++) {
347
348 // cout << "looking for phrase at suffixArray[" << j << "]\n";
349
350 target = suffixArray[j];
351 begin = 0;
352 end = numberOfDocuments - 1;
353 found = false;
354
355 // Search for the occurrence of a document delimiter that target
356 // occurs immediately after.
357 // We do this by performing a binary chop search on documentArray.
358 while (!found) {
359
360 // cout << "searching for " << (cellindex) target << " in "
361 // << begin << " - " << end << endl;
362
363 assert (begin <= end);
364
365 // If the beginning and end of the interval are the same,
366 // then we've found the correct document
367 if (begin == end) {
368 if (frequency[begin] == 0) {
369 df++;
370 }
371 frequency[begin]++;
372 found = true;
373 }
374
375 // Otherwise, examine a new document midway through the begin-end
376 // interval and see if it is the one.
377 else {
378 d = (begin + end) / 2;
379 if (target > documentArray[d]) {
380 // If the target address is greater than this, but this is the last document,
381 // then this must be the one we want. Or, if target is greater than
382 // this one but less than the next, this must be the one we want.
383 if ((d == numberOfDocuments - 1) || (target < documentArray[d+1])) {
384 if (frequency[d] == 0) {
385 df++;
386 }
387 frequency[d]++;
388 found = true;
389 } else {
390 // otherwise we know to search later in the document set
391 begin = d + 1;
392 }
393 } else {
394 // search earlier in the document set
395 end = d - 1;
396 }
397 }
398 }
399 }
400 return df;
401}
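// Illustrative sketch, excluded from the build: the binary chop above finds
// the last entry of documentArray whose address is <= the target suffix
// pointer, i.e. the document the occurrence falls inside. The same search
// expressed with std::upper_bound over document start offsets; findDocument
// and docStarts are hypothetical names.
#if 0
#include <algorithm>
#include <vector>

// docStarts must be sorted ascending, with docStarts[0] <= pos.
static int findDocument(const std::vector<unsigned long> &docStarts,
                        unsigned long pos) {
  // upper_bound returns the first start strictly greater than pos; the
  // document we want is the one immediately before it.
  std::vector<unsigned long>::const_iterator it =
    std::upper_bound(docStarts.begin(), docStarts.end(), pos);
  return (int) (it - docStarts.begin()) - 1;
}
#endif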
402
403
404
405
406
407
408// phraseExpansionMemory : Which phrases have we expanded?
409//
410// A set of utilities for keeping track of which phrases we have expanded.
411// We don't want to expand a phrase more than once, after all.
412//
413// This REALLY ought to be in its own class, but it works so that's okay.
414//
415// Phrases are identified by their firstSuffixIndex and length.
416//
417// Functions provided are:
418// void initialisePhraseMemory()
419// void rememberThisPhrase(index, length)
420// bool isPhraseStored(index, length)
421// void deletePhraseMemory()
422//
423// Internally, we will have two separate cases:
424//
425// Phrases of length 1-8:
426// unsigned char phraseMemory[inputLength]
427// is an array where each cell "remembers" the corresponding index in the
428// suffixArray, and each of the 8 bits of the cell correspond to the phrases
429// of length 1, 2... 8.
430// Eventually, we will make this disk-based (i.e. store the array in a file).
431//
432// Phrases of length 9+:
433// file hashTableFile
434// file listOfEntries
435// The first file is a hash table; each phrase maps to one of its cells, which
436// contains either 0 (empty, no occurrence) or a number which is an entry number
437// in the second file. This file contains a "list" of entries. Each consists of
438// three numbers: the suffixArray index of the phrase, the length of the phrase,
439// and the entry number of the next phrase with the same hash.
440//
441
442
443unsigned char *phraseMemory;
444
445void initialiseLongPhraseMemory();
446void rememberThisLongPhrase(cellindex index, cellcount length);
447bool isLongPhraseStored(cellindex index, cellcount length);
448void deleteLongPhraseMemory();
449
450
451void initialisePhraseMemory() {
452
453 phraseMemory = new unsigned char[inputLength];
454
455 // to begin with, everything is empty
456 // for (cellcount i = 0; i < inputLength; i++) {
457 // phraseMemory[i] = 0;
458 //}
459 memset(phraseMemory, 0, sizeof(char)*inputLength);
460
461 // initialise the hashTable of long phrases
462 initialiseLongPhraseMemory();
463
464}
465
466void rememberThisPhrase(cellindex index, cellcount length) {
467
468 // if the phrase is very long, use the file-based system
469 if (length > 8) {
470 rememberThisLongPhrase(index, length);
471 return;
472 }
473
474 // create a char with just the bit corresponding to length set
475 unsigned char newbit = 1;
476 for (cellcount i = 1; i < length; i++) {
477 newbit <<= 1;
478 }
479
480 // set that bit in the memory array at position index
481 phraseMemory[index] |= newbit;
482}
483
484
485bool isPhraseStored(cellindex index, cellcount length) {
486
487 // if the phrase is very long, use the file-based system
488 if (length > 8) {
489 return isLongPhraseStored(index, length);
490 }
491
492 // create a char with just the bit corresponding to length set
493 unsigned char newbit = 1;
494 for (cellcount i = 1; i < length; i++) {
495 newbit <<= 1;
496 }
497
498 // return true if that bit is set in the memory array at position index
499 return (phraseMemory[index] & newbit);
500}
501
502void deletePhraseMemory() {
503 delete [] phraseMemory;
504 deleteLongPhraseMemory();
505}
506
507
508
509// Files etc used to store "long" equivalents of the above
510
511fstream hashTableFile;
512char hashTableFileName[FILENAME_MAX];
513fstream listOfEntries;
514char listOfEntriesName[FILENAME_MAX];
515cellindex nextEntryNumber;
516
517const cellcount bigPrime = 7919;
518
519
520void initialiseLongPhraseMemory() {
521
522 cellindex example = 0;
523
524 sprintf(hashTableFileName, "%s/hashTable", collection);
525 sprintf(listOfEntriesName, "%s/hashLists", collection);
526
527
528 // create the new hashtable
529 if (verbosity > 1) {
530 cout << "Initialising hashTable: " << hashTableFileName << endl;
531 }
532 hashTableFile.open(hashTableFileName, ios::in | ios::out);
533 for (cellcount i = 0; i < bigPrime; i++) {
534 hashTableFile.write((char *) &example, sizeof(example));
535 }
536
537 // create the list of phrases
538 if (verbosity > 1) {
539 cout << "Initialising list of hashtable entries: " << listOfEntriesName << endl;
540 }
541 listOfEntries.open(listOfEntriesName, ios::in | ios::out);
542 listOfEntries.write((char *) &example, sizeof(example));
543 listOfEntries.write((char *) &example, sizeof(example));
544 listOfEntries.write((char *) &example, sizeof(example));
545 nextEntryNumber = 1;
546}
547
548
549void rememberThisLongPhrase(cellindex index, cellcount length) {
550
551 // cout << "rememberThisLongPhrase(" << index << ", " << length << ")\n";
552
553 cellindex hashOffset = ((index + length) % bigPrime) * sizeof(cellindex);
554 cellindex pointer;
555 cellindex zero = 0;
556 cellindex readp = 0;
557 cellindex readi = 0;
558 cellindex readl = 0;
559
560 hashTableFile.seekg(hashOffset);
561 hashTableFile.read((char *) &pointer, sizeof(cellindex));
562
563 if (pointer == 0) {
564 // There is no entry at all in the hash table for this entry
565 // so create one
566
567 pointer = nextEntryNumber++;
568 hashTableFile.seekg(hashOffset);
569 hashTableFile.write((char *) &pointer, sizeof(cellindex));
570
571 listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
572 listOfEntries.write((char *) &zero, sizeof(cellindex));
573 listOfEntries.write((char *) &index, sizeof(cellindex));
574 listOfEntries.write((char *) &length, sizeof(cellindex));
575
576 } else {
577 // There is a list starting at this hash value, so the phrase may
578 // be already remembered, or it might need to be appended
579
580 while (pointer != 0) {
581 // Read the entry pointed to by pointer
582 listOfEntries.seekg(pointer * sizeof(cellindex) * 3);
583 listOfEntries.read((char *) &readp, sizeof(cellindex));
584 listOfEntries.read((char *) &readi, sizeof(cellindex));
585 listOfEntries.read((char *) &readl, sizeof(cellindex));
586
587 // cout << "read " << pointer << ", " << readp << ", " << readi << ", " << readl << endl;
588
589 if ((readi == index) && (readl == length)) {
590 // we've found that we've already stored it
591 return;
592 } else if (readp == 0) {
593 // we've reached the end of the list. Add a new entry.
594 listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
595 listOfEntries.write((char *) &nextEntryNumber, sizeof(cellindex));
596 pointer = nextEntryNumber++;
597
598 listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
599 listOfEntries.write((char *) &zero, sizeof(cellindex));
600 listOfEntries.write((char *) &index, sizeof(cellindex));
601 listOfEntries.write((char *) &length, sizeof(cellindex));
602 return;
603 } else {
604 // go on to the next node
605 pointer = readp;
606 }
607 }
608 }
609
610
611}
612
613
614bool isLongPhraseStored(cellindex index, cellcount length) {
615
616 // cout << "isLongPhraseExpanded(" << index << ", " << length << ")\n";
617
618 cellindex hashOffset = ((index + length) % bigPrime) * sizeof(cellindex);
619 cellindex pointer;
620 cellindex readp = 0;
621 cellindex readi = 0;
622 cellindex readl = 0;
623
624 // Find the phrase in the hashFile
625 hashTableFile.seekg(hashOffset);
626 hashTableFile.read((char *) &pointer, sizeof(cellindex));
627
628 if (pointer == 0) {
629 // There is no entry at all in the hash table for this entry
630 // so nothing is stored
631 return false;
632
633 } else {
634 // There is a list starting at this hash value, so the phrase may
635 // be already remembered in that list
636 while (pointer != 0) {
637 // Read the entry pointed to by pointer
638 listOfEntries.seekg(pointer * sizeof(cellindex) * 3);
639 listOfEntries.read((char *) &readp, sizeof(cellindex));
640 listOfEntries.read((char *) &readi, sizeof(cellindex));
641 listOfEntries.read((char *) &readl, sizeof(cellindex));
642
643 if ((readi == index) && (readl == length)) {
644 // we've found the phrase stored here
645 return true;
646 } else {
647 // go on to the next node
648 pointer = readp;
649 }
650 }
651 }
652 return false;
653}
654
655
656void deleteLongPhraseMemory() {
657 // remove the hash & other files
658
659 hashTableFile.close();
660 listOfEntries.close();
661 remove(hashTableFileName);
662 remove(listOfEntriesName);
663
664}
665
666
667// Read the collection statistics file
668//
669void readStatistics() {
670
671 // open the statistics file
672 char filename[FILENAME_MAX];
673 sprintf(filename, "%s/clauses.stats", collection);
674
675 // Open the file
676 ifstream inFile(filename, ios::in);
677 if (!inFile) {
678 cerr << "File " << filename << " could not be opened\n";
679 exit(1);
680 }
681
682 // Read the numbers file into the numbers array
683 char key[1000];
684 symbol value;
685 while (inFile >> key >> value){
686 if (strcmp(key, "first_stopword") == 0) {
687 firstStopSymbol = value;
688 } else if (strcmp(key, "last_stopword") == 0) {
689 lastStopSymbol = value;
690 } else if (strcmp(key, "first_contentword") == 0) {
691 firstContentSymbol = value;
692 } else if (strcmp(key, "last_contentword") == 0) {
693 lastContentSymbol = value;
694 }
695 }
696 inFile.close();
697
698 // Make sure we have the information we need
699 if (!(firstStopSymbol && lastStopSymbol && firstContentSymbol && lastContentSymbol)) {
700 cerr << "Statistics file incomplete" << endl;
701 exit(1);
702 }
703}
704
705cellcount getContentCount(symbol firstContent) {
706
707 cellcount content=0;
708 for (cellcount i=0; i<inputLength; i++) {
709 if (symbols[i]>=firstContent) content++;
710 }
711
712 return content;
713}
714
715
716int main (int argc, char * argv[]) {
717
718 // Command-line arguments
719 // argv[1] is the phindex directory
720 // argv[2] is the mode, where 1 is stopword mode
721 // argv[3] is the verbosity (optional)
722 if (argc < 3) {
723 cerr << "Usage: " << argv[0] << " phind-directory mode [verbosity]" << endl;
724 exit(1);
725 }
726
727 // collection directory
728 strcpy(collection, argv[1]);
729
730 // mode parameter
731 phraseMode = atoi(argv[2]);
732 assert((phraseMode == STOPWORDS) || (phraseMode == ANYPHRASE));
733
734 // optional verbosity parameter
735 if (argc == 4) {
736 verbosity = atoi(argv[3]);
737 assert (verbosity >= 0);
738 }
739
740 if (verbosity) {
741 cout << "suffix: the phrase extraction program" << endl;
742 }
743 if (verbosity > 1) {
744 if (phraseMode == STOPWORDS) {
745 cout << "Stopwords mode: no phrase may begin or end with a stopword" << endl;
746 } else {
747 cout << "AllPhrase mode: extract every phrase that occurs more than once" << endl;
748 }
749 }
750
751 // Read the statistics file
752 readStatistics();
753
754 // Read the numbers file
755 readNumbers();
756
757 if (numberOfDocuments == 0) {
758 cerr << "There are no documents in this collection!" << endl;
759 exit(1);
760 }
761
762 symbol firstContent;
763 if (phraseMode==STOPWORDS) firstContent=firstContentSymbol;
764 else firstContent = firstStopSymbol;
765
766 // Allocate memory for the suffix & prefix arrays
767 cellcount contentLength = 0;
768 contentLength = getContentCount(firstContent);
769 suffixArray = new symbol *[contentLength];
770 prefixArray = new symbol *[contentLength];
771 if (prefixArray == NULL) {
772 cerr << "Suffix: not enough memory to hold " << inputLength << " symbols." << endl;
773 exit(2);
774 }
775 allocateSuffixCheck(contentLength);
776
777 // Initialise prefix and suffix arrays, only use the needed suffixes
778 for (cellcount j = 0, here = 0; j < inputLength; j++) {
779 if (symbols[j]>=firstContent) {
780 suffixArray[here] = &symbols[j];
781 prefixArray[here] = &symbols[j];
782 here++;
783 }
784 }
785 qsort(suffixArray, contentLength, sizeof(symbol *), suffixCompare);
786 qsort(prefixArray, contentLength, sizeof(symbol *), prefixCompare);
787
788 // Create the document arrays
789 if (verbosity > 1) {
790 cout << "Allocating document arrays for " << numberOfDocuments << " documents" << endl;
791 }
792
793 // The document frequency array is used to count the number of times
794 // each phrase occurs in each document. The number of documents in
795 // which a phrase occurs is stored in df.
796 frequency *documentFrequency = new frequency[numberOfDocuments];
797 frequency df;
798
799 // documentArray will be searched in order to discover which document
800 // each phrase occurs in.
801 documentArray = new symbol *[numberOfDocuments];
802 if (documentArray == NULL) {
803 cerr << "Suffix: out of memory allocating document arrays." << endl;
804 exit(2);
805 }
806
807 // just scan through the input text to find the doc starts
808 cellindex d = 0;
809 for (cellindex i=0; i<inputLength; i++) {
810 if (symbols[i] == DOCUMENTSTART) {
811 documentArray[d] = &symbols[i];
812 d++;
813 }
814 }
815
816 // the phrases stuff is expecting inputLength to be the length of the
817 // suffix array, so change it.
818 inputLength = contentLength;
819
820 // Extract phrases
821 //
822 // We will make several passes over the data, in each case considering
823 // a set of input phrases and generating a set of output phrases, which
824 // we will expand in later passes.
825 //
826 // The input phrases in the first pass will be the vocabulary.
827 // In later passes, the input phrases will be the output phrases of the
828 // previous pass.
829 //
830 // In each pass we will consider each input phrase in turn. If we
831 // have seen it before, we will ignore it. Otherwise, we will expand
832 // it and add its expansions to the set of output phrases.
833
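// Illustrative sketch, excluded from the build, of the pass structure
// described above: each pass consumes the (start, length) pairs written by the
// previous pass and writes the new expansions it finds, until a pass produces
// nothing. Item, expandOnce and runPasses are hypothetical stand-ins for the
// outPhrase files and getExpansions().
#if 0
#include <vector>

struct Item { unsigned start; unsigned length; };

static std::vector<Item> expandOnce(const Item &it) {
  std::vector<Item> out;
  if (it.length <= 8) {               // only short phrases are expanded further
    Item longer = { it.start, it.length + 1 };
    out.push_back(longer);            // stand-in for the real expansions
  }
  return out;
}

static void runPasses(std::vector<Item> input) {
  while (!input.empty()) {            // stop when a pass yields no new phrases
    std::vector<Item> output;
    for (size_t i = 0; i < input.size(); ++i) {
      std::vector<Item> expansions = expandOnce(input[i]);
      output.insert(output.end(), expansions.begin(), expansions.end());
    }
    input = output;                   // this pass's output feeds the next pass
  }
}
#endif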
834 cout <<"\ngenerating the phrase hierarchy\n\n";
835
836 // Store the phrase data in the phrases file
837 char phraseDataName[FILENAME_MAX];
838 sprintf(phraseDataName, "%s/phrases", collection);
839 ofstream phraseData(phraseDataName, ios::out);
840 if (!phraseData) {
841 cout << "File " << phraseDataName << " could not be opened\n";
842 exit(1);
843 }
844
845 // Count the number of phrases output
846 unsigned long int phraseCounter = 0;
847
848 // Set up the phrase expansion memory.
849 // We need this so that we don't expand a phrase more than once
850 initialisePhraseMemory();
851
852 // The current pass number
853 int phrasePass = 1;
854
855
856 // PASS NUMBER 1
857 if (verbosity > 1) {
858 cout << "Starting pass " << phrasePass << endl;
859 }
860
861 ofstream outPhrase;
862 char outPhraseName[FILENAME_MAX];
863 unsigned long int outPhraseCounter = 0;
864
865 // On the first pass, simply work through the vocabulary
866 sprintf(outPhraseName, "%s/outPhrase.1", collection);
867 outPhrase.open(outPhraseName, ios::out);
868 if (!outPhrase) {
869 cerr << "File " << outPhraseName << " could not be opened\n";
870 exit(1);
871 }
872
873 // Iterate over the different symbols by working through the suffix array
874 vector<Phrase> result;
875 cellindex ij = 0;
876 char *tmpString;
877
878 Phrase p;
879 while (ij < inputLength) {
880
881 // make a new phrase of length 1
882 p = Phrase(suffixArray[ij], 1, SUFFIX);
883 p.findFirstAndLastSuffix(ij, inputLength-1);
884
885 // We ignore this symbol if it occurs only once, if it is a delimiter,
886 // or if we are in stopwords mode and it is a stopword
887 // - in this new version, only need to check freq
888 // We could imagine a new mode/command-line option, which is like
889 // STOPWORDS but without this restriction. This would let you browse
890 // from "the" to "the AGRIS" for example, but not from "AGRIS" to
891 // "the AGRIS" (where the is a stopword and AGRIS a content word).
892 // The system used to work like this; it is easy to implement, but
893 // it explodes the size of the indexes. So: would it be useful?
894 if (p.suffixFrequency > 1) {
895 // Get minimal expansions of the phrase
896 getExpansions(p, result);
897
898 if (!result.empty()) {
899
900 // Remember that we have expanded this phrase
901 rememberThisPhrase(ij, 1);
902
903 // write the phrase text
904 phraseData << ij << "-1:" << p << ":" << p.suffixFrequency << ":"
905 << result.size() << ":";
906
907 // write the results
908 for (cellcount k = 0; k < result.size(); k++) {
909 if (k) {
910 phraseData << ",";
911 }
912 phraseData << result[k].firstSuffixIndex << "-" << result[k].length;
913 outPhrase << result[k].firstSuffixIndex << " " << result[k].length << endl;
914 outPhraseCounter++;
915 }
916 result.clear();
917
918 // Write the documents in which this phrase occurs
919 df = getDocumentOccurrances(p, documentFrequency);
920 phraseData << ":" << df << ":";
921
922 // write the documents
923 for (cellcount m = 0, first = 1; m < numberOfDocuments; m++) {
924 if (documentFrequency[m]) {
925 if (first) {
926 first = 0;
927 } else {
928 phraseData << ";";
929 }
930 // Output the document number. Note that here we've numbered the
931 // N documents from 0 to N-1, but later they'll be 1-N. Thus we
932 // add 1 to the document id when we output it.
933 phraseData << "d" << (m+1);
934 // Next, output the frequency with which the document occurs, but
935 // only if it is > 1.
936 if (documentFrequency[m] > 1) {
937 phraseData << "," << documentFrequency[m];
938 }
939 }
940 }
941
942 phraseData << endl;
943 phraseCounter++;
944
945 // feedback
946 if (verbosity) {
947 if (phraseCounter % 1000 == 0) {
948 cout << "phrase " << phraseCounter << ": "
949 << "cell " << p.firstSuffixIndex << " - " << p << endl;
950 }
951 }
952 }
953 }
954 ij = p.lastSuffixIndex + 1;
955 }
956 outPhrase.close();
957
958 // REMAINING PASSES
959 // The previous outPhrase file forms the input to each new pass
960 cellcount start, length;
961 while (outPhraseCounter > 0) {
962
963 // Start a new pass
964 phrasePass++;
965 if (verbosity) {
966 cout << "Starting pass " << phrasePass << endl;
967 }
968
969 // Open the input file
970 char inPhraseName[FILENAME_MAX];
971 sprintf(inPhraseName, "%s/outPhrase.%d", collection, phrasePass - 1);
972 ifstream inPhrase (inPhraseName, ios::in);
973 if (!inPhrase) {
974 cerr << "File " << inPhraseName << " could not be opened\n";
975 exit(1);
976 }
977
978 // Open the output file
979 sprintf(outPhraseName, "%s/outPhrase.%d", collection, phrasePass);
980 outPhrase.open(outPhraseName, ios::out);
981 if (!outPhrase) {
982 cerr << "File " << outPhraseName << " could not be opened\n";
983 exit(1);
984 }
985 outPhraseCounter = 0;
986
987 // Process each phrase
988 while(inPhrase >> start >> length) {
989
990 // Ignore the phrase if we have expanded it before
991 if (isPhraseStored(start, length))
992 continue;
993
994 // Remember that we have examined this phrase
995 rememberThisPhrase(start, length);
996
997 // Find the phrase in the suffix array
998 p = Phrase(suffixArray[start], length, SUFFIX);
999 p.findFirstAndLastSuffix(start, inputLength-1);
1000
1001 // Ignore the phrase if it only occurs once
1002 if (p.suffixFrequency < 2)
1003 continue;
1004
1005 // Write the phrase text;
1006 phraseData << start << "-" << length << ":" << p << ":" << p.suffixFrequency << ":";
1007
1008 // Expand the phrase, if it is no more than 8 words long
1009 if (length <= 8) {
1010
1011 // Get the minimal expansions for this phrase
1012 getExpansions(p, result);
1013
1014 // write the results
1015 phraseData << result.size() << ":";
1016
1017 for (cellcount i = 0; i < result.size(); i++) {
1018 if (i) {
1019 phraseData << ",";
1020 }
1021 phraseData << result[i].firstSuffixIndex << "-" << result[i].length;
1022 outPhrase << result[i].firstSuffixIndex << " " << result[i].length << endl;
1023 outPhraseCounter++;
1024 }
1025 result.clear();
1026
1027 } else {
1028 // phrase is too long to expand further
1029 phraseData << "0:";
1030 }
1031
1032 // Write the documents in which this phrase occurs
1033 df = getDocumentOccurrances(p, documentFrequency);
1034 phraseData << ":" << df << ":";
1035
1036 // write the documents
1037 for (cellcount i = 0, first = 1; i < numberOfDocuments; i++) {
1038 if (documentFrequency[i]) {
1039 if (first) {
1040 first = 0;
1041 } else {
1042 phraseData << ";";
1043 }
1044 // Output the document number. Note that here we've numbered the
1045 // N documents from 0 to N-1, but later they'll be 1-N. Thus we
1046 // add 1 to the document id when we output it.
1047 phraseData << "d" << (i+1);
1048 // Next, output the frequency with which the document occurs, but
1049 // only if it is > 1.
1050 if (documentFrequency[i] > 1) {
1051 phraseData << "," << documentFrequency[i];
1052 }
1053 }
1054 }
1055
1056 phraseData << endl;
1057 phraseCounter++;
1058
1059 // feedback
1060 if (verbosity) {
1061 if (phraseCounter % 1000 == 0) {
1062 cout << "phrase " << phraseCounter << ": "<< "start " << start
1063 << ", length " << length << " - " << p << endl;
1064 }
1065 }
1066
1067 }
1068
1069 inPhrase.close();
1070 outPhrase.close();
1071 }
1072
1073 phraseData.close();
1074 deletePhraseMemory();
1075
1076 delete [] documentFrequency;
1077 delete [] symbols;
1078 delete [] suffixArray;
1079 delete [] prefixArray;
1080 delete [] suffixCheck;
1081 delete [] documentArray;
1082
1083
1084
1085 cout << endl << "Done: " << phraseCounter << " phrases in " << phraseDataName << endl;
1086 return 0;
1087}
1088
1089
1090
1091
1092