source: trunk/gsdl/src/phind/generate/suffix.cpp@ 2801

Last change on this file since 2801 was 2801, checked in by kjm18, 23 years ago

new version of suffix, based on suffix2 (Gordon and Craig's simpler version)
with Kath's improvements

  • Property svn:keywords set to Author Date Id Revision
File size: 29.8 KB
1/**********************************************************************
2 *
3 * Suffix.cpp -- Extract the repeated phrases in the input with suffix
4 * and prefix arrays (cgn & gwp's simpler algorithm,
5 * and kjm's improvements).
6 *
7 * Copyright 2000 Gordon W. Paynter
8 * Copyright 2000 The New Zealand Digital Library Project
9 *
10 * A component of the Greenstone digital library software
11 * from the New Zealand Digital Library Project at the
12 * University of Waikato, New Zealand.
13 *
14 * This program is free software; you can redistribute it and/or modify
15 * it under the terms of the GNU General Public License as published by
16 * the Free Software Foundation; either version 2 of the License, or
17 * (at your option) any later version.
18 *
19 * This program is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 * GNU General Public License for more details.
23 *
24 * You should have received a copy of the GNU General Public License
25 * along with this program; if not, write to the Free Software
26 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27 *
28 *********************************************************************/
29
30#include <assert.h>
31#include <math.h>
32#include <stdio.h>
33#include <stdlib.h>
34#include <string.h>
35
36#if defined(GSDL_USE_IOS_H)
37# include <fstream.h>
38# include <iostream.h>
39#else
40# include <fstream>
41# include <iostream>
42#endif
43
44#if defined(GSDL_USE_STL_H)
45# if defined(GSDL_USE_ALGO_H)
46# include <algo.h>
47# else
48# include <algorithm.h>
49# endif
50# include <vector.h>
51#else
52# include <algorithm>
53# include <vector>
54#endif
55
56#include "suffix.h"
57#include "phrase.h"
58
59// Global variables declared in suffix.h
60cellcount inputLength;
61
62symbol *symbols;
63symbol **suffixArray;
64symbol **prefixArray;
65check *suffixCheck;
66
67
68// How many documents are in this collection?
69cellcount numberOfDocuments;
70symbol **documentArray;
71
72
73// Do we accept any phrase, or do we eliminate those that begin or end with stopwords?
74int phraseMode = ANYPHRASE; //STOPWORDS;
75
76
77// The filestem of the collection's phindex directory
78char collection[FILENAME_MAX];
79
80
81// The ranges of the stopword and content-word symbols for the collection
82symbol firstStopSymbol = 0;
83symbol lastStopSymbol = 0;
84symbol firstContentSymbol = 0;
85symbol lastContentSymbol = 0;
86
87
88// Some useful comparison functions, defined below.
89int suffixCompare(const void *, const void *);
90int prefixCompare(const void *, const void *);
91int pointerCompare(const void *, const void *);
92
93
94// Functions for implementing "phrase memory". These let us "remember"
95// each phrase that we've expanded without using too much memory.
96void initialisePhraseMemory();
97void rememberThisPhrase(cellindex index, cellcount length);
98bool isPhraseStored(cellindex index, cellcount length);
99void deletePhraseMemory();
100
101
102// how much output do we want?
103int verbosity = 1;
104
105
106// Get a phrase's expansions
107//
108// Get the set of "minimal", "maximal", non-unique expansions of a
109// phrase p, using the simpler algorithm that Craig and Gordon came up
110// with at Google.
111//
112// The expansions are returned in the results vector.
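// For example (an illustration only, not taken from the original comments):
// if p is the phrase "digital library", its candidate expansions are the
// one-symbol-longer phrases, such as "digital library software" or
// "zealand digital library", that still occur more than once in the input.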
113
114void getExpansions(Phrase &p, vector<Phrase> &results) {
115
116 // 1. Get the initial candidates
117 vector<Phrase> candidates;
118 p.initialSuffixCandidates(candidates);
119 int suffcands = candidates.size();
120 p.initialPrefixCandidates(candidates);
121
122 if (candidates.size() == 0)
123 return;
124
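 // 2. Expand the candidates: the suffix candidates (the first suffcands
 // entries) are extended at the prefix end while the extension is unique,
 // the remaining prefix candidates at the suffix end.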
125 vector<Phrase>::iterator i;
126 for (i = candidates.begin(); i != candidates.end() && suffcands > 0; ++i, --suffcands) {
127 i->expandWhileUniquePrefixExtension();
128 i->ensureSuffixFound();
129 }
130 for (; i != candidates.end(); ++i) {
131 i->expandWhileUniqueSuffixExtension();
132 }
133
134 // 3. Ensure minimality: ignore phrases whose subphrases are also found
135 results.clear();
136
137 // Initialise the candidates, check array, and various variables.
138 sort(candidates.begin(), candidates.end(), isShorter);
139 for (cellcount j = 0; j < inputLength; j++)
140 suffixCheck[j] = 0;
141 unsigned minimum_length = candidates.begin()->length;
142
143 // Try to add each candidate to the results set, ignoring the non-minimal
144 for (vector<Phrase>::iterator candidate = candidates.begin();
145 candidate != candidates.end(); candidate++) {
146
147 // Make a copy of candidate to mutilate while performing sub-phrase checks
148 Phrase temp_phrase(*candidate);
149 bool shorter_found = false;
150
151 // Check for shorter and shorter versions of the temporary phrase
152 while (temp_phrase.length >= minimum_length && !shorter_found) {
153 temp_phrase.ensureSuffixFound();
154 if (suffixCheck[temp_phrase.firstSuffixIndex] == 0)
155 temp_phrase.shortenByOneAtPrefix();
156 else
157 shorter_found = true;
158
159 // Possible efficiency gain here: we can finish early if the prefixes of
160 // candidate and temp_phrase are the same for candidate->length symbols.
161 }
162
163 if (!shorter_found) {
164 results.push_back(*candidate);
165 candidate->ensureSuffixFound();
166 for (cellcount k = candidate->firstSuffixIndex; k <= candidate->lastSuffixIndex; ++k)
167 suffixCheck[k] = candidate->length;
168 }
169 }
170}
171
172
173// suffixCompare
174//
175// Compare two pointers into a suffix array. We use this in the
176 // qsort function, so the inputs are pointers to pointers.
177 //
178 // Return -1 if (a < b); otherwise (a > b), so return +1.
179
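// (The scan below cannot run off the end of the input: the final
// COLLECTIONEND symbol occurs only once, so two distinct suffixes must
// differ at or before it.)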
180int suffixCompare(const void *cpa, const void *cpb) {
181
182 // Cast then dereference pointers to suffix array elements
183 symbol *pa = (symbol *) cpa;
184 symbol *pb = (symbol *) cpb;
185 pa = (symbol *) *pa;
186 pb = (symbol *) *pb;
187
188 // If the two elements are the same, examine the next one
189 while (*pa == *pb) {
190 pa++;
191 pb++;
192 }
193
194 // Make the comparison and return
195 if ( *pa < *pb) {
196 return -1;
197 } else {
198 return +1;
199 }
200}
201
202
203// prefixCompare
204//
205// Compare two pointers into a prefix array. We use this in the
206 // qsort function, so the inputs are pointers to pointers.
207 //
208 // Return -1 if (a > b); otherwise (a < b), so return +1.
209
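// (The backwards scan below cannot run off the start of the input: the
// initial COLLECTIONSTART symbol occurs only once, so two distinct prefixes
// must differ before the scan passes it.)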
210int prefixCompare(const void *cpa, const void *cpb) {
211
212 // Cast then dereference pointers to prefix array elements
213 symbol *pa = (symbol *) cpa;
214 symbol *pb = (symbol *) cpb;
215 pa = (symbol *) *pa;
216 pb = (symbol *) *pb;
217
218 // If the two elements are the same, examine the next one
219 while (*pa == *pb) {
220 pa--;
221 pb--;
222 }
223
224 // Make the comparison and return
225 if ( *pa > *pb) {
226 return -1;
227 } else {
228 return +1;
229 }
230}
231
232// pointerCompare
233//
234// Compare two pointers based on the memory location they point to.
235
236int pointerCompare( const void *pa, const void *pb ) {
237
238 symbol **a = (symbol **) pa;
239 symbol **b = (symbol **) pb;
240
241 if (*a < *b) {
242 return -1;
243 } else if (*a > *b) {
244 return 1;
245 } else {
246 return 0;
247 }
248}
249
250
251// Read the clauses.numbers file into the "symbols" array.
252//
253// Each number in the file is a symbol number; it is essential that
254// the first symbol (and no others) be COLLECTIONSTART and the last
255// symbol (and no others) be COLLECTIONEND.
256//
257// Return the number of numbers in the array.
258
259int readNumbers() {
260
261 char filename[FILENAME_MAX];
262 sprintf(filename, "%s/clauses.numbers", collection);
263 if (verbosity) {
264 cout << "Reading numbers file: " << filename << endl;
265 }
266
267 // Open the numbers file
268 ifstream inFile1(filename, ios::in);
269 if (!inFile1) {
270 cerr << "File " << filename << " could not be opened\n";
271 exit(1);
272 }
273
274 // Count the number of symbols
275 inputLength = 0;
276 symbol word;
277 while (inFile1 >> word) {
278 inputLength++;
279 }
280 inFile1.close();
281
282 // Allocate the symbols array
283 if (verbosity > 1) {
284 cout << "Allocating symbol arrays for " << inputLength << " symbols" << endl;
285 }
286 symbols = new symbol[inputLength];
287 if (symbols == NULL) {
288 cerr << "Suffix error: not enough memory to hold " << inputLength
289 << " symbols." << endl;
290 exit(2);
291 }
292
293 // Read the numbers file into the numbers array
294 if (verbosity > 2) {
295 cout << "Reading the numbers" << endl;
296 }
297
298 ifstream inFile2(filename, ios::in);
299 if (!inFile2) {
300 cerr << "File " << filename << " could not be opened\n";
301 exit(1);
302 }
303 cellcount next = 0;
304 numberOfDocuments = 0;
305 while (inFile2 >> word) {
306 symbols[next++] = word;
307 if (word == DOCUMENTSTART) {
308 numberOfDocuments++;
309 }
310 }
311 inFile2.close();
312
313 // Make sure the numbers file is intact
314 assert(symbols[0] == COLLECTIONSTART);
315 assert(symbols[next-1] == COLLECTIONEND);
316
317 return inputLength;
318}
319
320
321
322// Get document occurrence statistics
323//
324// Given a phrase, what documents does it occur in?
325
326cellcount getDocumentOccurrances(const Phrase &p, cellcount *frequency) {
327
328 // cout << "searching for \""<< p << "\" in documents "
329 // << 0 << "-" << numberOfDocuments - 1 << endl;
330
331 // The number of documents in which this phrase occurs
332 cellcount df = 0;
333
334 // Initialise the document frequency array
335 for (cellindex i = 0; i < numberOfDocuments; i++) {
336 frequency[i] = 0;
337 }
338
339 // variables used to facilitate the search
340 cellindex begin;
341 cellindex end;
342 cellindex d;
343 symbol *target;
344 bool found;
345
346 // search for the document in which each occurrence of the phrase is found
347 for (cellcount j = p.firstSuffixIndex; j <= p.lastSuffixIndex; j++) {
348
349 // cout << "looking for phrase at suffixArray[" << j << "]\n";
350
351 target = suffixArray[j];
352 begin = 0;
353 end = numberOfDocuments - 1;
354 found = false;
355
356 // Search for the occurrence of a document delimiter that target
357 // occurs immediately after.
358 // We do this by performing a binary chop search on documentArray.
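 // (documentArray holds the addresses of the DOCUMENTSTART symbols in
 // increasing order, so comparing target against these delimiter addresses
 // identifies the document that contains it.)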
359 while (!found) {
360
361 // cout << "searching for " << (cellindex) target << " in "
362 // << begin << " - " << end << endl;
363
364 assert (begin <= end);
365
366 // If the beginning and end of the interval are the same,
367 // then we've found the correct document
368 if (begin == end) {
369 if (frequency[begin] == 0) {
370 df++;
371 }
372 frequency[begin]++;
373 found = true;
374 }
375
376 // Otherwise, examine a new document midway through the begin-end
377 // interval and see if it is the one.
378 else {
379 d = (begin + end) / 2;
380 if (target > documentArray[d]) {
381 // If the target address is greater than this, but this is the last document,
382 // then this must be the one we want. Or, if target is greater than
383 // this one but less than the next, this must be the one we want.
384 if ((d == numberOfDocuments - 1) || (target < documentArray[d+1])) {
385 if (frequency[d] == 0) {
386 df++;
387 }
388 frequency[d]++;
389 found = true;
390 } else {
391 // otherwise we know to search later in the document set
392 begin = d + 1;
393 }
394 } else {
395 // search earlier in the document set
396 end = d - 1;
397 }
398 }
399 }
400 }
401 return df;
402}
403
404
405
406
407
408
409// phraseExpansionMemory : Which phrases have we expanded?
410//
411// A set of utilities for keeping track of which phrases we have expanded.
412// We don't want to expand a phrase more than once, after all.
413//
414// This REALLY ought to be in its own class, but it works so that's okay.
415//
416// Phrases are identified by their firstSuffixIndex and length.
417//
418// Functions provided are:
419// void initialisePhraseMemory()
420// void rememberThisPhrase(index, length)
421// bool isPhraseStored(index, length)
422// void deletePhraseMemory()
423//
424// Internally, we will have two separate cases:
425//
426// Phrases of length 1-8:
427// unsigned char phraseMemory[inputLength]
428// is an array where each cell "remembers" the corresponding index in the
429 // suffixArray, and each of the 8 bits of the cell corresponds to the phrases
430// of length 1, 2... 8.
431// Eventually, we will make this disk-based (i.e. store the array in a file).
432//
433// Phrases of length 9+:
434// file hashTableFile
435// file listOfEntries
436// The first file is a hash table; each phrase maps to one of its cells, which
437 // contains either 0 (empty, no occurrence) or a number which is an entry number
438// in the second file. This file contains a "list" of entries. Each consists of
439// three numbers: the suffixArray index of the phrase, the length of the phrase,
440// and the entry number of the next phrase with the same hash.
441//
442
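// A sketch of the on-disk record layout, derived from the code below
// (illustrative only; the original code reads and writes the three
// cellindex fields directly rather than through a struct):
//
//   // one entry in listOfEntries, at byte offset entryNumber * 3 * sizeof(cellindex)
//   struct LongPhraseEntry {
//     cellindex next;    // entry number of the next phrase with the same hash, 0 = end of list
//     cellindex index;   // suffixArray index of the phrase
//     cellindex length;  // length of the phrase in symbols
//   };
//
//   // cell h of hashTableFile, at byte offset h * sizeof(cellindex), holds 0 (empty)
//   // or the entry number of the first LongPhraseEntry whose phrase hashes to
//   //   h = (index + length) % bigPrime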
443
444unsigned char *phraseMemory;
445
446void initialiseLongPhraseMemory();
447void rememberThisLongPhrase(cellindex index, cellcount length);
448bool isLongPhraseStored(cellindex index, cellcount length);
449void deleteLongPhraseMemory();
450
451
452void initialisePhraseMemory() {
453
454 phraseMemory = new unsigned char[inputLength];
455
456 // to begin with, everything is empty
457 for (cellcount i = 0; i < inputLength; i++) {
458 phraseMemory[i] = 0;
459 }
460
461 // initialise the hashTable of long phrases
462 initialiseLongPhraseMemory();
463
464}
465
466void rememberThisPhrase(cellindex index, cellcount length) {
467
468 // if the phrase is very long, use the file-based system
469 if (length > 8) {
470 rememberThisLongPhrase(index, length);
471 return;
472 }
473
474 // create a char with just the bit corresponding to length set
475 unsigned char newbit = 1;
476 for (cellcount i = 1; i < length; i++) {
477 newbit <<= 1;
478 }
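 // (The loop above computes newbit == 1 << (length - 1): bit 0 marks a
 // phrase of length 1 at this index, bit 7 a phrase of length 8.)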
479
480 // set that bit in the memory array at position index
481 phraseMemory[index] |= newbit;
482}
483
484
485bool isPhraseStored(cellindex index, cellcount length) {
486
487 // if the phrase is very long, use the file-based system
488 if (length > 8) {
489 return isLongPhraseStored(index, length);
490 }
491
492 // create a char with just the bit corresponding to length set
493 unsigned char newbit = 1;
494 for (cellcount i = 1; i < length; i++) {
495 newbit <<= 1;
496 }
497
498 // return true if that bit is set in the memory array at position index
499 return (phraseMemory[index] & newbit);
500}
501
502void deletePhraseMemory() {
503 delete [] phraseMemory;
504 deleteLongPhraseMemory();
505}
506
507
508
509// Files etc. used to store "long" equivalents of the above
510
511fstream hashTableFile;
512char hashTableFileName[FILENAME_MAX];
513fstream listOfEntries;
514char listOfEntriesName[FILENAME_MAX];
515cellindex nextEntryNumber;
516
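// The number of cells in the on-disk hash table; presumably chosen prime so
// that the (index + length) % bigPrime hash spreads entries evenly.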
517const cellcount bigPrime = 7919;
518
519
520void initialiseLongPhraseMemory() {
521
522 cellindex example = 0;
523
524 sprintf(hashTableFileName, "%s/hashTable", collection);
525 sprintf(listOfEntriesName, "%s/hashLists", collection);
526
527
528 // create the new hashtable
529 if (verbosity > 1) {
530 cout << "Initialising hashTable: " << hashTableFileName << endl;
531 }
532 hashTableFile.open(hashTableFileName, ios::in | ios::out);
533 for (cellcount i = 0; i < bigPrime; i++) {
534 hashTableFile.write((char *) &example, sizeof(example));
535 }
536
537 // create the list of phrases
538 if (verbosity > 1) {
539 cout << "Initialising list of hashtable entries: " << listOfEntriesName << endl;
540 }
541 listOfEntries.open(listOfEntriesName, ios::in | ios::out);
542 listOfEntries.write((char *) &example, sizeof(example));
543 listOfEntries.write((char *) &example, sizeof(example));
544 listOfEntries.write((char *) &example, sizeof(example));
545 nextEntryNumber = 1;
546}
547
548
549void rememberThisLongPhrase(cellindex index, cellcount length) {
550
551 // cout << "rememberThisLongPhrase(" << index << ", " << length << ")\n";
552
553 cellindex hashOffset = ((index + length) % bigPrime) * sizeof(cellindex);
554 cellindex pointer;
555 cellindex zero = 0;
556 cellindex readp = 0;
557 cellindex readi = 0;
558 cellindex readl = 0;
559
560 hashTableFile.seekg(hashOffset);
561 hashTableFile.read((char *) &pointer, sizeof(cellindex));
562
563 if (pointer == 0) {
564 // There is no entry at all in the hash table for this phrase
565 // so create one
566
567 pointer = nextEntryNumber++;
568 hashTableFile.seekp(hashOffset);
569 hashTableFile.write((char *) &pointer, sizeof(cellindex));
570
571 listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
572 listOfEntries.write((char *) &zero, sizeof(cellindex));
573 listOfEntries.write((char *) &index, sizeof(cellindex));
574 listOfEntries.write((char *) &length, sizeof(cellindex));
575
576 } else {
577 // There is a list starting at this hash value, so the phrase may
578 // be already remembered, or it might need to be appended
579
580 while (pointer != 0) {
581 // Read the entry pointed to by pointer
582 listOfEntries.seekg(pointer * sizeof(cellindex) * 3);
583 listOfEntries.read((char *) &readp, sizeof(cellindex));
584 listOfEntries.read((char *) &readi, sizeof(cellindex));
585 listOfEntries.read((char *) &readl, sizeof(cellindex));
586
587 // cout << "read " << pointer << ", " << readp << ", " << readi << ", " << readl << endl;
588
589 if ((readi == index) && (readl == length)) {
590 // we've found that we've already stored it
591 return;
592 } else if (readp == 0) {
593 // we've reached the end of the list. Add a new entry.
594 listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
595 listOfEntries.write((char *) &nextEntryNumber, sizeof(cellindex));
596 pointer = nextEntryNumber++;
597
598 listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
599 listOfEntries.write((char *) &zero, sizeof(cellindex));
600 listOfEntries.write((char *) &index, sizeof(cellindex));
601 listOfEntries.write((char *) &length, sizeof(cellindex));
602 return;
603 } else {
604 // go on to the next node
605 pointer = readp;
606 }
607 }
608 }
609
610
611}
612
613
614bool isLongPhraseStored(cellindex index, cellcount length) {
615
616 // cout << "isLongPhraseExpanded(" << index << ", " << length << ")\n";
617
618 cellindex hashOffset = ((index + length) % bigPrime) * sizeof(cellindex);
619 cellindex pointer;
620 cellindex readp = 0;
621 cellindex readi = 0;
622 cellindex readl = 0;
623
624 // Find the phrase in the hashFile
625 hashTableFile.seekg(hashOffset);
626 hashTableFile.read((char *) &pointer, sizeof(cellindex));
627
628 if (pointer == 0) {
629 // There is no entry at all in the hash table for this phrase
630 // so nothing is stored
631 return false;
632
633 } else {
634 // There is a list starting at this hash value, so the phrase may
635 // be already remembered in that list
636 while (pointer != 0) {
637 // Read the entry pointed to by pointer
638 listOfEntries.seekg(pointer * sizeof(cellindex) * 3);
639 listOfEntries.read((char *) &readp, sizeof(cellindex));
640 listOfEntries.read((char *) &readi, sizeof(cellindex));
641 listOfEntries.read((char *) &readl, sizeof(cellindex));
642
643 if ((readi == index) && (readl == length)) {
644 // we've found the phrase stored here
645 return true;
646 } else {
647 // go on to the next node
648 pointer = readp;
649 }
650 }
651 }
652 return false;
653}
654
655
656void deleteLongPhraseMemory() {
657 // remove the hash & other files
658
659 hashTableFile.close();
660 listOfEntries.close();
661 remove(hashTableFileName);
662 remove(listOfEntriesName);
663
664}
665
666
667// Read the collection statistics file
668//
669void readStatistics() {
670
671 // open the statistics file
672 char filename[FILENAME_MAX];
673 sprintf(filename, "%s/clauses.stats", collection);
674
675 // Open the file
676 ifstream inFile(filename, ios::in);
677 if (!inFile) {
678 cerr << "File " << filename << " could not be opened\n";
679 exit(1);
680 }
681
682 // Read the numbers file into the numbers array
683 char key[1000];
684 symbol value;
685 while (inFile >> key >> value){
686 if (strcmp(key, "first_stopword") == 0) {
687 firstStopSymbol = value;
688 } else if (strcmp(key, "last_stopword") == 0) {
689 lastStopSymbol = value;
690 } else if (strcmp(key, "first_contentword") == 0) {
691 firstContentSymbol = value;
692 } else if (strcmp(key, "last_contentword") == 0) {
693 lastContentSymbol = value;
694 }
695 }
696 inFile.close();
697
698 // Make sure we have the information we need
699 if (!(firstStopSymbol && lastStopSymbol && firstContentSymbol && lastContentSymbol)) {
700 cerr << "Statistics file incomplete" << endl;
701 exit(1);
702 }
703}
704
705cellcount getContentCount(symbol firstContent) {
706
707 cellcount content=0;
708 for (cellcount i=0; i<inputLength; i++) {
709 if (symbols[i]>=firstContent) content++;
710 }
711
712 return content;
713}
714
715int main (int argc, char * argv[]) {
716
717 // Command-line arguments
718 // argv[1] is the phindex directory
719 // argv[2] is the phrase mode (STOPWORDS or ANYPHRASE)
720 // argv[3] is the verbosity (optional)
721 if (argc < 3) {
722 cerr << "Usage: " << argv[0] << " phind-directory mode [verbosity]" << endl;
723 exit(1);
724 }
725
726 // collection directory
727 strcpy(collection, argv[1]);
728
729 // mode parameter
730 phraseMode = atoi(argv[2]);
731 assert((phraseMode == STOPWORDS) || (phraseMode == ANYPHRASE));
732
733 // optional verbosity parameter
734 if (argc == 4) {
735 verbosity = atoi(argv[3]);
736 assert (verbosity >= 0);
737 }
738
739 if (verbosity) {
740 cout << "suffix: the phrase extraction program" << endl;
741 }
742
743 if (verbosity > 1) {
744 if (phraseMode == STOPWORDS) {
745 cout << "Stopwords mode: no phrase may begin or end with a stopword" << endl;
746 } else {
747 cout << "AllPhrase mode: extract every phrase that occurs more than once" << endl;
748 }
749 }
750
751 // Read the statistics file
752 readStatistics();
753
754 // Read the numbers file
755 readNumbers();
756
757 if (numberOfDocuments == 0) {
758 cerr << "There are no documents in this collection!" << endl;
759 exit(1);
760 }
761
762 symbol firstContent;
763 if (phraseMode==STOPWORDS) firstContent=firstContentSymbol;
764 else firstContent = firstStopSymbol;
765
766 cellcount contentLength = 0;
767 contentLength = getContentCount(firstContent);
768
769 // Create the suffix & prefix arrays
770 suffixArray = new symbol *[contentLength];
771 prefixArray = new symbol *[contentLength];
772
773 cellcount here=0;
774 // Initialise prefix and suffix arrays, only use the needed suffixes
775 for (cellcount j = 0; j < inputLength; j++) {
776 if (symbols[j]>=firstContent) {
777 suffixArray[here] = &symbols[j];
778 prefixArray[here] = &symbols[j];
779 here++;
780 }
781 }
782 qsort(suffixArray, contentLength, sizeof(symbol *), suffixCompare);
783 qsort(prefixArray, contentLength, sizeof(symbol *), prefixCompare);
784
785 suffixCheck = new check[contentLength];
786 if (suffixCheck == NULL) {
787 cerr << "Suffix error: not enough memory to hold " << contentLength << " check cells." << endl;
788 exit(2);
789 }
790 for (cellcount j = 0; j < contentLength; j++)
791 suffixCheck[j] = 0;
792
793 cout <<"\ngenerating the phrase hierarchy\n\n";
794
795 // Create the document arrays
796 if (verbosity > 1) {
797 cout << "Allocating document arrays for " << numberOfDocuments << " documents" << endl;
798 }
799
800 // The document frequency array is used to count the number of times
801 // each phrase occurs in each document. The number of documents in
802 // which a phrase occurs is stored in df.
803 frequency *documentFrequency = new frequency[numberOfDocuments];
804 frequency df;
805
806 // documentArray will be searched in order to discover which document
807 // each phrase occurs in.
808 documentArray = new symbol *[numberOfDocuments];
809
810 // just scan through the input text to find the doc starts
811 cellindex d = 0;
812 for (cellindex i=0; i<inputLength; i++) {
813 if (symbols[i] == DOCUMENTSTART) {
814 documentArray[d] = &symbols[i];
815 d++;
816 }
817 }
818
819 // the phrases stuff is expecting inputLength to be the length of the
820 // suffix array, so change it.
821 inputLength = contentLength;
822
823 // Extract phrases
824 //
825 // We will make several passes over the data, in each case considering
826 // a set of input phrases and generating a set of output phrases, which
827 // we will expand in later passes.
828 //
829 // The input phrases in the first pass will be the vocabulary.
830 // In later passes, the input phrases will be the output phrases of the
831 // previous pass.
832 //
833 // In each pass we will consider each input phrase in turn. If we
834 // have seen it before, we will ignore it. Otherwise, we will expand
835 // it and add its expansions to the set of output phrases.
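 //
 // Sketch of the resulting data flow (summarising the code below): pass 1
 // works through the vocabulary in suffixArray and writes its expansions to
 // outPhrase.1; each later pass n reads outPhrase.(n-1), expands every phrase
 // it has not seen before, and writes the new expansions to outPhrase.n.
 // Each pass also writes one record per retained phrase to the "phrases" file.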
836
837 // Store the phrase data in the phrases file
838 char phraseDataName[FILENAME_MAX];
839 sprintf(phraseDataName, "%s/phrases", collection);
840 ofstream phraseData(phraseDataName, ios::out);
841 if (!phraseData) {
842 cerr << "File " << phraseDataName << " could not be opened\n";
843 exit(1);
844 }
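 // Each record written to the phrases file below has roughly the form
 // (derived from the output statements that follow):
 //
 //   index-length:phrase text:frequency:n:exp1,...,expn:df:dA[,fA];dB[,fB];...
 //
 // where each expansion is "suffixIndex-length", df is the number of
 // documents containing the phrase, and each document entry is "d" plus the
 // document number, followed by ",frequency" only when that frequency
 // exceeds 1.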
845
846 // Count the number of phrases output
847 unsigned long int phraseCounter = 0;
848
849 // Set up the phrase expansion memory.
850 // We need this so that we don't expand a phrase more than once
851 initialisePhraseMemory();
852
853 // The current pass number
854 int phrasePass = 1;
855
856
857 // PASS NUMBER 1
858 if (verbosity > 1) {
859 cout << "Starting pass " << phrasePass << endl;
860 }
861
862 ofstream outPhrase;
863 char outPhraseName[FILENAME_MAX];
864 unsigned long int outPhraseCounter = 0;
865
866 // On the first pass, simply work through the vocabulary
867 sprintf(outPhraseName, "%s/outPhrase.1", collection);
868 outPhrase.open(outPhraseName, ios::out);
869 if (!outPhrase) {
870 cerr << "File " << outPhraseName << " could not be opened\n";
871 exit(1);
872 }
873
874 // Iterate over the different symbols by working through the suffix array
875 vector<Phrase> result;
876 cellindex ij = 0;
877 char *tmpString;
878
879 Phrase p;
880 while (ij < inputLength) {
881
882 // make a new phrase of length 1
883 p = Phrase(suffixArray[ij], 1, SUFFIX);
884 p.findFirstAndLastSuffix(ij, inputLength-1);
885
886 // We ignore this symbol if it occurs only once, if it is a delimiter,
887 // or if we are in stopwords mode and it is a stopword
888 // - in this new version, only need to check freq
889 // We could imagine a new mode/command-line option, which is like
890 // STOPWORDS but without this restriction. This would let you browse
891 // from "the" to "the AGRIS" for example, but not from "AGRIS" to
892 // "the AGRIS" (where the is a stopword and AGRIS a content word).
893 // The system used to work like this; it is easy to implement, but
894 // it explodes the size of the indexes. So: would it be useful?
895 if (p.suffixFrequency > 1) {
896 // Get minimal expansions of the phrase
897 getExpansions(p, result);
898
899 if (!result.empty()) {
900
901 // Remember that we have expanded this phrase
902 rememberThisPhrase(ij, 1);
903
904 // write the phrase text
905 phraseData << ij << "-1:" << p << ":" << p.suffixFrequency << ":"
906 << result.size() << ":";
907
908 // write the results
909 for (cellcount k = 0; k < result.size(); k++) {
910 if (k) {
911 phraseData << ",";
912 }
913 phraseData << result[k].firstSuffixIndex << "-" << result[k].length;
914 outPhrase << result[k].firstSuffixIndex << " " << result[k].length << endl;
915 outPhraseCounter++;
916 }
917 result.clear();
918
919 // Write the documents in which this phrase occurs
920 df = getDocumentOccurrances(p, documentFrequency);
921 phraseData << ":" << df << ":";
922
923 // write the documents
924 for (cellcount m = 0, first = 1; m < numberOfDocuments; m++) {
925 if (documentFrequency[m]) {
926 if (first) {
927 first = 0;
928 } else {
929 phraseData << ";";
930 }
931 // Output the document number. Note that here we've numbered the
932 // N documents from 0 to N-1, but later they'll be 1-N. Thus we
933 // add 1 to the document id when we output it.
934 phraseData << "d" << (m+1);
935 // Next, output the frequency with which the document occurs, but
936 // only if it is > 1.
937 if (documentFrequency[m] > 1) {
938 phraseData << "," << documentFrequency[m];
939 }
940 }
941 }
942
943 phraseData << endl;
944 phraseCounter++;
945
946 // feedback
947 if (verbosity) {
948 if (phraseCounter % 1000 == 0) {
949 cout << "phrase " << phraseCounter << ": "
950 << "cell " << p.firstSuffixIndex << " - " << p << endl;
951 }
952 }
953 }
954 }
955 ij = p.lastSuffixIndex + 1;
956 }
957 outPhrase.close();
958
959 // REMAINING PASSES
960 // The previous outPhrase file forms the input to each new pass
961 cellcount start, length;
962 while (outPhraseCounter > 0) {
963
964 // Start a new pass
965 phrasePass++;
966 if (verbosity) {
967 cout << "Starting pass " << phrasePass << endl;
968 }
969
970 // Open the input file
971 char inPhraseName[FILENAME_MAX];
972 sprintf(inPhraseName, "%s/outPhrase.%d", collection, phrasePass - 1);
973 ifstream inPhrase (inPhraseName, ios::in);
974 if (!inPhrase) {
975 cerr << "File " << inPhraseName << " could not be opened\n";
976 exit(1);
977 }
978
979 // Open the output file
980 sprintf(outPhraseName, "%s/outPhrase.%d", collection, phrasePass);
981 outPhrase.open(outPhraseName, ios::out);
982 if (!outPhrase) {
983 cerr << "File " << outPhraseName << " could not be opened\n";
984 exit(1);
985 }
986 outPhraseCounter = 0;
987
988 // Process each phrase
989 while(inPhrase >> start >> length) {
990
991 // Ignore the phrase if we have expanded it before
992 if (isPhraseStored(start, length))
993 continue;
994
995 // Remember that we have examined this phrase
996 rememberThisPhrase(start, length);
997
998 // Find the phrase in the suffix array
999 p = Phrase(suffixArray[start], length, SUFFIX);
1000 p.findFirstAndLastSuffix(start, inputLength-1);
1001
1002 // Ignore the phrase if it only occurs once
1003 if (p.suffixFrequency < 2)
1004 continue;
1005
1006 // Write the phrase text;
1007 phraseData << start << "-" << length << ":" << p << ":" << p.suffixFrequency << ":";
1008
1009 // Expand the phrase, if it is eight or fewer words long
1010 if (length <= 8) {
1011
1012 // Get the minimal expansions for this phrase
1013 getExpansions(p, result);
1014
1015 // write the results
1016 phraseData << result.size() << ":";
1017
1018 for (cellcount i = 0; i < result.size(); i++) {
1019 if (i) {
1020 phraseData << ",";
1021 }
1022 phraseData << result[i].firstSuffixIndex << "-" << result[i].length;
1023 outPhrase << result[i].firstSuffixIndex << " " << result[i].length << endl;
1024 outPhraseCounter++;
1025 }
1026 result.clear();
1027
1028 } else {
1029 // phrase is too long to expand further
1030 phraseData << "0:";
1031 }
1032
1033 // Write the documents in which this phrase occurs
1034 df = getDocumentOccurrances(p, documentFrequency);
1035 phraseData << ":" << df << ":";
1036
1037 // write the documents
1038 for (cellcount i = 0, first = 1; i < numberOfDocuments; i++) {
1039 if (documentFrequency[i]) {
1040 if (first) {
1041 first = 0;
1042 } else {
1043 phraseData << ";";
1044 }
1045 // Output the document number. Note that here we've numbered the
1046 // N documents from 0 to N-1, but later they'll be 1-N. Thus we
1047 // add 1 to the document id when we output it.
1048 phraseData << "d" << (i+1);
1049 // Next, output the frequency with which the document occurs, but
1050 // only if it is > 1.
1051 if (documentFrequency[i] > 1) {
1052 phraseData << "," << documentFrequency[i];
1053 }
1054 }
1055 }
1056
1057 phraseData << endl;
1058 phraseCounter++;
1059
1060 // feedback
1061 if (verbosity) {
1062 if (phraseCounter % 1000 == 0) {
1063 cout << "phrase " << phraseCounter << ": "<< "start " << start
1064 << ", length " << length << " - " << p << endl;
1065 }
1066 }
1067
1068 }
1069
1070 inPhrase.close();
1071 outPhrase.close();
1072 }
1073
1074 phraseData.close();
1075 deletePhraseMemory();
1076
1077 delete [] documentFrequency;
1078 delete [] symbols;
1079 delete [] suffixArray;
1080 delete [] prefixArray;
1081 delete [] suffixCheck;
1082 delete [] documentArray;
1083
1084
1085
1086 cout << endl << "Done: " << phraseCounter << " phrases in " << phraseDataName << endl;
1087 return 0;
1088}
1089
1090
1091
1092
1093