source: trunk/gsdl/src/phind/generate/suffix.cpp@ 2806

Last change on this file since 2806 was 2806, checked in by kjm18, 22 years ago

changed initialisation loops to memset

  • Property svn:keywords set to Author Date Id Revision
File size: 30.1 KB
1/**********************************************************************
2 *
3 * Suffix.cpp -- Extract the repeated phrases in the input with suffix
4 * and prefix arrays (cgn & gwp's simpler algorithm,
5 * and kjm's improvements).
6 *
7 * Copyright 2000 Gordon W. Paynter
8 * Copyright 2000 The New Zealand Digital Library Project
9 *
10 * A component of the Greenstone digital library software
11 * from the New Zealand Digital Library Project at the
12 * University of Waikato, New Zealand.
13 *
14 * This program is free software; you can redistribute it and/or modify
15 * it under the terms of the GNU General Public License as published by
16 * the Free Software Foundation; either version 2 of the License, or
17 * (at your option) any later version.
18 *
19 * This program is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 * GNU General Public License for more details.
23 *
24 * You should have received a copy of the GNU General Public License
25 * along with this program; if not, write to the Free Software
26 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27 *
28 *********************************************************************/
29
30#include <assert.h>
31#include <math.h>
32#include <stdio.h>
33#include <stdlib.h>
34#include <string.h>
35
36#if defined(GSDL_USE_IOS_H)
37# include <fstream.h>
38# include <iostream.h>
39#else
40# include <fstream>
41# include <iostream>
42#endif
43
44#if defined(GSDL_USE_STL_H)
45# if defined(GSDL_USE_ALGO_H)
46# include <algo.h>
47# else
48# include <algorithm.h>
49# endif
50# include <vector.h>
51#else
52# include <algorithm>
53# include <vector>
54#endif
55
56#include "suffix.h"
57#include "phrase.h"
58
59// Global variables declared in suffix.h
60cellcount inputLength;
61
62symbol *symbols;
63symbol **suffixArray;
64symbol **prefixArray;
65check *suffixCheck;
66
67
68// How many documents are in this collection?
69cellcount numberOfDocuments;
70symbol **documentArray;
71
72
73// Do we accept any phrase, or do we eliminate those ending with stopwords ?
74int phraseMode = ANYPHRASE; //STOPWORDS;
75
76
77// The filestem of the collection's phindex directory
78char collection[FILENAME_MAX];
79
80
81// The ranges of the stopword and content-word symbols for the collection
82symbol firstStopSymbol = 0;
83symbol lastStopSymbol = 0;
84symbol firstContentSymbol = 0;
85symbol lastContentSymbol = 0;
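// (The code below relies on the symbol numbering convention that delimiter
// symbols are numbered below firstStopSymbol and stopword symbols below
// firstContentSymbol; see getContentCount() and the choice of firstContent
// in main().)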
86
87
88// Some useful comparison functions, defined below.
89int suffixCompare(const void *, const void *);
90int prefixCompare(const void *, const void *);
91int pointerCompare(const void *, const void *);
92
93
94// Functions for implementing "phrase memory". These let us "remember"
95// each phrase that we've expanded without using too much memory.
96void initialisePhraseMemory();
97void rememberThisPhrase(cellindex index, cellcount length);
98bool isPhraseStored(cellindex index, cellcount length);
99void deletePhraseMemory();
100
101
102// how much output do we want?
103int verbosity = 1;
104
105
106// Get a phrase's expansions
107//
108// Get the set of "minimal", "maximal", non-unique expansions of a
109// phrase p, using the simpler algorithm that Craig and Gordon came up
110// with at Google.
111//
112// Returns a vector of Expansions.
113
114void getExpansions(Phrase &p, vector<Phrase> &results) {
115
116 // 1. Get the initial candidates
117 vector<Phrase> candidates;
118 p.initialSuffixCandidates(candidates);
119 int suffcands = candidates.size();
120 p.initialPrefixCandidates(candidates);
121
122 if (candidates.size() == 0)
123 return;
124
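  // 2. Expand the candidates: the first suffcands entries are suffix
  // candidates, which are expanded at the prefix end; the remaining entries
  // are prefix candidates, which are expanded at the suffix end.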
125 vector<Phrase>::iterator i;
126 for (i = candidates.begin(); i != candidates.end() && suffcands > 0; ++i, --suffcands) {
127 i->expandWhileUniquePrefixExtension();
128 i->ensureSuffixFound();
129 }
130 for ( ; i != candidates.end(); ++i) {
131 i->expandWhileUniqueSuffixExtension();
132 }
133
134 // 3. Ensure minimality: ignore phrases whose subphrases are also found
135 results.clear();
136
137 // Initialise the candidates, check array, and various variables.
138 sort(candidates.begin(), candidates.end(), isShorter);
139 memset(suffixCheck, 0, sizeof(check)*inputLength);
140 //for (cellcount j = 0; j < inputLength; j++)
141 // suffixCheck[j] = 0;
142 unsigned minimum_length = candidates.begin()->length;
143
144 // Try to add each candidate to the results set, ignoring the non-minimal
145 for (vector<Phrase>::iterator candidate = candidates.begin();
146 candidate != candidates.end(); candidate++) {
147
148 // Make a copy of candidate to mutilate while performing sub-phrase checks
149 Phrase temp_phrase(*candidate);
150 bool shorter_found = false;
151
152 // Check for shorter and shorter versions of the temporary phrase
153 while (temp_phrase.length >= minimum_length && !shorter_found) {
154 temp_phrase.ensureSuffixFound();
155 if (suffixCheck[temp_phrase.firstSuffixIndex] == 0)
156 temp_phrase.shortenByOneAtPrefix();
157 else
158 shorter_found = true;
159
160 // Possible efficiency here: we can finish if the prefix of c
161 // and temp_phrase are the same for candidate->length symbols.
162 }
163
164 if (!shorter_found) {
165 results.push_back(*candidate);
166 candidate->ensureSuffixFound();
167 for (cellcount k = candidate->firstSuffixIndex; k <= candidate->lastSuffixIndex; ++k)
168 suffixCheck[k] = candidate->length;
169 }
170 }
171}
172
173
174// suffixCompare
175//
176// Compare two pointers into a suffix array. We use this in the
177// qsort function, so the inputs are pointers to pointers.
178//
179// Return -1 if (a < b); otherwise (a > b), so return +1.
180
181int suffixCompare(const void *cpa, const void *cpb) {
182
183 // Cast then dereference pointers to suffix array elements
184 symbol *pa = *(symbol **) cpa;
185 symbol *pb = *(symbol **) cpb;
188
189 // If the two elements are the same, examine the next one
190 while (*pa == *pb) {
191 pa++;
192 pb++;
193 }
194
195 // Make the comparison and return
196 if ( *pa < *pb) {
197 return -1;
198 } else {
199 return +1;
200 }
201}
202
203
204// prefixCompare
205//
206// Compare two pointers into a prefix array. We use this in the
207// qsort function, so the inputs are pointers to pointers.
208//
209// Return -1 if (a > b); otherwise (a < b), so return +1.
210
211int prefixCompare(const void *cpa, const void *cpb) {
212
213 // Cast then dereference pointers to prefix array elements
214 symbol *pa = *(symbol **) cpa;
215 symbol *pb = *(symbol **) cpb;
218
219 // If the two elements are the same, examine the next one
220 while (*pa == *pb) {
221 pa--;
222 pb--;
223 }
224
225 // Make the comparison and return
226 if ( *pa > *pb) {
227 return -1;
228 } else {
229 return +1;
230 }
231}
232
233// pointerCompare
234//
235// Compare two pointers based on the memory location they point to.
236
237int pointerCompare( const void *pa, const void *pb ) {
238
239 symbol **a = (symbol **) pa;
240 symbol **b = (symbol **) pb;
241
242 if (*a < *b) {
243 return -1;
244 } else if (*a > *b) {
245 return 1;
246 } else {
247 return 0;
248 }
249}
250
251
252// Read the clauses.numbers file into the "symbols" array.
253//
254// Each number in the file is a symbol number; it is essential that
255// the first symbol (and no others) be COLLECTIONSTART and the last
256// symbol (and no others) be COLLECTIONEND.
257//
258// Return the number of numbers in the array.
259
260int readNumbers() {
261
262 char filename[FILENAME_MAX];
263 sprintf(filename, "%s/clauses.numbers", collection);
264 if (verbosity) {
265 cout << "Reading numbers file: " << filename << endl;
266 }
267
268 // Open the numbers file
269 ifstream inFile1(filename, ios::in);
270 if (!inFile1) {
271 cerr << "File " << filename << " could not be opened\n";
272 exit(1);
273 }
274
275 // Count the number of symbols
276 inputLength = 0;
277 symbol word;
278 while (inFile1 >> word) {
279 inputLength++;
280 }
281 inFile1.close();
282
283 // Allocate the symbols array
284 if (verbosity > 1) {
285 cout << "Allocating symbol arrays for " << inputLength << " symbols" << endl;
286 }
287 symbols = new symbol[inputLength];
288 if (symbols == NULL) {
289 cerr << "Suffix error: not enough memory to hold " << inputLength
290 << " symbols." << endl;
291 exit(2);
292 }
293
294 // Read the numbers file into the numbers array
295 if (verbosity > 2) {
296 cout << "Reading the numbers" << endl;
297 }
298
299 ifstream inFile2(filename, ios::in);
300 if (!inFile2) {
301 cerr << "File " << filename << " could not be opened\n";
302 exit(1);
303 }
304 cellcount next = 0;
305 numberOfDocuments = 0;
306 while (inFile2 >> word) {
307 symbols[next++] = word;
308 if (word == DOCUMENTSTART) {
309 numberOfDocuments++;
310 }
311 }
312 inFile2.close();
313
314 // Make sure the numbers file is intact
315 assert(symbols[0] == COLLECTIONSTART);
316 assert(symbols[next-1] == COLLECTIONEND);
317
318 return inputLength;
319}
320
321
322
323// Get Document Occurrence statistics
324//
325// Given a phrase, what documents does it occur in?
326
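// Each occurrence of the phrase is a pointer into the symbols[] buffer, and
// documentArray[] holds the addresses of the DOCUMENTSTART symbols in
// increasing order, so the containing document is found by binary-searching
// for the last document start that lies before the occurrence.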
327cellcount getDocumentOccurrances(const Phrase &p, cellcount *frequency) {
328
329 // cout << "searching for \""<< p << "\" in documents "
330 // << 0 << "-" << numberOfDocuments - 1 << endl;
331
332 // The number of documents in which this phrase occurs
333 cellcount df = 0;
334
335 // Initialise the document frequency array
336 // for (cellindex i = 0; i < numberOfDocuments; i++) {
337 // frequency[i] = 0;
338 //}
339 memset(frequency, 0, sizeof(cellcount)*numberOfDocuments);
340
341 // variables used to facilitate the search
342 cellindex begin;
343 cellindex end;
344 cellindex d;
345 symbol *target;
346 bool found;
347
348 // search for the document in which each occurrence of the phrase is found
349 for (cellcount j = p.firstSuffixIndex; j <= p.lastSuffixIndex; j++) {
350
351 // cout << "looking for phrase at suffixArray[" << j << "]\n";
352
353 target = suffixArray[j];
354 begin = 0;
355 end = numberOfDocuments - 1;
356 found = false;
357
358 // Search for the occurrence of a document delimiter that target
359 // occurs immediately after.
360 // We do this by performing a binary chop search on documentArray.
361 while (!found) {
362
363 // cout << "searching for " << (cellindex) target << " in "
364 // << begin << " - " << end << endl;
365
366 assert (begin <= end);
367
368 // If the beginning and end of the interval are the same,
369 // then we've found the correct document
370 if (begin == end) {
371 if (frequency[begin] == 0) {
372 df++;
373 }
374 frequency[begin]++;
375 found = true;
376 }
377
378 // Otherwise, examine a new document midway through the begin-end
379 // interval and see if it is the one.
380 else {
381 d = (begin + end) / 2;
382 if (target > documentArray[d]) {
383 // If the target address is greater than this, but this is the last document,
384 // then this must be the one we want. Or, if target is greater than
385 // this one but less than the next, this must be the one we want.
386 if ((d == numberOfDocuments - 1) || (target < documentArray[d+1])) {
387 if (frequency[d] == 0) {
388 df++;
389 }
390 frequency[d]++;
391 found = true;
392 } else {
393 // otherwise we know to search later in the document set
394 begin = d + 1;
395 }
396 } else {
397 // search earlier in the document set
398 end = d - 1;
399 }
400 }
401 }
402 }
403 return df;
404}
405
406
407
408
409
410
411// phraseExpansionMemory : Which phrases have we expanded?
412//
413// A set of utilities for keeping track of which phrases we have expanded.
414// We don't want to expand a phrase more than once, after all.
415//
416// This REALLY ought to be in its own class, but it works so that's okay.
417//
418// Phrases are identified by their firstSuffixPosition and length.
419//
420// Functions provided are:
421// void initialisePhraseMemory()
422// void rememberThisPhrase(index, length)
423// bool isPhraseStored(index, length)
424// void deletePhraseMemory()
425//
426// Internally, we will have two separate cases:
427//
428// Phrases of length 1-8:
429// unsigned char phraseMemory[inputLength]
430// is an array where each cell "remembers" the corresponding index in the
431// suffixArray, and each of the 8 bits of the cell correspond to the phrases
432// of length 1, 2... 8.
433// Eventually, we will make this disk-based (i.e. store the array in a file).
434//
435// Phrases of length 9+:
436// file hashTableFile
437// file listOfEntries
438// The first file is a hash table; each phrase maps to one of its cells, which
439// contains either 0 (empty, no occurrence) or a number which is an entry number
440// in the second file. This file contains a "list" of entries. Each consists of
441// three numbers: the suffixArray index of the phrase, the length of the phrase,
442// and the entry number of the next phrase with the same hash.
443//
444
445
446unsigned char *phraseMemory;
447
448void initialiseLongPhraseMemory();
449void rememberThisLongPhrase(cellindex index, cellcount length);
450bool isLongPhraseStored(cellindex index, cellcount length);
451void deleteLongPhraseMemory();
452
453
454void initialisePhraseMemory() {
455
456 phraseMemory = new unsigned char[inputLength];
457
458 // to begin with, everything is empty
459 // for (cellcount i = 0; i < inputLength; i++) {
460 // phraseMemory[i] = 0;
461 //}
462 memset(phraseMemory, 0, sizeof(char)*inputLength);
463
464 // initialise the hashTable of long phrases
465 initialiseLongPhraseMemory();
466
467}
468
469void rememberThisPhrase(cellindex index, cellcount length) {
470
471 // if the phrase is very long, use the file-based system
472 if (length > 8) {
473 rememberThisLongPhrase(index, length);
474 return;
475 }
476
477 // create a char with just the bit corresponding to length set
478 unsigned char newbit = 1;
479 for (cellcount i = 1; i < length; i++) {
480 newbit <<= 1;
481 }
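  // (equivalently: newbit = (unsigned char) (1 << (length - 1)), since length <= 8 here)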
482
483 // set that bit in the memory array at position index
484 phraseMemory[index] |= newbit;
485}
486
487
488bool isPhraseStored(cellindex index, cellcount length) {
489
490 // if the phrase is very long, use the file-based system
491 if (length > 8) {
492 return isLongPhraseStored(index, length);
493 }
494
495 // create a char with just the bit corresponding to length set
496 unsigned char newbit = 1;
497 for (cellcount i = 1; i < length; i++) {
498 newbit <<= 1;
499 }
500
501 // return true if that bit is set in the memory array at position index
502 return (phraseMemory[index] & newbit);
503}
504
505void deletePhraseMemory() {
506 delete [] phraseMemory;
507 deleteLongPhraseMemory();
508}
509
510
511
512// Files etc used to store "long" equivalents of the above
513
514fstream hashTableFile;
515char hashTableFileName[FILENAME_MAX];
516fstream listOfEntries;
517char listOfEntriesName[FILENAME_MAX];
518cellindex nextEntryNumber;
519
520const cellcount bigPrime = 7919;
521
522
523void initialiseLongPhraseMemory() {
524
525 cellindex example = 0;
526
527 sprintf(hashTableFileName, "%s/hashTable", collection);
528 sprintf(listOfEntriesName, "%s/hashLists", collection);
529
530
531 // create the new hashtable
532 if (verbosity > 1) {
533 cout << "Initialising hashTable: " << hashTableFileName << endl;
534 }
535 hashTableFile.open(hashTableFileName, ios::in | ios::out);
536 for (cellcount i = 0; i < bigPrime; i++) {
537 hashTableFile.write((char *) &example, sizeof(example));
538 }
539
540 // create the list of phrases
541 if (verbosity > 1) {
542 cout << "Initialising list of hashtable entries: " << listOfEntriesName << endl;
543 }
544 listOfEntries.open(listOfEntriesName, ios::in | ios::out);
545 listOfEntries.write((char *) &example, sizeof(example));
546 listOfEntries.write((char *) &example, sizeof(example));
547 listOfEntries.write((char *) &example, sizeof(example));
548 nextEntryNumber = 1;
549}
550
551
552void rememberThisLongPhrase(cellindex index, cellcount length) {
553
554 // cout << "rememberThisLongPhrase(" << index << ", " << length << ")\n";
555
556 cellindex hashOffset = ((index + length) % bigPrime) * sizeof(cellindex);
557 cellindex pointer;
558 cellindex zero = 0;
559 cellindex readp = 0;
560 cellindex readi = 0;
561 cellindex readl = 0;
562
563 hashTableFile.seekg(hashOffset);
564 hashTableFile.read((char *) &pointer, sizeof(cellindex));
565
566 if (pointer == 0) {
567 // There is no entry at all in the hash table for this entry
568 // so create one
569
570 pointer = nextEntryNumber++;
571 hashTableFile.seekg(hashOffset);
572 hashTableFile.write((char *) &pointer, sizeof(cellindex));
573
574 listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
575 listOfEntries.write((char *) &zero, sizeof(cellindex));
576 listOfEntries.write((char *) &index, sizeof(cellindex));
577 listOfEntries.write((char *) &length, sizeof(cellindex));
578
579 } else {
580 // There is a list starting at this hash value, so the phrase may
581 // be already remembered, or it might need to be appended
582
583 while (pointer != 0) {
584 // Read the entry pointed to by pointer
585 listOfEntries.seekg(pointer * sizeof(cellindex) * 3);
586 listOfEntries.read((char *) &readp, sizeof(cellindex));
587 listOfEntries.read((char *) &readi, sizeof(cellindex));
588 listOfEntries.read((char *) &readl, sizeof(cellindex));
589
590 // cout << "read " << pointer << ", " << readp << ", " << readi << ", " << readl << endl;
591
592 if ((readi == index) && (readl == length)) {
593 // we've found that we've already stored it
594 return;
595 } else if (readp == 0) {
596 // we've reached the end of the list. Add a new entry.
597 listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
598 listOfEntries.write((char *) &nextEntryNumber, sizeof(cellindex));
599 pointer = nextEntryNumber++;
600
601 listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
602 listOfEntries.write((char *) &zero, sizeof(cellindex));
603 listOfEntries.write((char *) &index, sizeof(cellindex));
604 listOfEntries.write((char *) &length, sizeof(cellindex));
605 return;
606 } else {
607 // go on to the next node
608 pointer = readp;
609 }
610 }
611 }
612
613
614}
615
616
617bool isLongPhraseStored(cellindex index, cellcount length) {
618
619 // cout << "isLongPhraseExpanded(" << index << ", " << length << ")\n";
620
621 cellindex hashOffset = ((index + length) % bigPrime) * sizeof(cellindex);
622 cellindex pointer;
623 cellindex readp = 0;
624 cellindex readi = 0;
625 cellindex readl = 0;
626
627 // Find the phrase in the hashFile
628 hashTableFile.seekg(hashOffset);
629 hashTableFile.read((char *) &pointer, sizeof(cellindex));
630
631 if (pointer == 0) {
632 // There is no entry at all in the hash table for this entry
633 // so nothing is stored
634 return false;
635
636 } else {
637 // There is a list starting at this hash value, so the phrase may
638 // be already remembered in that list
639 while (pointer != 0) {
640 // Read the entry pointed to by pointer
641 listOfEntries.seekg(pointer * sizeof(cellindex) * 3);
642 listOfEntries.read((char *) &readp, sizeof(cellindex));
643 listOfEntries.read((char *) &readi, sizeof(cellindex));
644 listOfEntries.read((char *) &readl, sizeof(cellindex));
645
646 if ((readi == index) && (readl == length)) {
647 // we've found the phrase stored here
648 return true;
649 } else {
650 // go on to the next node
651 pointer = readp;
652 }
653 }
654 }
655 return false;
656}
657
658
659void deleteLongPhraseMemory() {
660 // remove the hash & other files
661
662 hashTableFile.close();
663 listOfEntries.close();
664 remove(hashTableFileName);
665 remove(listOfEntriesName);
666
667}
668
669
670// Read the collection statistics file
671//
672void readStatistics() {
673
674 // open the statistics file
675 char filename[FILENAME_MAX];
676 sprintf(filename, "%s/clauses.stats", collection);
677
678 // Open the file
679 ifstream inFile(filename, ios::in);
680 if (!inFile) {
681 cerr << "File " << filename << " could not be opened\n";
682 exit(1);
683 }
684
685 // Read the key-value pairs from the statistics file
686 char key[1000];
687 symbol value;
688 while (inFile >> key >> value){
689 if (strcmp(key, "first_stopword") == 0) {
690 firstStopSymbol = value;
691 } else if (strcmp(key, "last_stopword") == 0) {
692 lastStopSymbol = value;
693 } else if (strcmp(key, "first_contentword") == 0) {
694 firstContentSymbol = value;
695 } else if (strcmp(key, "last_contentword") == 0) {
696 lastContentSymbol = value;
697 }
698 }
699 inFile.close();
700
701 // Make sure we have the information we need
702 if (!(firstStopSymbol && lastStopSymbol && firstContentSymbol && lastContentSymbol)) {
703 cerr << "Statistics file incomplete" << endl;
704 exit(1);
705 }
706}
707
708cellcount getContentCount(symbol firstContent) {
709
710 cellcount content=0;
711 for (cellcount i=0; i<inputLength; i++) {
712 if (symbols[i]>=firstContent) content++;
713 }
714
715 return content;
716}
717
718int main (int argc, char * argv[]) {
719
720 // Command-line arguments
721 // argv[1] is the phindex directory
722 // argv[2] is the mode, where 1 is stopword mode
723 // argv[3] is the verbosity (optional)
724 if (argc < 3) {
725 cerr << "Usage: " << argv[0] << " phind-directory mode [verbosity]" << endl;
726 exit(1);
727 }
728
729 // collection directory
730 strcpy(collection, argv[1]);
731
732 // mode parameter
733 phraseMode = atoi(argv[2]);
734 assert((phraseMode == STOPWORDS) || (phraseMode == ANYPHRASE));
735
736 // optional verbosity parameter
737 if (argc == 4) {
738 verbosity = atoi(argv[3]);
739 assert (verbosity >= 0);
740 }
741
742 if (verbosity) {
743 cout << "suffix: the phrase extraction program" << endl;
744 }
745
746 if (verbosity > 1) {
747 if (phraseMode == STOPWORDS) {
748 cout << "Stopwords mode: no phrase may begin or end with a stopword" << endl;
749 } else {
750 cout << "AllPhrase mode: extract every phrase that occurs more than once" << endl;
751 }
752 }
753
754 // Read the statistics file
755 readStatistics();
756
757 // Read the numbers file
758 readNumbers();
759
760 if (numberOfDocuments == 0) {
761 cerr << "There are no documents in this collection!" << endl;
762 exit(1);
763 }
764
765 symbol firstContent;
766 if (phraseMode==STOPWORDS) firstContent=firstContentSymbol;
767 else firstContent = firstStopSymbol;
768
769 cellcount contentLength = 0;
770 contentLength = getContentCount(firstContent);
771
772 // Create the suffix & prefix arrays
773 suffixArray = new symbol *[contentLength];
774 prefixArray = new symbol *[contentLength];
775
776 cellcount here=0;
777 // Initialise prefix and suffix arrays, only use the needed suffixes
778 for (cellcount j = 0; j < inputLength; j++) {
779 if (symbols[j]>=firstContent) {
780 suffixArray[here] = &symbols[j];
781 prefixArray[here] = &symbols[j];
782 here++;
783 }
784 }
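  // suffixCompare orders the entries by the symbols read forwards from each
  // position; prefixCompare orders them by the symbols read backwards.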
785 qsort(suffixArray, contentLength, sizeof(symbol *), suffixCompare);
786 qsort(prefixArray, contentLength, sizeof(symbol *), prefixCompare);
787
788 suffixCheck = new check[contentLength];
789 if (suffixCheck == NULL) {
790 cerr << "Suffix error: not enough memory to hold " << contentLength << " symbols." << endl;
791 exit(2);
792 }
793 //for (cellcount j = 0; j < contentLength; j++)
794 // suffixCheck[j] = 0;
795 memset(suffixCheck, 0, sizeof(check)*contentLength);
796
797 cout <<"\ngenerating the phrase hierarchy\n\n";
798
799 // Create the document arrays
800 if (verbosity > 1) {
801 cout << "Allocating document arrays for " << numberOfDocuments << " documents" << endl;
802 }
803
804 // The document frequency array is used to count the number of times
805 // each phrase occurs in each document. The number of documents in
806 // which a phrase occurs is stored in df.
807 frequency *documentFrequency = new frequency[numberOfDocuments];
808 frequency df;
809
810 // documentArray will be searched in order to discover which document
811 // each phrase occurs in.
812 documentArray = new symbol *[numberOfDocuments];
813
814 // just scan through the input text to find the doc starts
815 cellindex d = 0;
816 for (cellindex i=0; i<inputLength; i++) {
817 if (symbols[i] == DOCUMENTSTART) {
818 documentArray[d] = &symbols[i];
819 d++;
820 }
821 }
822
823 // the phrases stuff is expecting inputLength to be the length of the
824 // suffix array, so change it.
825 inputLength = contentLength;
826
827 // Extract phrases
828 //
829 // We will make several passes over the data, in each case considering
830 // a set of input phrases and generating a set of output phrases, which
831 // we will expand in later passes.
832 //
833 // The input phrases in the first pass will be the vocabulary.
834 // In later passes, the input phrases will be the output phrases of the
835 // previous pass.
836 //
837 // In each pass we will consider each input phrase in turn. If we
838 // have seen it before, we will ignore it. Otherwise, we will expand
839 // it and add its expansions to the set of output phrases.
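//
// Concretely, pass n reads the phrases listed in <collection>/outPhrase.(n-1)
// and appends any new expansions to <collection>/outPhrase.n; the loop below
// ends when a pass generates no new output phrases (outPhraseCounter == 0).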
840
841 // Store the phrase data in the phrases file
842 char phraseDataName[FILENAME_MAX];
843 sprintf(phraseDataName, "%s/phrases", collection);
844 ofstream phraseData(phraseDataName, ios::out);
845 if (!phraseData) {
846 cout << "File " << phraseDataName << " could not be opened\n";
847 exit(1);
848 }
849
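// Each record written to the phrases file has the form
//   <suffix index>-<length>:<phrase text>:<frequency>:<number of expansions>:
//   <expansion list>:<document frequency>:<document list>
// where the expansion list is a comma-separated list of "index-length" pairs
// and the document list is a semicolon-separated list of "d<num>[,freq]"
// entries (a summary of the output statements that follow).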
850 // Count the number of phrases output
851 unsigned long int phraseCounter = 0;
852
853 // Set up the phrase expansion memory.
854 // We need this so that we don't expand a phrase more than once
855 initialisePhraseMemory();
856
857 // The current pass number
858 int phrasePass = 1;
859
860
861 // PASS NUMBER 1
862 if (verbosity > 1) {
863 cout << "Starting pass " << phrasePass << endl;
864 }
865
866 ofstream outPhrase;
867 char outPhraseName[FILENAME_MAX];
868 unsigned long int outPhraseCounter = 0;
869
870 // On the first pass, simply work through the vocabulary
871 sprintf(outPhraseName, "%s/outPhrase.1", collection);
872 outPhrase.open(outPhraseName, ios::out);
873 if (!outPhrase) {
874 cerr << "File " << outPhraseName << " could not be opened\n";
875 exit(1);
876 }
877
878 // Iterate over the different symbols by working through the suffix array
879 vector<Phrase> result;
880 cellindex ij = 0;
881 char *tmpString;
882
883 Phrase p;
884 while (ij < inputLength) {
885
886 // make a new phrase of length 1
887 p = Phrase(suffixArray[ij], 1, SUFFIX);
888 p.findFirstAndLastSuffix(ij, inputLength-1);
889
890 // We ignore this symbol if it occurs only once, if it is a delimiter,
891 // or if we are in stopwords mode and it is a stopword
892 // - in this new version, only need to check freq
893 // We could imagine a new mode/command-line option, which is like
894 // STOPWORDS but without this restriction. This would let you browse
895 // from "the" to "the AGRIS" for example, but not from "AGRIS" to
896 // "the AGRIS" (where the is a stopword and AGRIS a content word).
897 // The system used to work like this; it is easy to implement, but
898 // it explodes the size of the indexes. So: would it be useful?
899 if (p.suffixFrequency > 1) {
900 // Get minimal expansions of the phrase
901 getExpansions(p, result);
902
903 if (!result.empty()) {
904
905 // Remember that we have expanded this phrase
906 rememberThisPhrase(ij, 1);
907
908 // write the phrase text
909 phraseData << ij << "-1:" << p << ":" << p.suffixFrequency << ":"
910 << result.size() << ":";
911
912 // write the results
913 for (cellcount k = 0; k < result.size(); k++) {
914 if (k) {
915 phraseData << ",";
916 }
917 phraseData << result[k].firstSuffixIndex << "-" << result[k].length;
918 outPhrase << result[k].firstSuffixIndex << " " << result[k].length << endl;
919 outPhraseCounter++;
920 }
921 result.clear();
922
923 // Write the documents in which this phrase occurs
924 df = getDocumentOccurrances(p, documentFrequency);
925 phraseData << ":" << df << ":";
926
927 // write the documents
928 for (cellcount m = 0, first = 1; m < numberOfDocuments; m++) {
929 if (documentFrequency[m]) {
930 if (first) {
931 first = 0;
932 } else {
933 phraseData << ";";
934 }
935 // Output the document number. Note that here we've numbered the
936 // N documents from 0 to N-1, but later they'll be 1-N. Thus we
937 // add 1 to the document id when we output it.
938 phraseData << "d" << (m+1);
939 // Next, output the frequency with which the document occurs, but
940 // only if it is > 1.
941 if (documentFrequency[m] > 1) {
942 phraseData << "," << documentFrequency[m];
943 }
944 }
945 }
946
947 phraseData << endl;
948 phraseCounter++;
949
950 // feedback
951 if (verbosity) {
952 if (phraseCounter % 1000 == 0) {
953 cout << "phrase " << phraseCounter << ": "
954 << "cell " << p.firstSuffixIndex << " - " << p << endl;
955 }
956 }
957 }
958 }
959 ij = p.lastSuffixIndex + 1;
960 }
961 outPhrase.close();
962
963 // REMAINING PASSES
964 // The previous outPhrase file forms the input to each new pass
965 cellcount start, length;
966 while (outPhraseCounter > 0) {
967
968 // Start a new pass
969 phrasePass++;
970 if (verbosity) {
971 cout << "Starting pass " << phrasePass << endl;
972 }
973
974 // Open the input file
975 char inPhraseName[FILENAME_MAX];
976 sprintf(inPhraseName, "%s/outPhrase.%d", collection, phrasePass - 1);
977 ifstream inPhrase (inPhraseName, ios::in);
978 if (!inPhrase) {
979 cerr << "File " << inPhraseName << " could not be opened\n";
980 exit(1);
981 }
982
983 // Open the output file
984 sprintf(outPhraseName, "%s/outPhrase.%d", collection, phrasePass);
985 outPhrase.open(outPhraseName, ios::out);
986 if (!outPhrase) {
987 cerr << "File " << outPhraseName << " could not be opened\n";
988 exit(1);
989 }
990 outPhraseCounter = 0;
991
992 // Process each phrase
993 while(inPhrase >> start >> length) {
994
995 // Ignore the phrase if we have expanded it before
996 if (isPhraseStored(start, length))
997 continue;
998
999 // Remember that we have examined this phrase
1000 rememberThisPhrase(start, length);
1001
1002 // Find the phrase in the suffix array
1003 p = Phrase(suffixArray[start], length, SUFFIX);
1004 p.findFirstAndLastSuffix(start, inputLength-1);
1005
1006 // Ignore the phrase if it only occurs once
1007 if (p.suffixFrequency < 2)
1008 continue;
1009
1010 // Write the phrase text;
1011 phraseData << start << "-" << length << ":" << p << ":" << p.suffixFrequency << ":";
1012
1013 // Expand the phrase, if it is at most 8 words long
1014 if (length <= 8) {
1015
1016 // Get the minimal expansions for this phrase
1017 getExpansions(p, result);
1018
1019 // write the results
1020 phraseData << result.size() << ":";
1021
1022 for (cellcount i = 0; i < result.size(); i++) {
1023 if (i) {
1024 phraseData << ",";
1025 }
1026 phraseData << result[i].firstSuffixIndex << "-" << result[i].length;
1027 outPhrase << result[i].firstSuffixIndex << " " << result[i].length << endl;
1028 outPhraseCounter++;
1029 }
1030 result.clear();
1031
1032 } else {
1033 // phrase is too long to expand further
1034 phraseData << "0:";
1035 }
1036
1037 // Write the documents in which this phrase occurs
1038 df = getDocumentOccurrances(p, documentFrequency);
1039 phraseData << ":" << df << ":";
1040
1041 // write the documents
1042 for (cellcount i = 0, first = 1; i < numberOfDocuments; i++) {
1043 if (documentFrequency[i]) {
1044 if (first) {
1045 first = 0;
1046 } else {
1047 phraseData << ";";
1048 }
1049 // Output the document number. Note that here we've numbered the
1050 // N documents from 0 to N-1, but later they'll be 1-N. Thus we
1051 // add 1 to the document id when we output it.
1052 phraseData << "d" << (i+1);
1053 // Next, output the frequency with which the document occurs, but
1054 // only if it is > 1.
1055 if (documentFrequency[i] > 1) {
1056 phraseData << "," << documentFrequency[i];
1057 }
1058 }
1059 }
1060
1061 phraseData << endl;
1062 phraseCounter++;
1063
1064 // feedback
1065 if (verbosity) {
1066 if (phraseCounter % 1000 == 0) {
1067 cout << "phrase " << phraseCounter << ": "<< "start " << start
1068 << ", length " << length << " - " << p << endl;
1069 }
1070 }
1071
1072 }
1073
1074 inPhrase.close();
1075 outPhrase.close();
1076 }
1077
1078 phraseData.close();
1079 deletePhraseMemory();
1080
1081 delete [] documentFrequency;
1082 delete [] symbols;
1083 delete [] suffixArray;
1084 delete [] prefixArray;
1085 delete [] suffixCheck;
1086 delete [] documentArray;
1087
1088
1089
1090 cout << endl << "Done: " << phraseCounter << " phrases in " << phraseDataName << endl;
1091 return 0;
1092}
1093
1094
1095
1096
1097