source: main/trunk/greenstone2/build-src/src/phind/generate/suffix.cpp@ 26135

Last change on this file since 26135 was 26135, checked in by ak19, 12 years ago

Changing unsigned long to mg_u_long in Phind's suffix.exe program. This did not help to get suffix to work on 64 bit linux, but the changes have not broken it on Windows either (where it already worked). Therefore, since the unsigned long to mg_u_long changes bring this part of the code up to speed with the same changes made elsewhere in the C code, am still committing this.

  • Property svn:keywords set to Author Date Id Revision
File size: 30.3 KB
1/**********************************************************************
2 *
3 * Suffix.cpp -- Extract the repeated phrases in the input with suffix
4 * and prefix arrays (cgn & gwp's simpler algorithm,
5 * and kjm's improvements).
6 *
7 * Copyright 2000 Gordon W. Paynter
8 * Copyright 2000 The New Zealand Digital Library Project
9 *
10 * A component of the Greenstone digital library software from the
11 * New Zealand Digital Library Project at the University of Waikato,
12 * New Zealand.
13 *
14 * This program is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU General Public License as
16 * published by the Free Software Foundation; either version 2 of
17 * the License, or (at your option) any later version.
18 *
19 * This program is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 * GNU General Public License for more details.
23 *
24 * You should have received a copy of the GNU General Public License
25 * along with this program; if not, write to the Free Software
26 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27 *
28 *********************************************************************/
29
30#include <assert.h>
31#include <math.h>
32#include <stdio.h>
33#include <stdlib.h>
34#include <string.h>
35
36#if defined(GSDL_USE_IOS_H)
37# include <fstream.h>
38# include <iostream.h>
39#else
40# include <fstream>
41# include <iostream>
42#endif
43
44#if defined(GSDL_USE_STL_H)
45# if defined(GSDL_USE_ALGO_H)
46# include <algo.h>
47# else
48# include <algorithm.h>
49# endif
50# include <vector.h>
51#else
52# include <algorithm>
53# include <vector>
54#endif
55
56#include "suffix.h"
57#include "phrase.h"
58#include "check.h"
59#include "mglong.h"
60
61// Global variables declared in suffix.h
62cellcount inputLength;
63
64symbol *symbols;
65symbol **suffixArray;
66symbol **prefixArray;
67
68// How many documents are in this collection?
69cellcount numberOfDocuments;
70symbol **documentArray;
71
72
73// Do we accept any phrase, or do we eliminate those ending with stopwords ?
74int phraseMode = ANYPHRASE; //STOPWORDS;
75// What is the minimum phrase frequency for a phrase to be included in the hierarchy?
76int minOccurs = 2;
77
78// The filestem of the collection's phindex directory
79char collection[FILENAME_MAX];
80
81
82// The ranges of the stopword and content-word symbols for the collection
83symbol firstStopSymbol = 0;
84symbol lastStopSymbol = 0;
85symbol firstContentSymbol = 0;
86symbol lastContentSymbol = 0;
87
88
89// Some useful comparison functions, defined below.
90int suffixCompare(const void *, const void *);
91int prefixCompare(const void *, const void *);
92int pointerCompare(const void *, const void *);
93
94
95// Functions for implementing "phrase memory". These let us "remember"
96// each phrase that we've expanded without using too much memory.
97void initialisePhraseMemory();
98void rememberThisPhrase(cellindex index, cellcount length);
99bool isPhraseStored(cellindex index, cellcount length);
100void deletePhraseMemory();
101
102
103// how much output do we want?
104int verbosity = 1;
105
106
107// Get a phrase's expansions
108//
109// Get the set of "minimal", "maximal", non-unique expansions of a
110// phrase p, using the simpler algorithm that Craig and Gordon came up
111// with at Google.
112//
113// Returns a vector of Expansions.
114
115void getExpansions(Phrase &p, vector<Phrase> &results) {
116
117 // 1. Get the initial candidates
118 vector<Phrase> candidates;
119 p.initialSuffixCandidates(candidates);
120 int suffcands = candidates.size();
121 p.initialPrefixCandidates(candidates);
122
123 if (candidates.size() == 0)
124 return;
125
126 vector<Phrase>::iterator i;
 127 for (i = candidates.begin(); i != candidates.end() && suffcands > 0; ++i, --suffcands) {
128 i->expandWhileUniquePrefixExtension();
129 i->ensureSuffixFound();
130 }
 131 for ( ; i != candidates.end(); ++i) {
132 i->expandWhileUniqueSuffixExtension();
133 }
134
135 // 3. Ensure minimality: ignore phrases whose subphrases are also found
136 results.clear();
137
138 // Initialise the candidates, check array, and various variables.
139 sort(candidates.begin(), candidates.end(), isShorter);
140 unsigned minimum_length = candidates.begin()->length;
141 clearSuffixCheck();
142
143 // Try to add each candidate to the results set, ignoring the non-minimal
144 for (vector<Phrase>::iterator candidate = candidates.begin();
145 candidate != candidates.end(); ++candidate) {
146
147 // Make a copy of candidate to mutilate while performing sub-phrase checks
148 Phrase temp_phrase(*candidate);
149 bool shorter_found = false;
150
151 // Check for shorter and shorter versions of the temporary phrase
152 while (temp_phrase.length >= minimum_length && !shorter_found) {
153 temp_phrase.ensureSuffixFound();
154 if (getSuffixCheck(temp_phrase.firstSuffixIndex)==0)
155 temp_phrase.shortenByOneAtPrefix();
156 else
157 shorter_found = true;
158
 159 // Possible efficiency here: we can finish if the prefixes of candidate
 160 // and temp_phrase are the same for candidate->length symbols.
161 }
162
163 // If no shorter phrase is found, use this one
164 if (!shorter_found) {
165 results.push_back(*candidate);
166 candidate->ensureSuffixFound();
167 setSuffixCheck(candidate->firstSuffixIndex, candidate->lastSuffixIndex);
168 }
169 }
170}
171
172
173// suffixCompare
174//
175// Compare two pointers into a suffix array. We use this in the
 176// qsort function, so the inputs are pointers to pointers.
177//
 178// Return -1 if (a < b); otherwise (a > b), so return +1.
179
180int suffixCompare(const void *cpa, const void *cpb) {
181
 182 // Cast then dereference pointers to suffix array elements
 183 symbol *pa = *(symbol **) cpa;
 184 symbol *pb = *(symbol **) cpb;
187
188 // If the two elements are the same, examine the next one
189 while (*pa == *pb) {
 190 ++pa;
 191 ++pb;
192 }
193
 194 // Make the comparison and return
195 if ( *pa < *pb) {
196 return -1;
197 } else {
198 return +1;
199 }
200}
201
202
203// prefixCompare
204//
205// Compare two pointers into a prefix array. We use this in the
 206// qsort function, so the inputs are pointers to pointers.
207//
 208// Return -1 if (a > b); otherwise (a < b), so return +1.
209
210int prefixCompare(const void *cpa, const void *cpb) {
211
 212 // Cast then dereference pointers to prefix array elements
 213 symbol *pa = *(symbol **) cpa;
 214 symbol *pb = *(symbol **) cpb;
217
218 // If the two elements are the same, examine the next one
219 while (*pa == *pb) {
 220 --pa;
 221 --pb;
222 }
223
 224 // Make the comparison and return
225 if ( *pa > *pb) {
226 return -1;
227 } else {
228 return +1;
229 }
230}
231
 232// pointerCompare
233//
234// Compare two pointers based on the memory location they point to.
235
236int pointerCompare( const void *pa, const void *pb ) {
237
238 symbol **a = (symbol **) pa;
239 symbol **b = (symbol **) pb;
240
241 if (*a < *b) {
242 return -1;
243 } else if (*a > *b) {
244 return 1;
245 } else {
246 return 0;
247 }
248}
249
250
251// Read the clauses.numbers file into the "symbols" array.
252//
253// Each number in the file is a symbol number; it is essential that
254// the first symbol (and no others) be COLLECTIONSTART and the last
255// symbol (and no others) be COLLECTIONEND.
256//
257// Return the number of numbers in the array.
258
259int readNumbers() {
260
261 char filename[FILENAME_MAX];
262 sprintf(filename, "%s/clauses.numbers", collection);
263 if (verbosity) {
264 cout << "Reading numbers file: " << filename << endl;
265 }
266
267 // Open the numbers file
268 ifstream inFile1(filename, ios::in);
269 if (!inFile1) {
270 cerr << "File " << filename << " could not be opened\n";
271 exit(1);
272 }
273
274 // Count the number of symbols
275 inputLength = 0;
276 symbol word;
277 while (inFile1 >> word) {
278 ++inputLength;
279 }
280 inFile1.close();
281
 282 // Allocate the symbols array
283 if (verbosity > 1) {
284 cout << "Allocating symbol arrays for " << inputLength << " symbols" << endl;
285 }
286 symbols = new symbol[inputLength];
287 if (symbols == NULL) {
288 cerr << "Suffix error: not enough memory to hold " << inputLength
289 << " symbols." << endl;
290 exit(2);
291 }
292
293 // Read the numbers file into the numbers array
294 if (verbosity > 2) {
295 cout << "Reading the numbers" << endl;
296 }
297
298 ifstream inFile2(filename, ios::in);
299 if (!inFile2) {
300 cerr << "File " << filename << " could not be opened\n";
301 exit(1);
302 }
303 cellcount next = 0;
304 numberOfDocuments = 0;
305 while (inFile2 >> word) {
306 symbols[next++] = word;
307 if (word == DOCUMENTSTART) {
308 ++numberOfDocuments;
309 }
310 }
311 inFile2.close();
312
313 // Make sure the numbers file is intact
314 assert(symbols[0] == COLLECTIONSTART);
315 assert(symbols[next-1] == COLLECTIONEND);
316
317 return inputLength;
318}
319
320
321
 322// Get document occurrence statistics
323//
324// Given a phrase, what documents does it occur in?
325
326cellcount getDocumentOccurrances(const Phrase &p, cellcount *frequency) {
327
328 // cout << "searching for \""<< p << "\" in documents "
329 // << 0 << "-" << numberOfDocuments - 1 << endl;
330
331 // The number of documents in which this phrase occurs
332 cellcount df = 0;
333
334 // Initialise the document frequency array
335 // for (cellindex i = 0; i < numberOfDocuments; ++i) {
336 // frequency[i] = 0;
337 //}
338 memset(frequency, 0, sizeof(cellcount)*numberOfDocuments);
339
340 // variables used to facilitate the search
341 cellindex begin;
342 cellindex end;
343 cellindex d;
344 symbol *target;
345 bool found;
346
 347 // search for the document in which each occurrence of the phrase is found
348 for (cellcount j = p.firstSuffixIndex; j <= p.lastSuffixIndex; ++j) {
349
350 // cout << "looking for phrase at suffixArray[" << j << "]\n";
351
352 target = suffixArray[j];
353 begin = 0;
354 end = numberOfDocuments - 1;
355 found = false;
356
 357 // Search for the occurrence of a document delimiter that target
358 // occurs immediately after.
359 // We do this by performing a binary chop search on documentArray.
360 while (!found) {
361
362 // cout << "searching for " << (cellindex) target << " in "
363 // << begin << " - " << end << endl;
364
365 assert (begin <= end);
366
367 // If the beginning and end of the interval are the same,
368 // then we've found the correct document
369 if (begin == end) {
370 if (frequency[begin] == 0) {
371 ++df;
372 }
373 ++frequency[begin];
374 found = true;
375 }
376
377 // Otherwise, examine a new document midway through the begin-end
378 // interval and see if it is the one.
379 else {
380 d = (begin + end) / 2;
381 if (target > documentArray[d]) {
 382 // If the target address is greater than this document's start, but this is
 383 // the last document, then this must be the one we want. Or, if target is
 384 // greater than this one but less than the next, this must be the one we want.
385 if ((d == numberOfDocuments - 1) || (target < documentArray[d+1])) {
386 if (frequency[d] == 0) {
387 ++df;
388 }
389 ++frequency[d];
390 found = true;
391 } else {
392 // otherwise we know to search later in the document set
393 begin = d + 1;
394 }
395 } else {
396 // search earlier in the document set
397 end = d - 1;
398 }
399 }
400 }
401 }
402 return df;
403}
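
// Illustrative sketch only (not called by the program): the binary chop
// above amounts to finding the last entry of documentArray whose start
// address precedes target. documentArray is filled in input order in
// main(), so its addresses are ascending, and a phrase occurrence never
// points at a DOCUMENTSTART symbol itself.
static cellindex findDocumentIndexSketch(symbol *target) {
  cellindex lo = 0;
  cellindex hi = numberOfDocuments - 1;
  while (lo < hi) {
    // bias the midpoint upwards so the interval always shrinks
    cellindex mid = lo + (hi - lo + 1) / 2;
    if (documentArray[mid] <= target) {
      lo = mid;       // target lies in this document or a later one
    } else {
      hi = mid - 1;   // target lies in an earlier document
    }
  }
  return lo;
}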
404
405
406
407
408
409
410// phraseExpansionMemory : Which phrases have we expanded?
411//
412// A set of utilities for keeping track of which phrases we have expanded.
413// We don't want to expand a phrase more than once, after all.
414//
415// This REALLY ought to be in its own class, but it works so that's okay.
416//
417// Phrases are identified by their firstSuffixPosition and length.
418//
419// Functions provided are:
420// void initialisePhraseMemory()
421// void rememberThisPhrase(index, length)
422// bool isPhraseStored(index, length)
423// void deletePhraseMemory()
424//
425// Internally, we will have two separate cases:
426//
427// Phrases of length 1-8:
428// unsigned char phraseMemory[inputLength]
429// is an array where each cell "remembers" the corresponding index in the
 430// suffixArray, and each of the 8 bits of the cell corresponds to the phrases
431// of length 1, 2... 8.
432// Eventually, we will make this disk-based (i.e. store the array in a file).
433//
434// Phrases of length 9+:
435// file hashTableFile
436// file listOfEntries
437// The first file is a hash table; each phrase maps to one of its cells, which
 438// contains either 0 (empty, no occurrence) or a number which is an entry number
439// in the second file. This file contains a "list" of entries. Each consists of
440// three numbers: the suffixArray index of the phrase, the length of the phrase,
441// and the entry number of the next phrase with the same hash.
442//
443
444
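// Illustrative helper (not used by the program), restating the short-phrase
// scheme described above: a phrase of length 1..8 starting at suffix index i
// is remembered by setting one bit of the byte phraseMemory[i], so e.g.
// rememberThisPhrase(i, 3) sets bit 2 and isPhraseStored(i, 3) tests it.
static inline unsigned char shortPhraseBit(cellcount length) {
  // length 1 -> bit 0, length 2 -> bit 1, ..., length 8 -> bit 7
  return (unsigned char) (1u << (length - 1));
}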
445unsigned char *phraseMemory;
446
447void initialiseLongPhraseMemory();
448void rememberThisLongPhrase(cellindex index, cellcount length);
449bool isLongPhraseStored(cellindex index, cellcount length);
450void deleteLongPhraseMemory();
451
452
453void initialisePhraseMemory() {
454
455 phraseMemory = new unsigned char[inputLength];
456
457 // to begin with, everything is empty
458 // for (cellcount i = 0; i < inputLength; ++i) {
459 // phraseMemory[i] = 0;
460 //}
461 memset(phraseMemory, 0, sizeof(char)*inputLength);
462
 463 // initialise the hashTable of long phrases
464 initialiseLongPhraseMemory();
465
466}
467
468void rememberThisPhrase(cellindex index, cellcount length) {
469
470 // if the phrase is very long, use the file-based system
471 if (length > 8) {
472 rememberThisLongPhrase(index, length);
473 return;
474 }
475
476 // create a char with just the bit corresponding to length set
477 unsigned char newbit = 1;
478 for (cellcount i = 1; i < length; ++i) {
479 newbit <<= 1;
480 }
481
482 // set that bit in the memory array at position index
483 phraseMemory[index] |= newbit;
484}
485
486
487bool isPhraseStored(cellindex index, cellcount length) {
488
489 // if the phrase is very long, use the file-based system
490 if (length > 8) {
491 return isLongPhraseStored(index, length);
492 }
493
494 // create a char with just the bit corresponding to length set
495 unsigned char newbit = 1;
496 for (cellcount i = 1; i < length; ++i) {
497 newbit <<= 1;
498 }
499
 500 // return true if that bit is set in the memory array at position index
501 return (phraseMemory[index] & newbit);
502}
503
504void deletePhraseMemory() {
505 delete []phraseMemory;
506 deleteLongPhraseMemory();
507}
508
509
510
 511// Files etc. used to store "long" equivalents of the above
512
513fstream hashTableFile;
514char hashTableFileName[FILENAME_MAX];
515fstream listOfEntries;
516char listOfEntriesName[FILENAME_MAX];
517cellindex nextEntryNumber;
518
519const cellcount bigPrime = 7919;
520
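// Illustrative only: how a long phrase is located on disk. Each (index,
// length) pair hashes to a fixed slot in hashTableFile, and each entry in
// listOfEntries is three cellindex values (next-entry number, suffix index,
// phrase length) stored at offset entryNumber * 3 * sizeof(cellindex).
static inline cellindex longPhraseHashOffset(cellindex index, cellcount length) {
  // the same slot computation used by rememberThisLongPhrase() and
  // isLongPhraseStored() below
  return ((index + length) % bigPrime) * sizeof(cellindex);
}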
521
522void initialiseLongPhraseMemory() {
523
524 cellindex example = 0;
525
526 sprintf(hashTableFileName, "%s/hashTable", collection);
527 sprintf(listOfEntriesName, "%s/hashLists", collection);
528
529
530 // create the new hashtable
531 if (verbosity > 1) {
532 cout << "Initialising hashTable: " << hashTableFileName << endl;
533 }
534 hashTableFile.open(hashTableFileName, ios::in | ios::out);
535 for (cellcount i = 0; i < bigPrime; ++i) {
536 hashTableFile.write((char *) &example, sizeof(example));
537 }
538
539 // create the list of phrases
540 if (verbosity > 1) {
541 cout << "Initialising list of hashtable entries: " << listOfEntriesName << endl;
542 }
543 listOfEntries.open(listOfEntriesName, ios::in | ios::out);
544 listOfEntries.write((char *) &example, sizeof(example));
545 listOfEntries.write((char *) &example, sizeof(example));
546 listOfEntries.write((char *) &example, sizeof(example));
547 nextEntryNumber = 1;
548}
549
550
551void rememberThisLongPhrase(cellindex index, cellcount length) {
552
553 // cout << "rememberThisLongPhrase(" << index << ", " << length << ")\n";
554
555 cellindex hashOffset = ((index + length) % bigPrime) * sizeof(cellindex);
556 cellindex pointer;
557 cellindex zero = 0;
558 cellindex readp = 0;
559 cellindex readi = 0;
560 cellindex readl = 0;
561
562 hashTableFile.seekg(hashOffset);
563 hashTableFile.read((char *) &pointer, sizeof(cellindex));
564
565 if (pointer == 0) {
566 // There is no entry at all in the hash table for this entry
567 // so create one
568
569 pointer = nextEntryNumber++;
570 hashTableFile.seekg(hashOffset);
571 hashTableFile.write((char *) &pointer, sizeof(cellindex));
572
573 listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
574 listOfEntries.write((char *) &zero, sizeof(cellindex));
575 listOfEntries.write((char *) &index, sizeof(cellindex));
576 listOfEntries.write((char *) &length, sizeof(cellindex));
577
578 } else {
579 // There is a list starting at this hash value, so the phrase may
580 // be already remembered, or it might need to be appended
581
582 while (pointer != 0) {
583 // Read the entry pointed to by pointer
584 listOfEntries.seekg(pointer * sizeof(cellindex) * 3);
585 listOfEntries.read((char *) &readp, sizeof(cellindex));
586 listOfEntries.read((char *) &readi, sizeof(cellindex));
587 listOfEntries.read((char *) &readl, sizeof(cellindex));
588
589 // cout << "read " << pointer << ", " << readp << ", " << readi << ", " << readl << endl;
590
 591 if ((readi == index) && (readl == length)) {
592 // we've found that we've already stored it
593 return;
594 } else if (readp == 0) {
 595 // we've reached the end of the list. Add a new entry.
596 listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
597 listOfEntries.write((char *) &nextEntryNumber, sizeof(cellindex));
598 pointer = nextEntryNumber++;
599
600 listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
601 listOfEntries.write((char *) &zero, sizeof(cellindex));
602 listOfEntries.write((char *) &index, sizeof(cellindex));
603 listOfEntries.write((char *) &length, sizeof(cellindex));
604 return;
605 } else {
606 // go on to the next node
607 pointer = readp;
608 }
609 }
610 }
611
612
613}
614
615
616bool isLongPhraseStored(cellindex index, cellcount length) {
617
618 // cout << "isLongPhraseExpanded(" << index << ", " << length << ")\n";
619
620 cellindex hashOffset = ((index + length) % bigPrime) * sizeof(cellindex);
621 cellindex pointer;
622 cellindex readp = 0;
623 cellindex readi = 0;
624 cellindex readl = 0;
625
626 // Find the phrase in the hashFile
627 hashTableFile.seekg(hashOffset);
628 hashTableFile.read((char *) &pointer, sizeof(cellindex));
629
630 if (pointer == 0) {
631 // There is no entry at all in the hash table for this entry
632 // so nothing is stored
633 return false;
634
635 } else {
636 // There is a list starting at this hash value, so the phrase may
637 // be already remembered in that list
638 while (pointer != 0) {
639 // Read the entry pointed to by pointer
640 listOfEntries.seekg(pointer * sizeof(cellindex) * 3);
641 listOfEntries.read((char *) &readp, sizeof(cellindex));
642 listOfEntries.read((char *) &readi, sizeof(cellindex));
643 listOfEntries.read((char *) &readl, sizeof(cellindex));
644
 645 if ((readi == index) && (readl == length)) {
646 // we've found the phrase stored here
647 return true;
648 } else {
649 // go on to the next node
650 pointer = readp;
651 }
652 }
653 }
654 return false;
655}
656
657
658void deleteLongPhraseMemory() {
659 // remove the hash & other files
660
661 hashTableFile.close();
662 listOfEntries.close();
663 remove(hashTableFileName);
664 remove(listOfEntriesName);
665
666}
667
668
669// Read the collection statistics file
670//
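// The file is a sequence of whitespace-separated "key value" pairs; the keys
// recognised below are first_stopword, last_stopword, first_contentword and
// last_contentword, each followed by a symbol number.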
671void readStatistics() {
672
673 // open the statistics file
674 char filename[FILENAME_MAX];
675 sprintf(filename, "%s/clauses.stats", collection);
676
677 // Open the file
678 ifstream inFile(filename, ios::in);
679 if (!inFile) {
680 cerr << "File " << filename << " could not be opened\n";
681 exit(1);
682 }
683
684 // Read the numbers file into the numbers array
685 char key[1000];
686 symbol value;
687 while (inFile >> key >> value){
688 if (strcmp(key, "first_stopword") == 0) {
689 firstStopSymbol = value;
690 } else if (strcmp(key, "last_stopword") == 0) {
691 lastStopSymbol = value;
692 } else if (strcmp(key, "first_contentword") == 0) {
693 firstContentSymbol = value;
694 } else if (strcmp(key, "last_contentword") == 0) {
695 lastContentSymbol = value;
696 }
697 }
698 inFile.close();
699
700 // Make sure we have the information we need
701 if (!(firstStopSymbol && lastStopSymbol && firstContentSymbol && lastContentSymbol)) {
702 cerr << "Statistics file incomplete" << endl;
703 exit(1);
704 }
705}
706
707cellcount getContentCount(symbol firstContent) {
708
709 cellcount content=0;
710 for (cellcount i=0; i<inputLength; ++i) {
711 if (symbols[i]>=firstContent) ++content;
712 }
713
714 return content;
715}
716
717
718int main (int argc, char * argv[]) {
719
720 // Command-line arguments
721 // argv[1] is the phindex directory
722 // argv[2] is the mode, where 1 is stopword mode
 723 // argv[3] is min_occurs, the minimum occurrence frequency for a phrase to be included in the hierarchy
 724 // argv[4] is optional verbosity
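 // For example (the directory name is illustrative only):
 //   suffix /path/to/collection/phindex 1 2 1
 // runs in stopword mode with a minimum phrase frequency of 2 and verbosity 1.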
725 if (argc < 4) {
726 cerr << "Usage: " << argv[0] << " phind-directory mode min-phrase-freq [verbosity]" << endl;
727 exit(1);
728 }
729
730 // collection directory
731 strcpy(collection, argv[1]);
732
733 // mode parameter
734 phraseMode = atoi(argv[2]);
735 assert((phraseMode == STOPWORDS) || (phraseMode == ANYPHRASE));
736
737 minOccurs = atoi(argv[3]);
738 assert((minOccurs > 0));
739
740 // optional verbosity parameter
741 if (argc == 5) {
742 verbosity = atoi(argv[4]);
743 assert (verbosity >= 0);
744 }
745
746 if (verbosity) {
747 cout << "suffix: the phrase extraction program" << endl;
748 }
749 if (verbosity > 1) {
750 if (phraseMode == STOPWORDS) {
751 cout << "Stopwords mode: no phrase may begin or end with a stopword" << endl;
752 } else {
753 cout << "AllPhrase mode: extract every phrase that occurs more than once" << endl;
754 }
755 }
756
757 // Read the statistics file
758 readStatistics();
759 // Read the numbers file
760 readNumbers();
761
762 if (numberOfDocuments == 0) {
763 cerr << "There are no documents in this collection!" << endl;
764 exit(0); // not necessarily an error...
765 }
766
767 symbol firstContent;
768 if (phraseMode==STOPWORDS) firstContent=firstContentSymbol;
769 else firstContent = firstStopSymbol;
770
771 // Allocate memory for the suffix & prefix arrays
772 cellcount contentLength = 0;
773 contentLength = getContentCount(firstContent);
774 suffixArray = new symbol *[contentLength];
775 prefixArray = new symbol *[contentLength];
776 if (prefixArray == NULL) {
 777 cerr << "Suffix: not enough memory to hold " << contentLength << " symbols." << endl;
778 exit(2);
779 }
780 allocateSuffixCheck(contentLength);
781 // Initialise prefix and suffix arrays, only use the needed suffixes
782 for (cellcount j = 0, here = 0; j < inputLength; ++j) {
783 if (symbols[j]>=firstContent) {
784 suffixArray[here] = &symbols[j];
785 prefixArray[here] = &symbols[j];
786 ++here;
787 }
788 }
789 qsort(suffixArray, contentLength, sizeof(symbol *), suffixCompare);
790 qsort(prefixArray, contentLength, sizeof(symbol *), prefixCompare);
791 // Create the document arrays
792 if (verbosity > 1) {
793 cout << "Allocating document arrays for " << numberOfDocuments << " documents" << endl;
794 }
795
796 // The document frequecy array is used to count the number of times
797 // each phrase occurs in each document. The number of documents in
798 // which a phrase occurs is stored in df.
799 frequency *documentFrequency = new frequency[numberOfDocuments];
800 frequency df;
801
802 // documentArray will be searched in order to discover which document
803 // each phrase occurs in.
804 documentArray = new symbol *[numberOfDocuments];
805 if (documentArray == NULL) {
806 cerr << "Suffix: out of memory allocating document arrays." << endl;
807 exit(2);
808 }
809
810 // just scan through the input text to find the doc starts
811 cellindex d = 0;
812 for (cellindex i=0; i<inputLength; ++i) {
813 if (symbols[i] == DOCUMENTSTART) {
814 documentArray[d] = &symbols[i];
815 ++d;
816 }
817 }
818
819 // the phrases stuff is expecting inputLength to be the length of the
820 // suffix array, so change it.
821 inputLength = contentLength;
822
823 // Extract phrases
824 //
 825 // We will make several passes over the data, in each case considering
 826 // a set of input phrases and generating a set of output phrases, which
 827 // we will expand in later passes.
828 //
829 // The input phrases in the first pass will be the vocabulary.
830 // In later passes, the input phrases will be the output phrases of the
831 // previous pass.
832 //
833 // In each pass we will consider each input phrase in turn. If we
834 // have seen it before, we will ignore it. Otherwise, we will expand
835 // it and add its expansions to the set of output phrases.
836
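 // In outline (an illustrative summary of the code that follows):
 //   pass 1: for each distinct symbol in the suffix array, expand it and
 //           write its expansions to outPhrase.1
 //   pass n: read outPhrase.(n-1); for each phrase not already expanded,
 //           expand it and append its new expansions to outPhrase.n
 //   stop when a pass produces no new output phrases (outPhraseCounter == 0)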
837 cout <<"\ngenerating the phrase hierarchy\n\n";
838
839 // Store the phrase data in the phrases file
840 char phraseDataName[FILENAME_MAX];
841 sprintf(phraseDataName, "%s/phrases", collection);
842 ofstream phraseData(phraseDataName, ios::out);
843 if (!phraseData) {
844 cout << "File " << phraseDataName << " could not be opened\n";
845 exit(1);
846 }
847
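 // Each line written to the phrases file below has the form
 //   start-length:phrase text:frequency:numExpansions:s1-l1,s2-l2,...:numDocs:dN[,freq];dM[,freq];...
 // where each expansion is a firstSuffixIndex-length pair and each document
 // is identified by its 1-based number, followed by its within-document
 // frequency when that frequency is greater than 1.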
848 // Count the number of phrases output
849 mg_u_long phraseCounter = 0; //unsigned long int phraseCounter = 0;
850
851 // Set up the phrase expansion memory.
852 // We need this so that we don't expand a phrase more than once
853 initialisePhraseMemory();
854
 855 // The current pass number
856 int phrasePass = 1;
857
858
859 // PASS NUMBER 1
860 if (verbosity > 1) {
861 cout << "Starting pass " << phrasePass << endl;
862 }
863
864 ofstream outPhrase;
865 char outPhraseName[FILENAME_MAX];
866 mg_u_long outPhraseCounter = 0; // unsigned long int outPhraseCounter = 0;
867
868 // On the first pass, simply work through the vocabulary
869 sprintf(outPhraseName, "%s/outPhrase.1", collection);
870 outPhrase.open(outPhraseName, ios::out);
871 if (!outPhrase) {
872 cerr << "File " << outPhraseName << " could not be opened\n";
873 exit(1);
874 }
875
876 // Iterate over the different symbols by working through the suffix array
877 vector<Phrase> result;
878 cellindex ij = 0;
879 char *tmpString;
880
881 Phrase p;
882 while (ij < inputLength) {
883
884 // make a new phrase of length 1
885 p = Phrase(suffixArray[ij], 1, SUFFIX);
886 p.findFirstAndLastSuffix(ij, inputLength-1);
887
888 // We ignore this symbol if it occurs only once, if it is a delimiter,
 889 // or if we are in stopwords mode and it is a stopword
890 // - in this new version, only need to check freq
891 // We could imagine a new mode/command-line option, which is like
 892 // STOPWORDS but without this restriction. This would let you browse
 893 // from "the" to "the AGRIS" for example, but not from "AGRIS" to
 894 // "the AGRIS" (where "the" is a stopword and AGRIS a content word).
895 // The system used to work like this; it is easy to implement, but
896 // it explodes the size of the indexes. So: would it be useful?
897 if (p.suffixFrequency >= minOccurs) {
898 // Get minimal expansions of the phrase
899 getExpansions(p, result);
900
901 if (!result.empty()) {
902
903 // Remember that we have expanded this phrase
904 rememberThisPhrase(ij, 1);
905
906 // write the phrase text
907 phraseData << ij << "-1:" << p << ":" << p.suffixFrequency << ":"
908 << result.size() << ":";
909
910 // write the results
911 for (cellcount k = 0; k < result.size(); ++k) {
912 if (k) {
913 phraseData << ",";
914 }
915 phraseData << result[k].firstSuffixIndex << "-" << result[k].length;
916 outPhrase << result[k].firstSuffixIndex << " " << result[k].length << endl;
917 ++outPhraseCounter;
918 }
919 result.clear();
920
921 // Write the documents in which this phrase occurs
922 df = getDocumentOccurrances(p, documentFrequency);
923 phraseData << ":" << df << ":";
924
925 // write the documents
926 for (cellcount m = 0, first = 1; m < numberOfDocuments; ++m) {
927 if (documentFrequency[m]) {
928 if (first) {
929 first = 0;
930 } else {
931 phraseData << ";";
932 }
933 // Output the document number. Note that here we've numbered the
934 // N documents from 0 to N-1, but later they'll be 1-N. Thus we
935 // add 1 to the document id when we output it.
936 phraseData << "d" << (m+1);
937 // Next, output the frequency with which the document occurs, but
938 // only if it is > 1.
939 if (documentFrequency[m] > 1) {
940 phraseData << "," << documentFrequency[m];
941 }
942 }
943 }
944
945 phraseData << endl;
946 ++phraseCounter;
947
948 // feedback
949 if (verbosity) {
950 if (phraseCounter % 1000 == 0) {
951 cout << "phrase " << phraseCounter << ": "
952 << "cell " << p.firstSuffixIndex << " - " << p << endl;
953 }
954 }
955 }
956 }
957 ij = p.lastSuffixIndex + 1;
958 }
959 outPhrase.close();
960
961 // REMAINING PASSES
962 // The previous outPhrase file forms the input to each new pass
963 cellcount start, length;
964 while (outPhraseCounter > 0) {
965
966 // Start a new pass
967 ++phrasePass;
968 if (verbosity) {
969 cout << "Starting pass " << phrasePass << endl;
970 }
971
972 // Open the input file
973 char inPhraseName[FILENAME_MAX];
974 sprintf(inPhraseName, "%s/outPhrase.%d", collection, phrasePass - 1);
975 ifstream inPhrase (inPhraseName, ios::in);
976 if (!inPhrase) {
977 cerr << "File " << inPhraseName << " could not be opened\n";
978 exit(1);
979 }
980
981 // Open the output file
982 sprintf(outPhraseName, "%s/outPhrase.%d", collection, phrasePass);
983 outPhrase.open(outPhraseName, ios::out);
984 if (!outPhrase) {
985 cerr << "File " << outPhraseName << " could not be opened\n";
986 exit(1);
987 }
988 outPhraseCounter = 0;
989
990 // Process each phrase
991 while(inPhrase >> start >> length) {
992
993 // Ignore the phrase if we have expanded it before
994 if (isPhraseStored(start, length))
995 continue;
996
997 // Remember that we have examined this phrase
998 rememberThisPhrase(start, length);
999
1000 // Find the phrase in the suffixarray
1001 p = Phrase(suffixArray[start], length, SUFFIX);
1002 p.findFirstAndLastSuffix(start, inputLength-1);
1003
1004 // Ignore the phrase if it only occurs once
1005 if (p.suffixFrequency < minOccurs)
1006 continue;
1007
1008 // Write the phrase text;
1009 phraseData << start << "-" << length << ":" << p << ":" << p.suffixFrequency << ":";
1010
 1011 // Expand the phrase, if it is at most 8 words long
1012 if (length <= 8) {
1013
1014 // Get the minimal expansions for this phrase
1015 getExpansions(p, result);
1016
1017 // write the results
1018 phraseData << result.size() << ":";
1019
1020 for (cellcount i = 0; i < result.size(); ++i) {
1021 if (i) {
1022 phraseData << ",";
1023 }
1024 phraseData << result[i].firstSuffixIndex << "-" << result[i].length;
1025 outPhrase << result[i].firstSuffixIndex << " " << result[i].length << endl;
1026 ++outPhraseCounter;
1027 }
1028 result.clear();
1029
1030 } else {
1031 // phrase is too long to expand further
1032 phraseData << "0:";
1033 }
1034
1035 // Write the documents in which this phrase occurs
1036 df = getDocumentOccurrances(p, documentFrequency);
1037 phraseData << ":" << df << ":";
1038
1039 // write the documents
1040 for (cellcount i = 0, first = 1; i < numberOfDocuments; ++i) {
1041 if (documentFrequency[i]) {
1042 if (first) {
1043 first = 0;
1044 } else {
1045 phraseData << ";";
1046 }
1047 // Output the document number. Note that here we've numbered the
1048 // N documents from 0 to N-1, but later they'll be 1-N. Thus we
1049 // add 1 to the document id when we output it.
1050 phraseData << "d" << (i+1);
1051 // Next, output the frequency with which the document occurs, but
1052 // only if it is > 1.
1053 if (documentFrequency[i] > 1) {
1054 phraseData << "," << documentFrequency[i];
1055 }
1056 }
1057 }
1058
1059 phraseData << endl;
1060 ++phraseCounter;
1061
1062 // feedback
1063 if (verbosity) {
1064 if (phraseCounter % 1000 == 0) {
1065 cout << "phrase " << phraseCounter << ": "<< "start " << start
1066 << ", length " << length << " - " << p << endl;
1067 }
1068 }
1069
1070 }
1071
1072 inPhrase.close();
1073 outPhrase.close();
1074 }
1075
1076 phraseData.close();
1077 deletePhraseMemory();
1078
1079 delete [] documentFrequency;
1080 delete [] symbols;
1081 delete [] suffixArray;
1082 delete [] prefixArray;
1083 delete [] suffixCheck;
1084 delete [] documentArray;
1085
1086
1087
1088 cout << endl << "Done: " << phraseCounter << " phrases in " << phraseDataName << endl;
1089 return 0;
1090}
1091
1092
1093
1094
1095