source: main/trunk/greenstone2/build-src/src/phind/generate/suffix.cpp@ 21356

Last change on this file since 21356 was 10335, checked in by kjdon, 19 years ago

++*pa is not the same as *pa++ - reversed these erroneous changes

  • Property svn:keywords set to Author Date Id Revision
File size: 30.3 KB
Line 
1/**********************************************************************
2 *
3 * Suffix.cpp -- Extract the repeated phrases in the input with suffix
4 * and prefix arrays (cgn & gwp's simpler algorithm,
5 * and kjm's improvements).
6 *
7 * Copyright 2000 Gordon W. Paynter
8 * Copyright 2000 The New Zealand Digital Library Project
9 *
10 * A component of the Greenstone digital library software from the
11 * New Zealand Digital Library Project at the University of Waikato,
12 * New Zealand.
13 *
14 * This program is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU General Public License as
16 * published by the Free Software Foundation; either version 2 of
17 * the License, or (at your option) any later version.
18 *
19 * This program is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 * GNU General Public License for more details.
23 *
24 * You should have received a copy of the GNU General Public License
25 * along with this program; if not, write to the Free Software
26 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27 *
28 *********************************************************************/
29
30#include <assert.h>
31#include <math.h>
32#include <stdio.h>
33#include <stdlib.h>
34#include <string.h>
35
36#if defined(GSDL_USE_IOS_H)
37# include <fstream.h>
38# include <iostream.h>
39#else
40# include <fstream>
41# include <iostream>
42#endif
43
44#if defined(GSDL_USE_STL_H)
45# if defined(GSDL_USE_ALGO_H)
46# include <algo.h>
47# else
48# include <algorithm.h>
49# endif
50# include <vector.h>
51#else
52# include <algorithm>
53# include <vector>
54#endif
55
56#include "suffix.h"
57#include "phrase.h"
58#include "check.h"
59
// Global variables declared in suffix.h

// Number of symbols read from clauses.numbers.  NOTE: main() later
// resets this to the length of the suffix array (content symbols only).
cellcount inputLength;

symbol *symbols;       // the whole collection text, one symbol number per cell
symbol **suffixArray;  // pointers into symbols[], qsorted with suffixCompare
symbol **prefixArray;  // pointers into symbols[], qsorted with prefixCompare

// How many documents are in this collection?
cellcount numberOfDocuments;
symbol **documentArray;  // address of each document's DOCUMENTSTART symbol


// Do we accept any phrase, or do we eliminate those ending with stopwords ?
int phraseMode = ANYPHRASE; //STOPWORDS;
// What is the minimum phrase frequency for a phrase to be included in the hierarchy
int minOccurs = 2;

// The filestem of the collection's phindex directory
char collection[FILENAME_MAX];


// The ranges of the stopword and content-word symbols for the collection;
// filled in by readStatistics() from the clauses.stats file.
symbol firstStopSymbol = 0;
symbol lastStopSymbol = 0;
symbol firstContentSymbol = 0;
symbol lastContentSymbol = 0;


// Some useful comparison functions, defined below.
int suffixCompare(const void *, const void *);
int prefixCompare(const void *, const void *);
int pointerCompare(const void *, const void *);


// Functions for implementing "phrase memory". These let us "remember"
// each phrase that we've expanded without using too much memory.
void initialisePhraseMemory();
void rememberThisPhrase(cellindex index, cellcount length);
bool isPhraseStored(cellindex index, cellcount length);
void deletePhraseMemory();


// how much output do we want?
int verbosity = 1;
104
105
106// Get a phrase's expansions
107//
108// Get the set of "minimal", "maximal", non-unique expansions of a
109// phrase p, using the simpler algorithm that Craig and Gordon came up
110// with at Google.
111//
112// Returns a vector of Expansions.
113
114void getExpansions(Phrase &p, vector<Phrase> &results) {
115
116 // 1. Get the initial candidates
117 vector<Phrase> candidates;
118 p.initialSuffixCandidates(candidates);
119 int suffcands = candidates.size();
120 p.initialPrefixCandidates(candidates);
121
122 if (candidates.size() == 0)
123 return;
124
125 vector<Phrase>::iterator i;
126 for (i = candidates.begin(); i != candidates.end(), suffcands>0; ++i, --suffcands) {
127 i->expandWhileUniquePrefixExtension();
128 i->ensureSuffixFound();
129 }
130 for (i; i != candidates.end(); ++i) {
131 i->expandWhileUniqueSuffixExtension();
132 }
133
134 // 3. Ensure minimality: ignore phrases whose subphrases are also found
135 results.clear();
136
137 // Initialise the candidates, check array, and various variables.
138 sort(candidates.begin(), candidates.end(), isShorter);
139 unsigned minimum_length = candidates.begin()->length;
140 clearSuffixCheck();
141
142 // Try to add each candidate to the results set, ignoring the non-minimal
143 for (vector<Phrase>::iterator candidate = candidates.begin();
144 candidate != candidates.end(); ++candidate) {
145
146 // Make a copy of candidate to mutilate while performing sub-phrase checks
147 Phrase temp_phrase(*candidate);
148 bool shorter_found = false;
149
150 // Check for shorter and shorter versions of the temporary phrase
151 while (temp_phrase.length >= minimum_length && !shorter_found) {
152 temp_phrase.ensureSuffixFound();
153 if (getSuffixCheck(temp_phrase.firstSuffixIndex)==0)
154 temp_phrase.shortenByOneAtPrefix();
155 else
156 shorter_found = true;
157
158 // Possible efficiency here: we can finish if the prefix of c
159 // and temp_phrase are the same for candidate->length symbols.
160 }
161
162 // If no shorter phrase is found, use this one
163 if (!shorter_found) {
164 results.push_back(*candidate);
165 candidate->ensureSuffixFound();
166 setSuffixCheck(candidate->firstSuffixIndex, candidate->lastSuffixIndex);
167 }
168 }
169}
170
171
172// suffixCompare
173//
174// Compare two pointers into a suffix array. We use this in the
175// qsort function, so the input are pointers to pointers.
176//
177// Return -1 if (a < b), otherwise (a > b) so return +1,
178
179int suffixCompare(const void *cpa, const void *cpb) {
180
181 // Cast then dereference pointers to suffix array elements
182 symbol *pa = (symbol *) cpa;
183 symbol *pb = (symbol *) cpb;
184 pa = (symbol *) *pa;
185 pb = (symbol *) *pb;
186
187 // If the two elements are the same, examine the next one
188 while (*pa == *pb) {
189 *pa++;
190 *pb++;
191 }
192
193 // Make the copmparison and return
194 if ( *pa < *pb) {
195 return -1;
196 } else {
197 return +1;
198 }
199}
200
201
202// prefixCompare
203//
204// Compare two pointers into a prefix array. We use this in the
205// qsort function, so the input are pointers to pointers.
206//
207// Return -1 if (a > b), otherwise (a < b) so return +1,
208
209int prefixCompare(const void *cpa, const void *cpb) {
210
211 // Cast then dereference pointers to prefix array elements
212 symbol *pa = (symbol *) cpa;
213 symbol *pb = (symbol *) cpb;
214 pa = (symbol *) *pa;
215 pb = (symbol *) *pb;
216
217 // If the two elements are the same, examine the next one
218 while (*pa == *pb) {
219 *pa--;
220 *pb--;
221 }
222
223 // Make the copmparison and return
224 if ( *pa > *pb) {
225 return -1;
226 } else {
227 return +1;
228 }
229}
230
231// simpleCompare
232//
233// Compare two pointers based on the memory location they point to.
234
235int pointerCompare( const void *pa, const void *pb ) {
236
237 symbol **a = (symbol **) pa;
238 symbol **b = (symbol **) pb;
239
240 if (*a < *b) {
241 return -1;
242 } else if (*a > *b) {
243 return 1;
244 } else {
245 return 0;
246 }
247}
248
249
// Read the clauses.numbers file into the "symbols" array.
//
// Each number in the file is a symbol number; it is essential that
// the first symbol (and no others) be COLLECTIONSTART and the last
// symbol (and no others) be COLLECTIONEND.
//
// Return the number of numbers in the array.

int readNumbers() {

  char filename[FILENAME_MAX];
  sprintf(filename, "%s/clauses.numbers", collection);
  if (verbosity) {
    cout << "Reading numbers file: " << filename << endl;
  }

  // Open the numbers file
  ifstream inFile1(filename, ios::in);
  if (!inFile1) {
    cerr << "File " << filename << " could not be opened\n";
    exit(1);
  }

  // First pass: count the symbols so the array can be sized exactly
  inputLength = 0;
  symbol word;
  while (inFile1 >> word) {
    ++inputLength;
  }
  inFile1.close();

  // Allocate the symbols array
  if (verbosity > 1) {
    cout << "Allocating symbol arrays for " << inputLength << " symbols" << endl;
  }
  symbols = new symbol[inputLength];
  if (symbols == NULL) {
    cerr << "Suffix error: not enough memory to hold " << inputLength
         << " symbols." << endl;
    exit(2);
  }

  // Second pass: re-open the file and read the numbers into the
  // symbols array, counting DOCUMENTSTART markers as we go
  if (verbosity > 2) {
    cout << "Reading the numbers" << endl;
  }

  ifstream inFile2(filename, ios::in);
  if (!inFile2) {
    cerr << "File " << filename << " could not be opened\n";
    exit(1);
  }
  cellcount next = 0;
  numberOfDocuments = 0;
  while (inFile2 >> word) {
    symbols[next++] = word;
    if (word == DOCUMENTSTART) {
      ++numberOfDocuments;
    }
  }
  inFile2.close();

  // Make sure the numbers file is intact
  assert(symbols[0] == COLLECTIONSTART);
  assert(symbols[next-1] == COLLECTIONEND);

  return inputLength;
}
318
319
320
// Get Document Occurrence statistics
//
// Given a phrase, what documents does it occur in?  Fills frequency[]
// (one counter per document) and returns the document frequency, i.e.
// the number of distinct documents in which the phrase appears.

cellcount getDocumentOccurrances(const Phrase &p, cellcount *frequency) {

  // cout << "searching for \""<< p << "\" in documents "
  //      << 0 << "-" << numberOfDocuments - 1 << endl;

  // The number of documents in which this phrase occurs
  cellcount df = 0;

  // Initialise the document frequency array
  // for (cellindex i = 0; i < numberOfDocuments; ++i) {
  //  frequency[i] = 0;
  //}
  memset(frequency, 0, sizeof(cellcount)*numberOfDocuments);

  // variables used to facilitate the search
  cellindex begin;
  cellindex end;
  cellindex d;
  symbol *target;
  bool found;

  // search for the document in which each occurrence of the phrase is found;
  // each suffixArray cell in [firstSuffixIndex, lastSuffixIndex] is one
  // occurrence of the phrase
  for (cellcount j = p.firstSuffixIndex; j <= p.lastSuffixIndex; ++j) {

    // cout << "looking for phrase at suffixArray[" << j << "]\n";

    target = suffixArray[j];
    begin = 0;
    end = numberOfDocuments - 1;
    found = false;

    // Search for the occurrence of a document delimiter that target
    // occurs immediately after.
    // We do this by performing a binary chop search on documentArray.
    // (This works because documentArray holds addresses in increasing
    // order, and an occurrence belongs to document d iff its address
    // lies between documentArray[d] and documentArray[d+1].)
    while (!found) {

      // cout << "searching for " << (cellindex) target << " in "
      //      << begin << " - " << end << endl;

      assert (begin <= end);

      // If the beginning and end of the interval are the same,
      // then we've found the correct document
      if (begin == end) {
	if (frequency[begin] == 0) {
	  ++df;
	}
	++frequency[begin];
	found = true;
      }

      // Otherwise, examine a new document midway through the begin-end
      // interval and see if it is the one.
      else {
	d = (begin + end) / 2;
	if (target > documentArray[d]) {
	  // If target address is greater than this, but this is the last document,
	  // then this must be the one we want.  Or, if target is greater than
	  // this one but less than the next, this must be the one we want.
	  if ((d == numberOfDocuments - 1) || (target < documentArray[d+1])) {
	    if (frequency[d] == 0) {
	      ++df;
	    }
	    ++frequency[d];
	    found = true;
	  } else {
	    // otherwise we know to search later in the document set
	    begin = d + 1;
	  }
	} else {
	  // search earlier in the document set
	  end = d - 1;
	}
      }
    }
  }
  return df;
}
403
404
405
406
407
408
409// phraseExpansionMemory : Which phrases have we expanded?
410//
411// A set of utilities for keeping track of which phrases we have expanded.
412// We don't want to expand a phrase more than once, after all.
413//
414// This REALLY ought to be in its own class, but it works so that's okay.
415//
416// Phrases are identified by their firstSuffixPosition and length.
417//
418// Functions provided are:
419// void initialisePhraseMemory()
420// void rememberThisPhrase(index, length)
421// bool isPhraseStored(index, length)
422// void deletePhraseMemory()
423//
424// Internally, we will have two separate cases:
425//
426// Phrases of length 1-8:
427// unsigned char phraseMemory[inputLength]
428// is an array where each cell "remembers" the corresponding index in the
429// suffixArray, and each of the 8 bits of the cell correspond to the phrases
430// of length 1, 2... 8.
431// Eventually, we will make this disk-based (i.e. store the array in a file).
432//
433// Phrases of length 9+:
434// file hashTableFile
435// file listOfEntries
// The first file is a hash table; each phrase maps to one of its cells, which
// contains either 0 (empty, no occurrence) or a number which is an entry number
// in the second file. This file contains a "list" of entries. Each consists of
// three numbers: the suffixArray index of the phrase, the length of the phrase,
// and the entry number of the next phrase with the same hash.
441//
442
443
// Bitmap for phrases of length 1-8: one byte per suffix-array cell,
// where bit (length-1) is set once that phrase has been expanded.
unsigned char *phraseMemory;

// Disk-based equivalents, used for phrases of 9 or more symbols.
void initialiseLongPhraseMemory();
void rememberThisLongPhrase(cellindex index, cellcount length);
bool isLongPhraseStored(cellindex index, cellcount length);
void deleteLongPhraseMemory();
450
451
452void initialisePhraseMemory() {
453
454 phraseMemory = new unsigned char[inputLength];
455
456 // to begin with, everything is empty
457 // for (cellcount i = 0; i < inputLength; ++i) {
458 // phraseMemory[i] = 0;
459 //}
460 memset(phraseMemory, 0, sizeof(char)*inputLength);
461
462 // intialise the hashTable of long phrases
463 initialiseLongPhraseMemory();
464
465}
466
467void rememberThisPhrase(cellindex index, cellcount length) {
468
469 // if the phrase is very long, use the file-based system
470 if (length > 8) {
471 rememberThisLongPhrase(index, length);
472 return;
473 }
474
475 // create a char with just the bit corresponding to length set
476 unsigned char newbit = 1;
477 for (cellcount i = 1; i < length; ++i) {
478 newbit <<= 1;
479 }
480
481 // set that bit in the memory array at position index
482 phraseMemory[index] |= newbit;
483}
484
485
486bool isPhraseStored(cellindex index, cellcount length) {
487
488 // if the phrase is very long, use the file-based system
489 if (length > 8) {
490 return isLongPhraseStored(index, length);
491 }
492
493 // create a char with just the bit corresponding to length set
494 unsigned char newbit = 1;
495 for (cellcount i = 1; i < length; ++i) {
496 newbit <<= 1;
497 }
498
499 // retrurn true if that bit is set in memory arrayat position index
500 return (phraseMemory[index] & newbit);
501}
502
// Release the in-memory bitmap and remove the long-phrase files.
void deletePhraseMemory() {
  delete []phraseMemory;
  deleteLongPhraseMemory();
}
507
508
509
// Files etc used to store "long" equivalents of the above

fstream hashTableFile;                 // fixed-size table of list-head entry numbers
char hashTableFileName[FILENAME_MAX];
fstream listOfEntries;                 // chained records: (next, index, length)
char listOfEntriesName[FILENAME_MAX];
cellindex nextEntryNumber;             // next free record number in listOfEntries

// Number of cells in the hash table (prime, to spread the hash values)
const cellcount bigPrime = 7919;
519
520
// Create and zero the two files that implement the long-phrase store:
// the hash table (bigPrime cells of 0 = "no list here") and the entry
// list (record 0 is a dummy so that entry number 0 can mean "null").
void initialiseLongPhraseMemory() {

  // An all-zero cell used to fill both files
  cellindex example = 0;

  sprintf(hashTableFileName, "%s/hashTable", collection);
  sprintf(listOfEntriesName, "%s/hashLists", collection);


  // create the new hashtable
  // NOTE(review): ios::in | ios::out does not create a missing file on
  // conforming iostream implementations -- presumably these files
  // already exist or the (pre-standard) library in use creates them;
  // confirm before porting.
  if (verbosity > 1) {
    cout << "Initialising hashTable: " << hashTableFileName << endl;
  }
  hashTableFile.open(hashTableFileName, ios::in | ios::out);
  for (cellcount i = 0; i < bigPrime; ++i) {
    hashTableFile.write((char *) &example, sizeof(example));
  }

  // create the list of phrases
  if (verbosity > 1) {
    cout << "Initialising list of hashtable entries: " << listOfEntriesName << endl;
  }
  listOfEntries.open(listOfEntriesName, ios::in | ios::out);
  // write dummy record 0: entry numbering starts at 1
  listOfEntries.write((char *) &example, sizeof(example));
  listOfEntries.write((char *) &example, sizeof(example));
  listOfEntries.write((char *) &example, sizeof(example));
  nextEntryNumber = 1;
}
548
549
550void rememberThisLongPhrase(cellindex index, cellcount length) {
551
552 // cout << "rememberThisLongPhrase(" << index << ", " << length << ")\n";
553
554 cellindex hashOffset = ((index + length) % bigPrime) * sizeof(cellindex);
555 cellindex pointer;
556 cellindex zero = 0;
557 cellindex readp = 0;
558 cellindex readi = 0;
559 cellindex readl = 0;
560
561 hashTableFile.seekg(hashOffset);
562 hashTableFile.read((char *) &pointer, sizeof(cellindex));
563
564 if (pointer == 0) {
565 // There is no entry at all in the hash table for this entry
566 // so create one
567
568 pointer = nextEntryNumber++;
569 hashTableFile.seekg(hashOffset);
570 hashTableFile.write((char *) &pointer, sizeof(cellindex));
571
572 listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
573 listOfEntries.write((char *) &zero, sizeof(cellindex));
574 listOfEntries.write((char *) &index, sizeof(cellindex));
575 listOfEntries.write((char *) &length, sizeof(cellindex));
576
577 } else {
578 // There is a list starting at this hash value, so the phrase may
579 // be already remembered, or it might need to be appended
580
581 while (pointer != 0) {
582 // Read the entry pointed to by pointer
583 listOfEntries.seekg(pointer * sizeof(cellindex) * 3);
584 listOfEntries.read((char *) &readp, sizeof(cellindex));
585 listOfEntries.read((char *) &readi, sizeof(cellindex));
586 listOfEntries.read((char *) &readl, sizeof(cellindex));
587
588 // cout << "read " << pointer << ", " << readp << ", " << readi << ", " << readl << endl;
589
590 if ((readi == index) && (readl = length)) {
591 // we've found that we've already stored it
592 return;
593 } else if (readp == 0) {
594 // we're reached the end of the list. Add a new entry.
595 listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
596 listOfEntries.write((char *) &nextEntryNumber, sizeof(cellindex));
597 pointer = nextEntryNumber++;
598
599 listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
600 listOfEntries.write((char *) &zero, sizeof(cellindex));
601 listOfEntries.write((char *) &index, sizeof(cellindex));
602 listOfEntries.write((char *) &length, sizeof(cellindex));
603 return;
604 } else {
605 // go on to the next node
606 pointer = readp;
607 }
608 }
609 }
610
611
612}
613
614
615bool isLongPhraseStored(cellindex index, cellcount length) {
616
617 // cout << "isLongPhraseExpanded(" << index << ", " << length << ")\n";
618
619 cellindex hashOffset = ((index + length) % bigPrime) * sizeof(cellindex);
620 cellindex pointer;
621 cellindex readp = 0;
622 cellindex readi = 0;
623 cellindex readl = 0;
624
625 // Find the phrase in the hashFile
626 hashTableFile.seekg(hashOffset);
627 hashTableFile.read((char *) &pointer, sizeof(cellindex));
628
629 if (pointer == 0) {
630 // There is no entry at all in the hash table for this entry
631 // so nothing is stored
632 return false;
633
634 } else {
635 // There is a list starting at this hash value, so the phrase may
636 // be already remembered in that list
637 while (pointer != 0) {
638 // Read the entry pointed to by pointer
639 listOfEntries.seekg(pointer * sizeof(cellindex) * 3);
640 listOfEntries.read((char *) &readp, sizeof(cellindex));
641 listOfEntries.read((char *) &readi, sizeof(cellindex));
642 listOfEntries.read((char *) &readl, sizeof(cellindex));
643
644 if ((readi == index) && (readl = length)) {
645 // we've found the phrase stored here
646 return true;
647 } else {
648 // go on to the next node
649 pointer = readp;
650 }
651 }
652 }
653 return false;
654}
655
656
657void deleteLongPhraseMemory() {
658 // remove the hash & other files
659
660 hashTableFile.close();
661 listOfEntries.close();
662 remove(hashTableFileName);
663 remove(listOfEntriesName);
664
665}
666
667
668// Read the collection statistics file
669//
670void readStatistics() {
671
672 // open the statistics file
673 char filename[FILENAME_MAX];
674 sprintf(filename, "%s/clauses.stats", collection);
675
676 // Open the file
677 ifstream inFile(filename, ios::in);
678 if (!inFile) {
679 cerr << "File " << filename << " could not be opened\n";
680 exit(1);
681 }
682
683 // Read the numbers file into the numbers array
684 char key[1000];
685 symbol value;
686 while (inFile >> key >> value){
687 if (strcmp(key, "first_stopword") == 0) {
688 firstStopSymbol = value;
689 } else if (strcmp(key, "last_stopword") == 0) {
690 lastStopSymbol = value;
691 } else if (strcmp(key, "first_contentword") == 0) {
692 firstContentSymbol = value;
693 } else if (strcmp(key, "last_contentword") == 0) {
694 lastContentSymbol = value;
695 }
696 }
697 inFile.close();
698
699 // Make sure we have the information we need
700 if (!(firstStopSymbol && lastStopSymbol && firstContentSymbol && lastContentSymbol)) {
701 cerr << "Statistics file incomplete" << endl;
702 exit(1);
703 }
704}
705
706cellcount getContentCount(symbol firstContent) {
707
708 cellcount content=0;
709 for (cellcount i=0; i<inputLength; ++i) {
710 if (symbols[i]>=firstContent) ++content;
711 }
712
713 return content;
714}
715
716
int main (int argc, char * argv[]) {

  // Command-line arguments
  // argv[1] is the phindex directory
  // argv[2] is the mode, where 1 is stopword mode
  // argv[3] is the min_occurs, - minimum occurrence frequency for a phrase to be included in the hierarchy
  // argv[4] is optional verbosity
  if (argc < 4) {
    cerr << "Usage: " << argv[0] << " phind-directory mode min-phrase-freq [verbosity]" << endl;
    exit(1);
  }

  // collection directory
  strcpy(collection, argv[1]);

  // mode parameter
  phraseMode = atoi(argv[2]);
  assert((phraseMode == STOPWORDS) || (phraseMode == ANYPHRASE));

  minOccurs = atoi(argv[3]);
  assert((minOccurs > 0));

  // optional verbosity parameter
  if (argc == 5) {
    verbosity = atoi(argv[4]);
    assert (verbosity >= 0);
  }

  if (verbosity) {
    cout << "suffix: the phrase extraction program" << endl;
  }
  if (verbosity > 1) {
    if (phraseMode == STOPWORDS) {
      cout << "Stopwords mode: no phrase may begin or end with a stopword" << endl;
    } else {
      cout << "AllPhrase mode: extract every phrase that occurs more than once" << endl;
    }
  }

  // Read the statistics file
  readStatistics();
  // Read the numbers file
  readNumbers();

  if (numberOfDocuments == 0) {
    cerr << "There are no documents in this collection!" << endl;
    exit(0); // not necessarily an error...
  }

  // In stopwords mode only content words are indexed; otherwise every
  // word symbol (stopwords included) is.
  symbol firstContent;
  if (phraseMode==STOPWORDS) firstContent=firstContentSymbol;
  else firstContent = firstStopSymbol;

  // Allocate memory for the suffix & prefix arrays
  cellcount contentLength = 0;
  contentLength = getContentCount(firstContent);
  suffixArray = new symbol *[contentLength];
  prefixArray = new symbol *[contentLength];
  if (prefixArray == NULL) {
    cerr << "Suffix: not enough memory to hold " << inputLength << " symbols." << endl;
    exit(2);
  }
  allocateSuffixCheck(contentLength);
  // Initialise prefix and suffix arrays, only use the needed suffixes
  for (cellcount j = 0, here = 0; j < inputLength; ++j) {
    if (symbols[j]>=firstContent) {
      suffixArray[here] = &symbols[j];
      prefixArray[here] = &symbols[j];
      ++here;
    }
  }
  qsort(suffixArray, contentLength, sizeof(symbol *), suffixCompare);
  qsort(prefixArray, contentLength, sizeof(symbol *), prefixCompare);
  // Create the document arrays
  if (verbosity > 1) {
    cout << "Allocating document arrays for " << numberOfDocuments << " documents" << endl;
  }

  // The document frequency array is used to count the number of times
  // each phrase occurs in each document. The number of documents in
  // which a phrase occurs is stored in df.
  frequency *documentFrequency = new frequency[numberOfDocuments];
  frequency df;

  // documentArray will be searched in order to discover which document
  // each phrase occurs in.
  documentArray = new symbol *[numberOfDocuments];
  if (documentArray == NULL) {
    cerr << "Suffix: out of memory allocating document arrays." << endl;
    exit(2);
  }

  // just scan through the input text to find the doc starts
  cellindex d = 0;
  for (cellindex i=0; i<inputLength; ++i) {
    if (symbols[i] == DOCUMENTSTART) {
      documentArray[d] = &symbols[i];
      ++d;
    }
  }

  // the phrases stuff is expecting inputLength to be the length of the
  // suffix array, so change it.
  inputLength = contentLength;

  // Extract phrases
  //
  // We will make several passes over the data, in each case considering
  // a set of input phrases and generating a set of output phrases, which
  // we will expand in later passes.
  //
  // The input phrases in the first pass will be the vocabulary.
  // In later passes, the input phrases will be the output phrases of the
  // previous pass.
  //
  // In each pass we will consider each input phrase in turn. If we
  // have seen it before, we will ignore it. Otherwise, we will expand
  // it and add its expansions to the set of output phrases.

  cout <<"\ngenerating the phrase hierarchy\n\n";

  // Store the phrase data in the phrases file
  char phraseDataName[FILENAME_MAX];
  sprintf(phraseDataName, "%s/phrases", collection);
  ofstream phraseData(phraseDataName, ios::out);
  if (!phraseData) {
    cout << "File " << phraseDataName << " could not be opened\n";
    exit(1);
  }

  // Count the number of phrases output
  unsigned long int phraseCounter = 0;

  // Set up the phrase expansion memory.
  // We need this so that we don't expand a phrase more than once
  initialisePhraseMemory();

  // The current pass number
  int phrasePass = 1;


  // PASS NUMBER 1
  if (verbosity > 1) {
    cout << "Starting pass " << phrasePass << endl;
  }

  ofstream outPhrase;
  char outPhraseName[FILENAME_MAX];
  unsigned long int outPhraseCounter = 0;

  // On the first pass, simply work through the vocabulary
  sprintf(outPhraseName, "%s/outPhrase.1", collection);
  outPhrase.open(outPhraseName, ios::out);
  if (!outPhrase) {
    cerr << "File " << outPhraseName << " could not be opened\n";
    exit(1);
  }

  // Iterate over the different symbols by working through the suffix array
  vector<Phrase> result;
  cellindex ij = 0;
  char *tmpString;  // note: unused

  Phrase p;
  while (ij < inputLength) {

    // make a new phrase of length 1
    p = Phrase(suffixArray[ij], 1, SUFFIX);
    p.findFirstAndLastSuffix(ij, inputLength-1);

    // We ignore this symbol if it occurs only once, if it is a delimiter,
    // or if we are in stopwords mode and it is a stopword
    // - in this new version, only need to check freq
    // We could imagine a new mode/command-line option, which is like
    // STOPWORDS but without this restriction. This would let you browse
    // from "the" to "the AGRIS" for example, but not from "AGRIS" to
    // "the AGRIS" (where the is a stopword and AGRIS a content word).
    // The system used to work like this; it is easy to implement, but
    // it explodes the size of the indexes. So: would it be useful?
    if (p.suffixFrequency >= minOccurs) {
      // Get minimal expansions of the phrase
      getExpansions(p, result);

      if (!result.empty()) {

	// Remember that we have expanded this phrase
	rememberThisPhrase(ij, 1);

	// write the phrase text
	phraseData << ij << "-1:" << p << ":" << p.suffixFrequency << ":"
		   << result.size() << ":";

	// write the results
	for (cellcount k = 0; k < result.size(); ++k) {
	  if (k) {
	    phraseData << ",";
	  }
	  phraseData << result[k].firstSuffixIndex << "-" << result[k].length;
	  outPhrase << result[k].firstSuffixIndex << " " << result[k].length << endl;
	  ++outPhraseCounter;
	}
	result.clear();

	// Write the documents in which this phrase occurs
	df = getDocumentOccurrances(p, documentFrequency);
	phraseData << ":" << df << ":";

	// write the documents
	for (cellcount m = 0, first = 1; m < numberOfDocuments; ++m) {
	  if (documentFrequency[m]) {
	    if (first) {
	      first = 0;
	    } else {
	      phraseData << ";";
	    }
	    // Output the document number. Note that here we've numbered the
	    // N documents from 0 to N-1, but later they'll be 1-N. Thus we
	    // add 1 to the document id when we output it.
	    phraseData << "d" << (m+1);
	    // Next, output the frequency with which the document occurs, but
	    // only if it is > 1.
	    if (documentFrequency[m] > 1) {
	      phraseData << "," << documentFrequency[m];
	    }
	  }
	}

	phraseData << endl;
	++phraseCounter;

	// feedback
	if (verbosity) {
	  if (phraseCounter % 1000 == 0) {
	    cout << "phrase " << phraseCounter << ": "
		 << "cell " << p.firstSuffixIndex << " - " << p << endl;
	  }
	}
      }
    }
    // skip ahead over the identical suffixes we just processed
    ij = p.lastSuffixIndex + 1;
  }
  outPhrase.close();

  // REMAINING PASSES
  // The previous outPhrase file forms the input to each new pass
  cellcount start, length;
  while (outPhraseCounter > 0) {

    // Start a new pass
    ++phrasePass;
    if (verbosity) {
      cout << "Starting pass " << phrasePass << endl;
    }

    // Open the input file
    char inPhraseName[FILENAME_MAX];
    sprintf(inPhraseName, "%s/outPhrase.%d", collection, phrasePass - 1);
    ifstream inPhrase (inPhraseName, ios::in);
    if (!inPhrase) {
      cerr << "File " << inPhraseName << " could not be opened\n";
      exit(1);
    }

    // Open the output file
    sprintf(outPhraseName, "%s/outPhrase.%d", collection, phrasePass);
    outPhrase.open(outPhraseName, ios::out);
    if (!outPhrase) {
      cerr << "File " << outPhraseName << " could not be opened\n";
      exit(1);
    }
    outPhraseCounter = 0;

    // Process each phrase
    while(inPhrase >> start >> length) {

      // Ignore the phrase if we have expanded it before
      if (isPhraseStored(start, length))
	continue;

      // Remember that we have examined this phrase
      rememberThisPhrase(start, length);

      // Find the phrase in the suffixarray
      p = Phrase(suffixArray[start], length, SUFFIX);
      p.findFirstAndLastSuffix(start, inputLength-1);

      // Ignore the phrase if it occurs too rarely
      if (p.suffixFrequency < minOccurs)
	continue;

      // Write the phrase text;
      phraseData << start << "-" << length << ":" << p << ":" << p.suffixFrequency << ":";

      // Expand the phrase, if it is fewer than 8 words long
      if (length <= 8) {

	// Get the minimal expansions for this phrase
	getExpansions(p, result);

	// write the results
	phraseData << result.size() << ":";

	for (cellcount i = 0; i < result.size(); ++i) {
	  if (i) {
	    phraseData << ",";
	  }
	  phraseData << result[i].firstSuffixIndex << "-" << result[i].length;
	  outPhrase << result[i].firstSuffixIndex << " " << result[i].length << endl;
	  ++outPhraseCounter;
	}
	result.clear();

      } else {
	// phrase is too long to expand further
	phraseData << "0:";
      }

      // Write the documents in which this phrase occurs
      df = getDocumentOccurrances(p, documentFrequency);
      phraseData << ":" << df << ":";

      // write the documents
      for (cellcount i = 0, first = 1; i < numberOfDocuments; ++i) {
	if (documentFrequency[i]) {
	  if (first) {
	    first = 0;
	  } else {
	    phraseData << ";";
	  }
	  // Output the document number. Note that here we've numbered the
	  // N documents from 0 to N-1, but later they'll be 1-N. Thus we
	  // add 1 to the document id when we output it.
	  phraseData << "d" << (i+1);
	  // Next, output the frequency with which the document occurs, but
	  // only if it is > 1.
	  if (documentFrequency[i] > 1) {
	    phraseData << "," << documentFrequency[i];
	  }
	}
      }

      phraseData << endl;
      ++phraseCounter;

      // feedback
      if (verbosity) {
	if (phraseCounter % 1000 == 0) {
	  cout << "phrase " << phraseCounter << ": "<< "start " << start
	       << ", length " << length << " - " << p << endl;
	}
      }

    }

    inPhrase.close();
    outPhrase.close();
  }

  phraseData.close();
  deletePhraseMemory();

  delete [] documentFrequency;
  delete [] symbols;
  delete [] suffixArray;
  delete [] prefixArray;
  delete [] suffixCheck;
  delete [] documentArray;



  cout << endl << "Done: " << phraseCounter << " phrases in " << phraseDataName << endl;
  return 0;
}
1090
1091
1092
1093
1094
Note: See TracBrowser for help on using the repository browser.