Context Navigation

source: trunk/gsdl/src/phind/generate/suffix.cpp@ 2807

Last change on this file since 2807 was 2807, checked in by kjm18, 23 years ago
changed suffixCheck to bit array.
Property svn:keywords set to `Author Date Id Revision`
File size: 30.7 KB

Line
1	/**********************************************************************
2	*
3	* Suffix.cpp -- Extract the repeated phrases in the input with suffix
4	* and prefix arrays (cgn & gwp's simpler algorithm,
5	* and kjm's improvements).
6	*
7	* Copyright 2000 Gordon W. Paynter
8	* Copyright 2000 The New Zealand Digital Library Project
9	*
10	* A component of the Greenstone digital library software
11	* from the New Zealand Digital Library Project at the
12	* University of Waikato, New Zealand.
13	*
14	* This program is free software; you can redistribute it and/or modify
15	* it under the terms of the GNU General Public License as published by
16	* the Free Software Foundation; either version 2 of the License, or
17	* (at your option) any later version.
18	*
19	* This program is distributed in the hope that it will be useful,
20	* but WITHOUT ANY WARRANTY; without even the implied warranty of
21	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22	* GNU General Public License for more details.
23	*
24	* You should have received a copy of the GNU General Public License
25	* along with this program; if not, write to the Free Software
26	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27	*
28	*********************************************************************/
29
30	#include <assert.h>
31	#include <math.h>
32	#include <stdio.h>
33	#include <stdlib.h>
34	#include <string.h>
35
36	#if defined(GSDL_USE_IOS_H)
37	# include <fstream.h>
38	# include <iostream.h>
39	#else
40	# include <fstream>
41	# include <iostream>
42	#endif
43
44	#if defined(GSDL_USE_STL_H)
45	# if defined(GSDL_USE_ALGO_H)
46	# include <algo.h>
47	# else
48	# include <algorithm.h>
49	# endif
50	# include <vector.h>
51	#else
52	# include <algorithm>
53	# include <vector>
54	#endif
55
56	#include "suffix.h"
57	#include "phrase.h"
58
59	// Global variables declared in suffix.h
60	cellcount inputLength;
61
62	symbol *symbols;
63	symbol **suffixArray;
64	symbol **prefixArray;
65	check *suffixCheck;
66
67	// the length of the check array
68	cellcount checkLength;
69
70	// How many documents are in this collection?
71	cellcount numberOfDocuments;
72	symbol **documentArray;
73
74
75	// Do we accept any phrase, or do we eliminate those ending with stopwords ?
76	int phraseMode = ANYPHRASE; //STOPWORDS;
77
78
79	// The filestem of the collection's phindex directory
80	char collection[FILENAME_MAX];
81
82
83	// The ranges of the stopword and content-word symbols for the collection
84	symbol firstStopSymbol = 0;
85	symbol lastStopSymbol = 0;
86	symbol firstContentSymbol = 0;
87	symbol lastContentSymbol = 0;
88
89
90	// Some useful comparison functions, defined below.
91	int suffixCompare(const void , const void );
92	int prefixCompare(const void , const void );
93	int pointerCompare(const void , const void );
94
95	// some bit manipulation functions for the check arrays, defined below
96	int getSuffixCheck(cellindex suff);
97	void setSuffixCheck(cellindex suff);
98
99	// Functions for implementing "phrase memory". These let us "remember"
100	// each phrase that we've expanded without using too much memory.
101	void initialisePhraseMemory();
102	void rememberThisPhrase(cellindex index, cellcount length);
103	bool isPhraseStored(cellindex index, cellcount length);
104	void deletePhraseMemory();
105
106
107	// how much output do we want?
108	int verbosity = 1;
109
110
111	// Get a phrase's expansions
112	//
113	// Get the set of "minimal", "maximal", non-unique expansions of a
114	// phrase p, using the simpler algorithm that Craig and Gordon came up
115	// with at Google.
116	//
117	// Returns a vector of Expansions.
118
119	void getExpansions(Phrase &p, vector<Phrase> &results) {
120
121	// 1. Get the initial candidates
122	vector<Phrase> candidates;
123	p.initialSuffixCandidates(candidates);
124	int suffcands = candidates.size();
125	p.initialPrefixCandidates(candidates);
126
127	if (candidates.size() == 0)
128	return;
129
130	vector<Phrase>::iterator i;
131	for (i = candidates.begin(); i != candidates.end(), suffcands>0; ++i, --suffcands) {
132	i->expandWhileUniquePrefixExtension();
133	i->ensureSuffixFound();
134	}
135	for (i; i != candidates.end(); ++i) {
136	i->expandWhileUniqueSuffixExtension();
137	}
138
139	// 3. Ensure minimality: ignore phrases whose subphrases are also found
140	results.clear();
141
142	// Initialise the candidates, check array, and various variables.
143	sort(candidates.begin(), candidates.end(), isShorter);
144	memset(suffixCheck, 0, sizeof(check)*checkLength);
145	unsigned minimum_length = candidates.begin()->length;
146
147	// Try to add each candidate to the results set, ignoring the non-minimal
148	for (vector<Phrase>::iterator candidate = candidates.begin();
149	candidate != candidates.end(); candidate++) {
150
151	// Make a copy of candidate to mutilate while performing sub-phrase checks
152	Phrase temp_phrase(*candidate);
153	bool shorter_found = false;
154
155	// Check for shorter and shorter versions of the tenporary phrase
156	while (temp_phrase.length >= minimum_length && !shorter_found) {
157	temp_phrase.ensureSuffixFound();
158	//if (suffixCheck[temp_phrase.firstSuffixIndex] == 0)
159	if (getSuffixCheck(temp_phrase.firstSuffixIndex)==0)
160	temp_phrase.shortenByOneAtPrefix();
161	else
162	shorter_found = true;
163
164	// Possible efficiency here: we can finish if the prefix of c
165	// and temp_phrase are the same for candidate->length symbols.
166	}
167
168	if (!shorter_found) {
169	results.push_back(*candidate);
170	candidate->ensureSuffixFound();
171	for (cellcount k = candidate->firstSuffixIndex; k <= candidate->lastSuffixIndex; ++k)
172	//suffixCheck[k] = candidate->length;
173	setSuffixCheck(k);
174	}
175	}
176	}
177
178
179	// suffixCompare
180	//
181	// Compare two pointers into a suffix array. We use this in the
182	// qsort function, so the input are pointers to pointers.
183	//
184	// Return -1 if (a < b), otherwise (a > b) so return +1,
185
186	int suffixCompare(const void cpa, const void cpb) {
187
188	// Cast then dereference pointers to suffix array elements
189	symbol pa = (symbol ) cpa;
190	symbol pb = (symbol ) cpb;
191	pa = (symbol ) pa;
192	pb = (symbol ) pb;
193
194	// If the two elements are the same, examine the next one
195	while (pa == pb) {
196	*pa++;
197	*pb++;
198	}
199
200	// Make the copmparison and return
201	if ( pa < pb) {
202	return -1;
203	} else {
204	return +1;
205	}
206	}
207
208
209	// prefixCompare
210	//
211	// Compare two pointers into a prefix array. We use this in the
212	// qsort function, so the input are pointers to pointers.
213	//
214	// Return -1 if (a > b), otherwise (a < b) so return +1,
215
216	int prefixCompare(const void cpa, const void cpb) {
217
218	// Cast then dereference pointers to prefix array elements
219	symbol pa = (symbol ) cpa;
220	symbol pb = (symbol ) cpb;
221	pa = (symbol ) pa;
222	pb = (symbol ) pb;
223
224	// If the two elements are the same, examine the next one
225	while (pa == pb) {
226	*pa--;
227	*pb--;
228	}
229
230	// Make the copmparison and return
231	if ( pa > pb) {
232	return -1;
233	} else {
234	return +1;
235	}
236	}
237
238	// simpleCompare
239	//
240	// Compare two pointers based on the memory location they point to.
241
242	int pointerCompare( const void pa, const void pb ) {
243
244	symbol a = (symbol ) pa;
245	symbol b = (symbol ) pb;
246
247	if (a < b) {
248	return -1;
249	} else if (a > b) {
250	return 1;
251	} else {
252	return 0;
253	}
254	}
255
256	int getSuffixCheck(cellindex suff) {
257	cellindex cell = suff >> 3;
258	check remainder = suff & 0x07; // the last 3 bits
259	if (suffixCheck[cell]& (1 << remainder)) {
260	return 1;
261	}
262	return 0;
263	}
264	void setSuffixCheck(cellindex suff) {
265	cellindex cell = suff >> 3;
266	check remainder = suff & 0x07; // the last 3 bits
267	suffixCheck[cell] \|= (1 << remainder);
268
269	}
270
271	// Read the clauses.numbers file into the "symbols" array.
272	//
273	// Each number in the file is a symbol number; it is essential that
274	// the first symbol (and no others) be COLLECTIONSTART and the last
275	// symbol (and no others) be COLLECTIONEND.
276	//
277	// Return the number of numbers in the array.
278
279	int readNumbers() {
280
281	char filename[FILENAME_MAX];
282	sprintf(filename, "%s/clauses.numbers", collection);
283	if (verbosity) {
284	cout << "Reading numbers file: " << filename << endl;
285	}
286
287	// Open the numbers file
288	ifstream inFile1(filename, ios::in);
289	if (!inFile1) {
290	cerr << "File " << filename << " could not be opened\n";
291	exit(1);
292	}
293
294	// Count the number of symbols
295	inputLength = 0;
296	symbol word;
297	while (inFile1 >> word) {
298	inputLength++;
299	}
300	inFile1.close();
301
302	// Allocate the symbbols array
303	if (verbosity > 1) {
304	cout << "Allocating symbol arrays for " << inputLength << " symbols" << endl;
305	}
306	symbols = new symbol[inputLength];
307	if (symbols == NULL) {
308	cerr << "Suffix error: not enough memory to hold " << inputLength
309	<< " symbols." << endl;
310	exit(2);
311	}
312
313	// Read the numbers file into the numbers array
314	if (verbosity > 2) {
315	cout << "Reading the numbers" << endl;
316	}
317
318	ifstream inFile2(filename, ios::in);
319	if (!inFile2) {
320	cerr << "File " << filename << " could not be opened\n";
321	exit(1);
322	}
323	cellcount next = 0;
324	numberOfDocuments = 0;
325	while (inFile2 >> word) {
326	symbols[next++] = word;
327	if (word == DOCUMENTSTART) {
328	numberOfDocuments++;
329	}
330	}
331	inFile2.close();
332
333	// Make sure the numbers file is intact
334	assert(symbols[0] == COLLECTIONSTART);
335	assert(symbols[next-1] == COLLECTIONEND);
336
337	return inputLength;
338	}
339
340
341
342	// Get Document Occurrance statistics
343	//
344	// Given a phrase, what documents does it occur in?
345
346	cellcount getDocumentOccurrances(const Phrase &p, cellcount *frequency) {
347
348	// cout << "searching for \""<< p << "\" in documents "
349	// << 0 << "-" << numberOfDocuments - 1 << endl;
350
351	// The number of documents in which this phrase occurs
352	cellcount df = 0;
353
354	// Initialise the document frequency array
355	// for (cellindex i = 0; i < numberOfDocuments; i++) {
356	// frequency[i] = 0;
357	//}
358	memset(frequency, 0, sizeof(cellcount)*numberOfDocuments);
359
360	// variables used to facilitate the search
361	cellindex begin;
362	cellindex end;
363	cellindex d;
364	symbol *target;
365	bool found;
366
367	// search for the document in which each occurence of the phrase is found
368	for (cellcount j = p.firstSuffixIndex; j <= p.lastSuffixIndex; j++) {
369
370	// cout << "looking for phrase at suffixArray[" << j << "]\n";
371
372	target = suffixArray[j];
373	begin = 0;
374	end = numberOfDocuments - 1;
375	found = false;
376
377	// Search for the occurence of a document delimiter that target
378	// occurs immediately after.
379	// We do this by performing a binary chop search on documentArray.
380	while (!found) {
381
382	// cout << "searching for " << (cellindex) target << " in "
383	// << begin << " - " << end << endl;
384
385	assert (begin <= end);
386
387	// If the beginning and end of the interval are the same,
388	// then we've found the correct document
389	if (begin == end) {
390	if (frequency[begin] == 0) {
391	df++;
392	}
393	frequency[begin]++;
394	found = true;
395	}
396
397	// Otherwise, examine a new document midway through the begin-end
398	// interval and see if it is the one.
399	else {
400	d = (begin + end) / 2;
401	if (target > documentArray[d]) {
402	// If target addrss is greater than this, but thisi sthe last document,
403	// then this must be the one we want. Or, if target is greater than
404	// this one but less then the next, this must be the one we wnat.
405	if ((d == numberOfDocuments - 1) \|\| (target < documentArray[d+1])) {
406	if (frequency[d] == 0) {
407	df++;
408	}
409	frequency[d]++;
410	found = true;
411	} else {
412	// otherwise we know to search later in the document set
413	begin = d + 1;
414	}
415	} else {
416	// search earlier in the document set
417	end = d - 1;
418	}
419	}
420	}
421	}
422	return df;
423	}
424
425
426
427
428
429
430	// phraseExpansionMemory : Which phrases have we expanded?
431	//
432	// A set of utilities for keeping track of which phrases we have expanded.
433	// We don't want to expand a phrase more than once, after all.
434	//
435	// This REALLY ought to be in its own class, but it works so that's okay.
436	//
437	// Phrases are identified by their firstSuffixPosition and length.
438	//
439	// Functions provided are:
440	// void initialisePhraseMemory()
441	// void rememberThisPhrase(index, length)
442	// bool isPhraseStored(index, length)
443	// void deletePhraseMemory()
444	//
445	// Internally, we will have two separate cases:
446	//
447	// Phrases of length 1-8:
448	// unsigned char phraseMemory[inputLength]
449	// is an array where each cell "remembers" the corresponding index in the
450	// suffixArray, and each of the 8 bits of the cell correspond to the phrases
451	// of length 1, 2... 8.
452	// Eventually, we will make this disk-based (i.e. store the array in a file).
453	//
454	// Phrases of length 9+:
455	// file hashTableFile
456	// file listOfEntries
457	// The first file is a hash table; each phrase maps to one of its cells, which
458	// contains either 0 (empty, no occurence) or a number which is an entry number
459	// in the second file. This file contains a "list" of entries. Each consists of
460	// three numbers: the suffixArray index of the phrase, the length of the phrase,
461	// and the entry number of the next phrase with the same hash.
462	//
463
464
465	unsigned char *phraseMemory;
466
467	void initialiseLongPhraseMemory();
468	void rememberThisLongPhrase(cellindex index, cellcount length);
469	bool isLongPhraseStored(cellindex index, cellcount length);
470	void deleteLongPhraseMemory();
471
472
473	void initialisePhraseMemory() {
474
475	phraseMemory = new unsigned char[inputLength];
476
477	// to begin with, everything is empty
478	// for (cellcount i = 0; i < inputLength; i++) {
479	// phraseMemory[i] = 0;
480	//}
481	memset(phraseMemory, 0, sizeof(char)*inputLength);
482
483	// intialise the hashTable of long phrases
484	initialiseLongPhraseMemory();
485
486	}
487
488	void rememberThisPhrase(cellindex index, cellcount length) {
489
490	// if the phrase is very long, use the file-based system
491	if (length > 8) {
492	rememberThisLongPhrase(index, length);
493	return;
494	}
495
496	// create a char with just the bit corresponding to length set
497	unsigned char newbit = 1;
498	for (cellcount i = 1; i < length; i++) {
499	newbit <<= 1;
500	}
501
502	// set that bit in the memory array at position index
503	phraseMemory[index] \|= newbit;
504	}
505
506
507	bool isPhraseStored(cellindex index, cellcount length) {
508
509	// if the phrase is very long, use the file-based system
510	if (length > 8) {
511	return isLongPhraseStored(index, length);
512	}
513
514	// create a char with just the bit corresponding to length set
515	unsigned char newbit = 1;
516	for (cellcount i = 1; i < length; i++) {
517	newbit <<= 1;
518	}
519
520	// retrurn true if that bit is set in memory arrayat position index
521	return (phraseMemory[index] & newbit);
522	}
523
524	void deletePhraseMemory() {
525	delete phraseMemory;
526	deleteLongPhraseMemory();
527	}
528
529
530
531	// Files etc used to store "long" equavlents of the above
532
533	fstream hashTableFile;
534	char hashTableFileName[FILENAME_MAX];
535	fstream listOfEntries;
536	char listOfEntriesName[FILENAME_MAX];
537	cellindex nextEntryNumber;
538
539	const cellcount bigPrime = 7919;
540
541
542	void initialiseLongPhraseMemory() {
543
544	cellindex example = 0;
545
546	sprintf(hashTableFileName, "%s/hashTable", collection);
547	sprintf(listOfEntriesName, "%s/hashLists", collection);
548
549
550	// create the new hashtable
551	if (verbosity > 1) {
552	cout << "Initialising hashTable: " << hashTableFileName << endl;
553	}
554	hashTableFile.open(hashTableFileName, ios::in \| ios::out);
555	for (cellcount i = 0; i < bigPrime; i++) {
556	hashTableFile.write((char *) &example, sizeof(example));
557	}
558
559	// create the list of phrases
560	if (verbosity > 1) {
561	cout << "Initialising list of hashtable entries: " << listOfEntriesName << endl;
562	}
563	listOfEntries.open(listOfEntriesName, ios::in \| ios::out);
564	listOfEntries.write((char *) &example, sizeof(example));
565	listOfEntries.write((char *) &example, sizeof(example));
566	listOfEntries.write((char *) &example, sizeof(example));
567	nextEntryNumber = 1;
568	}
569
570
571	void rememberThisLongPhrase(cellindex index, cellcount length) {
572
573	// cout << "rememberThisLongPhrase(" << index << ", " << length << ")\n";
574
575	cellindex hashOffset = ((index + length) % bigPrime) * sizeof(cellindex);
576	cellindex pointer;
577	cellindex zero = 0;
578	cellindex readp = 0;
579	cellindex readi = 0;
580	cellindex readl = 0;
581
582	hashTableFile.seekg(hashOffset);
583	hashTableFile.read((char *) &pointer, sizeof(cellindex));
584
585	if (pointer == 0) {
586	// There is no entry at all in the hash table for this entry
587	// so create one
588
589	pointer = nextEntryNumber++;
590	hashTableFile.seekg(hashOffset);
591	hashTableFile.write((char *) &pointer, sizeof(cellindex));
592
593	listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
594	listOfEntries.write((char *) &zero, sizeof(cellindex));
595	listOfEntries.write((char *) &index, sizeof(cellindex));
596	listOfEntries.write((char *) &length, sizeof(cellindex));
597
598	} else {
599	// There is a list starting at this hash value, so the phrase may
600	// be already remembered, or it might need to be appended
601
602	while (pointer != 0) {
603	// Read the entry pointed to by pointer
604	listOfEntries.seekg(pointer * sizeof(cellindex) * 3);
605	listOfEntries.read((char *) &readp, sizeof(cellindex));
606	listOfEntries.read((char *) &readi, sizeof(cellindex));
607	listOfEntries.read((char *) &readl, sizeof(cellindex));
608
609	// cout << "read " << pointer << ", " << readp << ", " << readi << ", " << readl << endl;
610
611	if ((readi == index) && (readl = length)) {
612	// we've found that we've already stored it
613	return;
614	} else if (readp == 0) {
615	// we're reached the end of the list. Add a new entry.
616	listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
617	listOfEntries.write((char *) &nextEntryNumber, sizeof(cellindex));
618	pointer = nextEntryNumber++;
619
620	listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
621	listOfEntries.write((char *) &zero, sizeof(cellindex));
622	listOfEntries.write((char *) &index, sizeof(cellindex));
623	listOfEntries.write((char *) &length, sizeof(cellindex));
624	return;
625	} else {
626	// go on to the next node
627	pointer = readp;
628	}
629	}
630	}
631
632
633	}
634
635
636	bool isLongPhraseStored(cellindex index, cellcount length) {
637
638	// cout << "isLongPhraseExpanded(" << index << ", " << length << ")\n";
639
640	cellindex hashOffset = ((index + length) % bigPrime) * sizeof(cellindex);
641	cellindex pointer;
642	cellindex readp = 0;
643	cellindex readi = 0;
644	cellindex readl = 0;
645
646	// Find the phrase in the hashFile
647	hashTableFile.seekg(hashOffset);
648	hashTableFile.read((char *) &pointer, sizeof(cellindex));
649
650	if (pointer == 0) {
651	// There is no entry at all in the hash table for this entry
652	// so nothing is stored
653	return false;
654
655	} else {
656	// There is a list starting at this hash value, so the phrase may
657	// be already remembered in that list
658	while (pointer != 0) {
659	// Read the entry pointed to by pointer
660	listOfEntries.seekg(pointer * sizeof(cellindex) * 3);
661	listOfEntries.read((char *) &readp, sizeof(cellindex));
662	listOfEntries.read((char *) &readi, sizeof(cellindex));
663	listOfEntries.read((char *) &readl, sizeof(cellindex));
664
665	if ((readi == index) && (readl = length)) {
666	// we've found the phrase stored here
667	return true;
668	} else {
669	// go on to the next node
670	pointer = readp;
671	}
672	}
673	}
674	return false;
675	}
676
677
678	void deleteLongPhraseMemory() {
679	// remove the hash & other files
680
681	hashTableFile.close();
682	listOfEntries.close();
683	remove(hashTableFileName);
684	remove(listOfEntriesName);
685
686	}
687
688
689	// Read the collection statistics file
690	//
691	void readStatistics() {
692
693	// open the statistics file
694	char filename[FILENAME_MAX];
695	sprintf(filename, "%s/clauses.stats", collection);
696
697	// Open the file
698	ifstream inFile(filename, ios::in);
699	if (!inFile) {
700	cerr << "File " << filename << " could not be opened\n";
701	exit(1);
702	}
703
704	// Read the numbers file into the numbers array
705	char key[1000];
706	symbol value;
707	while (inFile >> key >> value){
708	if (strcmp(key, "first_stopword") == 0) {
709	firstStopSymbol = value;
710	} else if (strcmp(key, "last_stopword") == 0) {
711	lastStopSymbol = value;
712	} else if (strcmp(key, "first_contentword") == 0) {
713	firstContentSymbol = value;
714	} else if (strcmp(key, "last_contentword") == 0) {
715	lastContentSymbol = value;
716	}
717	}
718	inFile.close();
719
720	// Make sure we have the information we need
721	if (!(firstStopSymbol && lastStopSymbol && firstContentSymbol && lastContentSymbol)) {
722	cerr << "Statistics file incomplete" << endl;
723	exit(1);
724	}
725	}
726
727	cellcount getContentCount(symbol firstContent) {
728
729	cellcount content=0;
730	for (cellcount i=0; i<inputLength; i++) {
731	if (symbols[i]>=firstContent) content++;
732	}
733
734	return content;
735	}
736
737	int main (int argc, char * argv[]) {
738
739	// Command-line arguments
740	// argv[1] is the phindex directory
741	// argv[2] is the maximum array symbol length (optional)
742	// argv[3] is the mode, where 1 is stopword mode (optional)
743	if (argc < 2) {
744	cerr << "Usage: " << argv[0] << " phind-directory mode [verbosity]" << endl;
745	exit(1);
746	}
747
748	// collection directory
749	strcpy(collection, argv[1]);
750
751	// mode parameter
752	phraseMode = atoi(argv[2]);
753	assert((phraseMode == STOPWORDS) \|\| (phraseMode == ANYPHRASE));
754
755	// optional verbosity parameter
756	if (argc == 4) {
757	verbosity = atoi(argv[3]);
758	assert (verbosity >= 0);
759	}
760
761	if (verbosity) {
762	cout << "suffix: the phrase extraction program" << endl;
763	}
764
765	if (verbosity > 1) {
766	if (phraseMode == STOPWORDS) {
767	cout << "Stopwords mode: no phrase may begin or end with a stopword" << endl;
768	} else {
769	cout << "AllPhrase mode: extract every phrase that occurs more than once" << endl;
770	}
771	}
772
773	// Read the statistics file
774	readStatistics();
775
776	// Read the numbers file
777	readNumbers();
778
779	if (numberOfDocuments == 0) {
780	cerr << "There are no documents in this collection!" << endl;
781	exit(1);
782	}
783
784	symbol firstContent;
785	if (phraseMode==STOPWORDS) firstContent=firstContentSymbol;
786	else firstContent = firstStopSymbol;
787
788	cellcount contentLength = 0;
789	contentLength = getContentCount(firstContent);
790
791	// Create the suffix & prefix arrays
792	suffixArray = new symbol *[contentLength];
793	prefixArray = new symbol *[contentLength];
794
795	cellcount here=0;
796	// Initialise prefix and suffix arrays, only use the needed suffixes
797	for (cellcount j = 0; j < inputLength; j++) {
798	if (symbols[j]>=firstContent) {
799	suffixArray[here] = &symbols[j];
800	prefixArray[here] = &symbols[j];
801	here++;
802	}
803	}
804	qsort(suffixArray, contentLength, sizeof(symbol *), suffixCompare);
805	qsort(prefixArray, contentLength, sizeof(symbol *), prefixCompare);
806
807	checkLength = contentLength/8 + 1;
808	suffixCheck = new check[checkLength];
809	if (suffixCheck == NULL) {
810	cerr << "Suffix error: not enough memory to hold " << inputLength << " symbols." << endl;
811	exit(2);
812	}
813	memset(suffixCheck, 0, sizeof(check)*checkLength);
814
815	cout <<"\ngenerating the phrase hierarchy\n\n";
816
817	// Create the document arrays
818	if (verbosity > 1) {
819	cout << "Allocating document arrays for " << numberOfDocuments << " documents" << endl;
820	}
821
822	// The document frequecy array is used to count the number of times
823	// each phrase occurs in each document. The number of documents in
824	// which a phrase occurs is stored in df.
825	frequency *documentFrequency = new frequency[numberOfDocuments];
826	frequency df;
827
828	// documentArray will be searched in order to discover which document
829	// each phrase occurs in.
830	documentArray = new symbol *[numberOfDocuments];
831
832	// just scan through the input text to find the doc starts
833	cellindex d = 0;
834	for (cellindex i=0; i<inputLength; i++) {
835	if (symbols[i] == DOCUMENTSTART) {
836	documentArray[d] = &symbols[i];
837	d++;
838	}
839	}
840
841	// the phrases stuff is expecting inputLength to be the length of the
842	// suffix array, so change it.
843	inputLength = contentLength;
844
845	// Extract phrases
846	//
847	// We will make several passesover the data, in each case considering
848	// a set of input phrases and generating a set of output phrases, which
849	// we will expancd in later passes.
850	//
851	// The input phrases in the first pass will be the vocabulary.
852	// In later passes, the input phrases will be the output phrases of the
853	// previous pass.
854	//
855	// In each pass we will consider each input phrase in turn. If we
856	// have seen it before, we will ignore it. Otherwise, we will expand
857	// it and add its expansions to the set of output phrases.
858
859	// Store the phrase data in the phrases file
860	char phraseDataName[FILENAME_MAX];
861	sprintf(phraseDataName, "%s/phrases", collection);
862	ofstream phraseData(phraseDataName, ios::out);
863	if (!phraseData) {
864	cout << "File " << phraseDataName << " could not be opened\n";
865	exit(1);
866	}
867
868	// Count the number of phrases output
869	unsigned long int phraseCounter = 0;
870
871	// Set up the phrase expansion memory.
872	// We need this so that we don't expand a phrase more than once
873	initialisePhraseMemory();
874
875	// The current pass numebr
876	int phrasePass = 1;
877
878
879	// PASS NUMBER 1
880	if (verbosity > 1) {
881	cout << "Starting pass " << phrasePass << endl;
882	}
883
884	ofstream outPhrase;
885	char outPhraseName[FILENAME_MAX];
886	unsigned long int outPhraseCounter = 0;
887
888	// On the first pass, simply work through the vocabulary
889	sprintf(outPhraseName, "%s/outPhrase.1", collection);
890	outPhrase.open(outPhraseName, ios::out);
891	if (!outPhrase) {
892	cerr << "File " << outPhraseName << " could not be opened\n";
893	exit(1);
894	}
895
896	// Iterate over the different symbols by working through the suffix array
897	vector<Phrase> result;
898	cellindex ij = 0;
899	char *tmpString;
900
901	Phrase p;
902	while (ij < inputLength) {
903
904	// make a new phrase of length 1
905	p = Phrase(suffixArray[ij], 1, SUFFIX);
906	p.findFirstAndLastSuffix(ij, inputLength-1);
907
908	// We ignore this symbol if it occurs only once, if it is a delimiter,
909	// of if we are in stopwords mode and it is a stopword
910	// - in this new version, only need to check freq
911	// We could imagine a new mode/command-line option, which is like
912	// STOPWORDS but without this restrictrion. This would let you browse
913	// from "the" to "the AGRIS" for example, but not from "AGRIS" to
914	// "the AGRIS" (where the is a stopword and AGRIS a content word).
915	// The system used to work like this; it is easy to implement, but
916	// it explodes the size of the indexes. So: would it be useful?
917	if (p.suffixFrequency > 1) {
918	// Get minimal expansions of the phrase
919	getExpansions(p, result);
920
921	if (!result.empty()) {
922
923	// Remember that we have expanded this phrase
924	rememberThisPhrase(ij, 1);
925
926	// write the phrase text
927	phraseData << ij << "-1:" << p << ":" << p.suffixFrequency << ":"
928	<< result.size() << ":";
929
930	// write the results
931	for (cellcount k = 0; k < result.size(); k++) {
932	if (k) {
933	phraseData << ",";
934	}
935	phraseData << result[k].firstSuffixIndex << "-" << result[k].length;
936	outPhrase << result[k].firstSuffixIndex << " " << result[k].length << endl;
937	outPhraseCounter++;
938	}
939	result.clear();
940
941	// Write the documents in which this phrase occurs
942	df = getDocumentOccurrances(p, documentFrequency);
943	phraseData << ":" << df << ":";
944
945	// write the documents
946	for (cellcount m = 0, first = 1; m < numberOfDocuments; m++) {
947	if (documentFrequency[m]) {
948	if (first) {
949	first = 0;
950	} else {
951	phraseData << ";";
952	}
953	// Output the document number. Note that here we've numbered the
954	// N documents from 0 to N-1, but later they'll be 1-N. Thus we
955	// add 1 to the document id when we output it.
956	phraseData << "d" << (m+1);
957	// Next, output the frequency with which the document occurs, but
958	// only if it is > 1.
959	if (documentFrequency[m] > 1) {
960	phraseData << "," << documentFrequency[m];
961	}
962	}
963	}
964
965	phraseData << endl;
966	phraseCounter++;
967
968	// feedback
969	if (verbosity) {
970	if (phraseCounter % 1000 == 0) {
971	cout << "phrase " << phraseCounter << ": "
972	<< "cell " << p.firstSuffixIndex << " - " << p << endl;
973	}
974	}
975	}
976	}
977	ij = p.lastSuffixIndex + 1;
978	}
979	outPhrase.close();
980
981	// REMAINING PASSES
982	// The previous outPhrase file forms the input to each new pass
983	cellcount start, length;
984	while (outPhraseCounter > 0) {
985
986	// Start a new pass
987	phrasePass++;
988	if (verbosity) {
989	cout << "Starting pass " << phrasePass << endl;
990	}
991
992	// Open the input file
993	char inPhraseName[FILENAME_MAX];
994	sprintf(inPhraseName, "%s/outPhrase.%d", collection, phrasePass - 1);
995	ifstream inPhrase (inPhraseName, ios::in);
996	if (!inPhrase) {
997	cerr << "File " << inPhraseName << " could not be opened\n";
998	exit(1);
999	}
1000
1001	// Open the output file
1002	sprintf(outPhraseName, "%s/outPhrase.%d", collection, phrasePass);
1003	outPhrase.open(outPhraseName, ios::out);
1004	if (!outPhrase) {
1005	cerr << "File " << outPhraseName << " could not be opened\n";
1006	exit(1);
1007	}
1008	outPhraseCounter = 0;
1009
1010	// Process each phrase
1011	while(inPhrase >> start >> length) {
1012
1013	// Ignore the phrase if we have expanded it before
1014	if (isPhraseStored(start, length))
1015	continue;
1016
1017	// Remember that we have examined this phrase
1018	rememberThisPhrase(start, length);
1019
1020	// Find the phrase in the suffixarray
1021	p = Phrase(suffixArray[start], length, SUFFIX);
1022	p.findFirstAndLastSuffix(start, inputLength-1);
1023
1024	// Ignore the phrase if it only occurs once
1025	if (p.suffixFrequency < 2)
1026	continue;
1027
1028	// Write the phrase text;
1029	phraseData << start << "-" << length << ":" << p << ":" << p.suffixFrequency << ":";
1030
1031	// Expand the phrase, if it is fewer than 8 words long
1032	if (length <= 8) {
1033
1034	// Get the minimal expansions for this phrase
1035	getExpansions(p, result);
1036
1037	// write the results
1038	phraseData << result.size() << ":";
1039
1040	for (cellcount i = 0; i < result.size(); i++) {
1041	if (i) {
1042	phraseData << ",";
1043	}
1044	phraseData << result[i].firstSuffixIndex << "-" << result[i].length;
1045	outPhrase << result[i].firstSuffixIndex << " " << result[i].length << endl;
1046	outPhraseCounter++;
1047	}
1048	result.clear();
1049
1050	} else {
1051	// phrase is too long to expand further
1052	phraseData << "0:";
1053	}
1054
1055	// Write the documents in which this phrase occurs
1056	df = getDocumentOccurrances(p, documentFrequency);
1057	phraseData << ":" << df << ":";
1058
1059	// write the documents
1060	for (cellcount i = 0, first = 1; i < numberOfDocuments; i++) {
1061	if (documentFrequency[i]) {
1062	if (first) {
1063	first = 0;
1064	} else {
1065	phraseData << ";";
1066	}
1067	// Output the document number. Note that here we've numbered the
1068	// N documents from 0 to N-1, but later they'll be 1-N. Thus we
1069	// add 1 to the document id when we output it.
1070	phraseData << "d" << (i+1);
1071	// Next, output the frequency with which the document occurs, but
1072	// only if it is > 1.
1073	if (documentFrequency[i] > 1) {
1074	phraseData << "," << documentFrequency[i];
1075	}
1076	}
1077	}
1078
1079	phraseData << endl;
1080	phraseCounter++;
1081
1082	// feedback
1083	if (verbosity) {
1084	if (phraseCounter % 1000 == 0) {
1085	cout << "phrase " << phraseCounter << ": "<< "start " << start
1086	<< ", length " << length << " - " << p << endl;
1087	}
1088	}
1089
1090	}
1091
1092	inPhrase.close();
1093	outPhrase.close();
1094	}
1095
1096	phraseData.close();
1097	deletePhraseMemory();
1098
1099	delete [] documentFrequency;
1100	delete [] symbols;
1101	delete [] suffixArray;
1102	delete [] prefixArray;
1103	delete [] suffixCheck;
1104	delete [] documentArray;
1105
1106
1107
1108	cout << endl << "Done: " << phraseCounter << " phrases in " << phraseDataName << endl;
1109	return 0;
1110	}
1111
1112
1113
1114
1115

Note: See TracBrowser for help on using the repository browser.

Download in other formats: