Context Navigation

suffix.cpp@ 26158

Last change on this file since 26158 was 26158, checked in by ak19, 12 years ago
Bugfix. Many thanks to Anthony Blake. The phind browse classifier used to fail on certain particular combinations of documents on 64 bit linux. This was because of a bug in suffix.exe that caused a seg fault: all pointers are 64 bits on 64 bit machines. However, suffixCompare() and prefixCompare() functions used to try to convert their arguments, which were void* pointers to pointers, by first casting the void* pointers to symbol* and then dereference these to get their values (which are ultimately 32 bit unsigned int) but which are meant to be addresses (pointers themselves) and then converting these back to pointers objects which are 64 bit values. During this process of multiple conversion, 64 bit pointers were truncated to 32 bits before being given 64 bits for storage again. This used to work on 32 bit machines before since pointers there were 32 bits, but showed up here. The solution was a double pointer cast operation: since the data in the void* arguments are actually pointers to pointers, convert them directly back to the specific pointer data types we need. Many thanks to Anthony Blake who initially came to help with working out why gdb wasn't loading the symbols despite suffix.cpp being compiled up with the minus-g flag (when attempting to debug by printing out the values of the dereferenced pointers where things appeared to go wrong or at least step through the code). He ended up finding out what the bug actually was and that a conversion to double pointers was the solution necessary to get things working on 64 bit machines as well as still working on 32 bit machines. Now the phind classifier's been tested on 64 bit and 32 bit machines. And Anthony confirmed that this issue has indeed been resolved. Though it struck randomly, as only certain document combinations in collections show up the underlying error via a seg fault, he inspected the assembly instructions in gdb to see that the move operations were now correct, which they hadn't been before.
Property svn:keywords set to `Author Date Id Revision`
File size: 31.7 KB

Line
1	/**********************************************************************
2	*
3	* Suffix.cpp -- Extract the repeated phrases in the input with suffix
4	* and prefix arrays (cgn & gwp's simpler algorithm,
5	* and kjm's improvements).
6	*
7	* Copyright 2000 Gordon W. Paynter
8	* Copyright 2000 The New Zealand Digital Library Project
9	*
10	* A component of the Greenstone digital library software from the
11	* New Zealand Digital Library Project at the University of Waikato,
12	* New Zealand.
13	*
14	* This program is free software; you can redistribute it and/or
15	* modify it under the terms of the GNU General Public License as
16	* published by the Free Software Foundation; either version 2 of
17	* the License, or (at your option) any later version.
18	*
19	* This program is distributed in the hope that it will be useful,
20	* but WITHOUT ANY WARRANTY; without even the implied warranty of
21	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22	* GNU General Public License for more details.
23	*
24	* You should have received a copy of the GNU General Public License
25	* along with this program; if not, write to the Free Software
26	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27	*
28	*********************************************************************/
29
30	#include <assert.h>
31	#include <math.h>
32	#include <stdio.h>
33	#include <stdlib.h>
34	#include <string.h>
35
36	#if defined(GSDL_USE_IOS_H)
37	# include <fstream.h>
38	# include <iostream.h>
39	#else
40	# include <fstream>
41	# include <iostream>
42	#endif
43
44	#if defined(GSDL_USE_STL_H)
45	# if defined(GSDL_USE_ALGO_H)
46	# include <algo.h>
47	# else
48	# include <algorithm.h>
49	# endif
50	# include <vector.h>
51	#else
52	# include <algorithm>
53	# include <vector>
54	#endif
55
56	#include "suffix.h"
57	#include "phrase.h"
58	#include "check.h"
59	#include "mglong.h"
60
61	// Global variables declared in suffix.h
62	cellcount inputLength;
63
64	symbol *symbols;
65	symbol **suffixArray;
66	symbol **prefixArray;
67
68	// How many documents are in this collection?
69	cellcount numberOfDocuments;
70	symbol **documentArray;
71
72
73	// Do we accept any phrase, or do we eliminate those ending with stopwords ?
74	int phraseMode = ANYPHRASE; //STOPWORDS;
75	// What is the minimum phrase frequency for a phrase to be included in the hierarchy
76	int minOccurs = 2;
77
78	// The filestem of the collection's phindex directory
79	char collection[FILENAME_MAX];
80
81
82	// The ranges of the stopword and content-word symbols for the collection
83	symbol firstStopSymbol = 0;
84	symbol lastStopSymbol = 0;
85	symbol firstContentSymbol = 0;
86	symbol lastContentSymbol = 0;
87
88
89	// Some useful comparison functions, defined below.
90	int suffixCompare(const void , const void );
91	int prefixCompare(const void , const void );
92	int pointerCompare(const void , const void );
93
94
95	// Functions for implementing "phrase memory". These let us "remember"
96	// each phrase that we've expanded without using too much memory.
97	void initialisePhraseMemory();
98	void rememberThisPhrase(cellindex index, cellcount length);
99	bool isPhraseStored(cellindex index, cellcount length);
100	void deletePhraseMemory();
101
102
103	// how much output do we want?
104	int verbosity = 1;
105
106
107	// Get a phrase's expansions
108	//
109	// Get the set of "minimal", "maximal", non-unique expansions of a
110	// phrase p, using the simpler algorithm that Craig and Gordon came up
111	// with at Google.
112	//
113	// Returns a vector of Expansions.
114
115	void getExpansions(Phrase &p, vector<Phrase> &results) {
116
117	// 1. Get the initial candidates
118	vector<Phrase> candidates;
119	p.initialSuffixCandidates(candidates);
120	int suffcands = candidates.size();
121	p.initialPrefixCandidates(candidates);
122
123	if (candidates.size() == 0)
124	return;
125
126	vector<Phrase>::iterator i;
127	for (i = candidates.begin(); i != candidates.end(), suffcands>0; ++i, --suffcands) {
128	i->expandWhileUniquePrefixExtension();
129	i->ensureSuffixFound();
130	}
131	for (i; i != candidates.end(); ++i) {
132	i->expandWhileUniqueSuffixExtension();
133	}
134
135	// 3. Ensure minimality: ignore phrases whose subphrases are also found
136	results.clear();
137
138	// Initialise the candidates, check array, and various variables.
139	sort(candidates.begin(), candidates.end(), isShorter);
140	unsigned minimum_length = candidates.begin()->length;
141	clearSuffixCheck();
142
143	// Try to add each candidate to the results set, ignoring the non-minimal
144	for (vector<Phrase>::iterator candidate = candidates.begin();
145	candidate != candidates.end(); ++candidate) {
146
147	// Make a copy of candidate to mutilate while performing sub-phrase checks
148	Phrase temp_phrase(*candidate);
149	bool shorter_found = false;
150
151	// Check for shorter and shorter versions of the temporary phrase
152	while (temp_phrase.length >= minimum_length && !shorter_found) {
153	temp_phrase.ensureSuffixFound();
154	if (getSuffixCheck(temp_phrase.firstSuffixIndex)==0)
155	temp_phrase.shortenByOneAtPrefix();
156	else
157	shorter_found = true;
158
159	// Possible efficiency here: we can finish if the prefix of c
160	// and temp_phrase are the same for candidate->length symbols.
161	}
162
163	// If no shorter phrase is found, use this one
164	if (!shorter_found) {
165	results.push_back(*candidate);
166	candidate->ensureSuffixFound();
167	setSuffixCheck(candidate->firstSuffixIndex, candidate->lastSuffixIndex);
168	}
169	}
170	}
171
172
173	// suffixCompare
174	//
175	// Compare two pointers into a suffix array. We use this in the
176	// qsort function, so the input are pointers to pointers.
177	//
178	// Return -1 if (a < b), otherwise (a > b) so return +1,
179
180	int suffixCompare(const void cpa, const void cpb) {
181
182	// The following is a bug and so at times results in a segmentation fault on 64 bit linux machines.
183	// The bug is in the 3rd and 4th lines, which perform identical operations. Considering the 3rd line:
184	// The pa part of this statement truncates pa, the 64 bit pointer (symbol), with the 32 bit value
185	// symbol (unsigned int) that it's being dereferenced to, before trying to reconvert it to a pointer
186	// which is always 64 bit on 64 bit machines. The 4th line has the same problem and prefixCompare() too.
187
188	// WRONG: Cast then dereference pointers to suffix array elements
189	//symbol pa = (symbol ) cpa;
190	//symbol pb = (symbol ) cpb;
191	//pa = (symbol ) pa;
192	//pb = (symbol ) pb;
193
194	// This is the correct way that will work on both 64 bit and 32 bit machines:
195	// the 2 void* arguments to this function, cpa and cpb, are "pointers to elements" as per
196	// http://www.cplusplus.com/reference/clibrary/cstdlib/qsort/. The array elements themselves are also
197	// pointers themselves: to symbol values. As the comment to this function says: cpa and cpb are
198	// pointers to pointers, making them double pointers. When we cast the void* cpa and cpb arguments to
199	// the types we need to work with here, they therefore need to be cast as double pointers to symbol.
200	// We declare pa and pb as symbol pointers (symbol*) to the actual values being compared, therefore pa
201	// and pb need to be instantiated as the address of the values referenced by cpa and cpb.
202	symbol pa = ((symbol **) cpa);
203	symbol pb = ((symbol **) cpb);
204
205
206	// If the two elements are the same, examine the next one
207	while (pa == pb) {
208	*pa++;
209	*pb++;
210	}
211
212	// Make the copmparison and return
213	if ( pa < pb) {
214	return -1;
215	} else {
216	return +1;
217	}
218	}
219
220
221	// prefixCompare
222	//
223	// Compare two pointers into a prefix array. We use this in the
224	// qsort function, so the input are pointers to pointers.
225	//
226	// Return -1 if (a > b), otherwise (a < b) so return +1,
227
228	int prefixCompare(const void cpa, const void cpb) {
229
230	// Cast and dereference double pointers (pointers to prefix array elements)
231	// See comments in suffixCompare()
232	symbol pa = ((symbol **) cpa);
233	symbol pb = ((symbol **) cpb);
234
235	// If the two elements are the same, examine the next one
236	while (pa == pb) {
237	*pa--;
238	*pb--;
239	}
240
241	// Make the comparison and return
242	if ( pa > pb) {
243	return -1;
244	} else {
245	return +1;
246	}
247	}
248
249	// simpleCompare
250	//
251	// Compare two pointers based on the memory location they point to.
252
253	int pointerCompare( const void pa, const void pb ) {
254
255	symbol a = (symbol ) pa;
256	symbol b = (symbol ) pb;
257
258	if (a < b) {
259	return -1;
260	} else if (a > b) {
261	return 1;
262	} else {
263	return 0;
264	}
265	}
266
267
268	// Read the clauses.numbers file into the "symbols" array.
269	//
270	// Each number in the file is a symbol number; it is essential that
271	// the first symbol (and no others) be COLLECTIONSTART and the last
272	// symbol (and no others) be COLLECTIONEND.
273	//
274	// Return the number of numbers in the array.
275
276	int readNumbers() {
277
278	char filename[FILENAME_MAX];
279	sprintf(filename, "%s/clauses.numbers", collection);
280	if (verbosity) {
281	cout << "Reading numbers file: " << filename << endl;
282	}
283
284	// Open the numbers file
285	ifstream inFile1(filename, ios::in);
286	if (!inFile1) {
287	cerr << "File " << filename << " could not be opened\n";
288	exit(1);
289	}
290
291	// Count the number of symbols
292	inputLength = 0;
293	symbol word;
294	while (inFile1 >> word) {
295	++inputLength;
296	}
297	inFile1.close();
298
299	// Allocate the symbbols array
300	if (verbosity > 1) {
301	cout << "Allocating symbol arrays for " << inputLength << " symbols" << endl;
302	}
303	symbols = new symbol[inputLength];
304	if (symbols == NULL) {
305	cerr << "Suffix error: not enough memory to hold " << inputLength
306	<< " symbols." << endl;
307	exit(2);
308	}
309
310	// Read the numbers file into the numbers array
311	if (verbosity > 2) {
312	cout << "Reading the numbers" << endl;
313	}
314
315	ifstream inFile2(filename, ios::in);
316	if (!inFile2) {
317	cerr << "File " << filename << " could not be opened\n";
318	exit(1);
319	}
320	cellcount next = 0;
321	numberOfDocuments = 0;
322	while (inFile2 >> word) {
323	symbols[next++] = word;
324	if (word == DOCUMENTSTART) {
325	++numberOfDocuments;
326	}
327	}
328	inFile2.close();
329
330	// Make sure the numbers file is intact
331	assert(symbols[0] == COLLECTIONSTART);
332	assert(symbols[next-1] == COLLECTIONEND);
333
334	return inputLength;
335	}
336
337
338
339	// Get Document Occurrance statistics
340	//
341	// Given a phrase, what documents does it occur in?
342
343	cellcount getDocumentOccurrances(const Phrase &p, cellcount *frequency) {
344
345	// cout << "searching for \""<< p << "\" in documents "
346	// << 0 << "-" << numberOfDocuments - 1 << endl;
347
348	// The number of documents in which this phrase occurs
349	cellcount df = 0;
350
351	// Initialise the document frequency array
352	// for (cellindex i = 0; i < numberOfDocuments; ++i) {
353	// frequency[i] = 0;
354	//}
355	memset(frequency, 0, sizeof(cellcount)*numberOfDocuments);
356
357	// variables used to facilitate the search
358	cellindex begin;
359	cellindex end;
360	cellindex d;
361	symbol *target;
362	bool found;
363
364	// search for the document in which each occurence of the phrase is found
365	for (cellcount j = p.firstSuffixIndex; j <= p.lastSuffixIndex; ++j) {
366
367	// cout << "looking for phrase at suffixArray[" << j << "]\n";
368
369	target = suffixArray[j];
370	begin = 0;
371	end = numberOfDocuments - 1;
372	found = false;
373
374	// Search for the occurence of a document delimiter that target
375	// occurs immediately after.
376	// We do this by performing a binary chop search on documentArray.
377	while (!found) {
378
379	// cout << "searching for " << (cellindex) target << " in "
380	// << begin << " - " << end << endl;
381
382	assert (begin <= end);
383
384	// If the beginning and end of the interval are the same,
385	// then we've found the correct document
386	if (begin == end) {
387	if (frequency[begin] == 0) {
388	++df;
389	}
390	++frequency[begin];
391	found = true;
392	}
393
394	// Otherwise, examine a new document midway through the begin-end
395	// interval and see if it is the one.
396	else {
397	d = (begin + end) / 2;
398	if (target > documentArray[d]) {
399	// If target addrss is greater than this, but thisi sthe last document,
400	// then this must be the one we want. Or, if target is greater than
401	// this one but less then the next, this must be the one we wnat.
402	if ((d == numberOfDocuments - 1) \|\| (target < documentArray[d+1])) {
403	if (frequency[d] == 0) {
404	++df;
405	}
406	++frequency[d];
407	found = true;
408	} else {
409	// otherwise we know to search later in the document set
410	begin = d + 1;
411	}
412	} else {
413	// search earlier in the document set
414	end = d - 1;
415	}
416	}
417	}
418	}
419	return df;
420	}
421
422
423
424
425
426
427	// phraseExpansionMemory : Which phrases have we expanded?
428	//
429	// A set of utilities for keeping track of which phrases we have expanded.
430	// We don't want to expand a phrase more than once, after all.
431	//
432	// This REALLY ought to be in its own class, but it works so that's okay.
433	//
434	// Phrases are identified by their firstSuffixPosition and length.
435	//
436	// Functions provided are:
437	// void initialisePhraseMemory()
438	// void rememberThisPhrase(index, length)
439	// bool isPhraseStored(index, length)
440	// void deletePhraseMemory()
441	//
442	// Internally, we will have two separate cases:
443	//
444	// Phrases of length 1-8:
445	// unsigned char phraseMemory[inputLength]
446	// is an array where each cell "remembers" the corresponding index in the
447	// suffixArray, and each of the 8 bits of the cell correspond to the phrases
448	// of length 1, 2... 8.
449	// Eventually, we will make this disk-based (i.e. store the array in a file).
450	//
451	// Phrases of length 9+:
452	// file hashTableFile
453	// file listOfEntries
454	// The first file is a hash table; each phrase maps to one of its cells, which
455	// contains either 0 (empty, no occurence) or a number which is an entry number
456	// in the second file. This file contains a "list" of entries. Each consists of
457	// three numbers: the suffixArray index of the phrase, the length of the phrase,
458	// and the entry number of the next phrase with the same hash.
459	//
460
461
462	unsigned char *phraseMemory;
463
464	void initialiseLongPhraseMemory();
465	void rememberThisLongPhrase(cellindex index, cellcount length);
466	bool isLongPhraseStored(cellindex index, cellcount length);
467	void deleteLongPhraseMemory();
468
469
470	void initialisePhraseMemory() {
471
472	phraseMemory = new unsigned char[inputLength];
473
474	// to begin with, everything is empty
475	// for (cellcount i = 0; i < inputLength; ++i) {
476	// phraseMemory[i] = 0;
477	//}
478	memset(phraseMemory, 0, sizeof(char)*inputLength);
479
480	// intialise the hashTable of long phrases
481	initialiseLongPhraseMemory();
482
483	}
484
485	void rememberThisPhrase(cellindex index, cellcount length) {
486
487	// if the phrase is very long, use the file-based system
488	if (length > 8) {
489	rememberThisLongPhrase(index, length);
490	return;
491	}
492
493	// create a char with just the bit corresponding to length set
494	unsigned char newbit = 1;
495	for (cellcount i = 1; i < length; ++i) {
496	newbit <<= 1;
497	}
498
499	// set that bit in the memory array at position index
500	phraseMemory[index] \|= newbit;
501	}
502
503
504	bool isPhraseStored(cellindex index, cellcount length) {
505
506	// if the phrase is very long, use the file-based system
507	if (length > 8) {
508	return isLongPhraseStored(index, length);
509	}
510
511	// create a char with just the bit corresponding to length set
512	unsigned char newbit = 1;
513	for (cellcount i = 1; i < length; ++i) {
514	newbit <<= 1;
515	}
516
517	// retrurn true if that bit is set in memory arrayat position index
518	return (phraseMemory[index] & newbit);
519	}
520
521	void deletePhraseMemory() {
522	delete []phraseMemory;
523	deleteLongPhraseMemory();
524	}
525
526
527
528	// Files etc used to store "long" equavlents of the above
529
530	fstream hashTableFile;
531	char hashTableFileName[FILENAME_MAX];
532	fstream listOfEntries;
533	char listOfEntriesName[FILENAME_MAX];
534	cellindex nextEntryNumber;
535
536	const cellcount bigPrime = 7919;
537
538
539	void initialiseLongPhraseMemory() {
540
541	cellindex example = 0;
542
543	sprintf(hashTableFileName, "%s/hashTable", collection);
544	sprintf(listOfEntriesName, "%s/hashLists", collection);
545
546
547	// create the new hashtable
548	if (verbosity > 1) {
549	cout << "Initialising hashTable: " << hashTableFileName << endl;
550	}
551	hashTableFile.open(hashTableFileName, ios::in \| ios::out);
552	for (cellcount i = 0; i < bigPrime; ++i) {
553	hashTableFile.write((char *) &example, sizeof(example));
554	}
555
556	// create the list of phrases
557	if (verbosity > 1) {
558	cout << "Initialising list of hashtable entries: " << listOfEntriesName << endl;
559	}
560	listOfEntries.open(listOfEntriesName, ios::in \| ios::out);
561	listOfEntries.write((char *) &example, sizeof(example));
562	listOfEntries.write((char *) &example, sizeof(example));
563	listOfEntries.write((char *) &example, sizeof(example));
564	nextEntryNumber = 1;
565	}
566
567
568	void rememberThisLongPhrase(cellindex index, cellcount length) {
569
570	// cout << "rememberThisLongPhrase(" << index << ", " << length << ")\n";
571
572	cellindex hashOffset = ((index + length) % bigPrime) * sizeof(cellindex);
573	cellindex pointer;
574	cellindex zero = 0;
575	cellindex readp = 0;
576	cellindex readi = 0;
577	cellindex readl = 0;
578
579	hashTableFile.seekg(hashOffset);
580	hashTableFile.read((char *) &pointer, sizeof(cellindex));
581
582	if (pointer == 0) {
583	// There is no entry at all in the hash table for this entry
584	// so create one
585
586	pointer = nextEntryNumber++;
587	hashTableFile.seekg(hashOffset);
588	hashTableFile.write((char *) &pointer, sizeof(cellindex));
589
590	listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
591	listOfEntries.write((char *) &zero, sizeof(cellindex));
592	listOfEntries.write((char *) &index, sizeof(cellindex));
593	listOfEntries.write((char *) &length, sizeof(cellindex));
594
595	} else {
596	// There is a list starting at this hash value, so the phrase may
597	// be already remembered, or it might need to be appended
598
599	while (pointer != 0) {
600	// Read the entry pointed to by pointer
601	listOfEntries.seekg(pointer * sizeof(cellindex) * 3);
602	listOfEntries.read((char *) &readp, sizeof(cellindex));
603	listOfEntries.read((char *) &readi, sizeof(cellindex));
604	listOfEntries.read((char *) &readl, sizeof(cellindex));
605
606	// cout << "read " << pointer << ", " << readp << ", " << readi << ", " << readl << endl;
607
608	if ((readi == index) && (readl = length)) {
609	// we've found that we've already stored it
610	return;
611	} else if (readp == 0) {
612	// we're reached the end of the list. Add a new entry.
613	listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
614	listOfEntries.write((char *) &nextEntryNumber, sizeof(cellindex));
615	pointer = nextEntryNumber++;
616
617	listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
618	listOfEntries.write((char *) &zero, sizeof(cellindex));
619	listOfEntries.write((char *) &index, sizeof(cellindex));
620	listOfEntries.write((char *) &length, sizeof(cellindex));
621	return;
622	} else {
623	// go on to the next node
624	pointer = readp;
625	}
626	}
627	}
628
629
630	}
631
632
633	bool isLongPhraseStored(cellindex index, cellcount length) {
634
635	// cout << "isLongPhraseExpanded(" << index << ", " << length << ")\n";
636
637	cellindex hashOffset = ((index + length) % bigPrime) * sizeof(cellindex);
638	cellindex pointer;
639	cellindex readp = 0;
640	cellindex readi = 0;
641	cellindex readl = 0;
642
643	// Find the phrase in the hashFile
644	hashTableFile.seekg(hashOffset);
645	hashTableFile.read((char *) &pointer, sizeof(cellindex));
646
647	if (pointer == 0) {
648	// There is no entry at all in the hash table for this entry
649	// so nothing is stored
650	return false;
651
652	} else {
653	// There is a list starting at this hash value, so the phrase may
654	// be already remembered in that list
655	while (pointer != 0) {
656	// Read the entry pointed to by pointer
657	listOfEntries.seekg(pointer * sizeof(cellindex) * 3);
658	listOfEntries.read((char *) &readp, sizeof(cellindex));
659	listOfEntries.read((char *) &readi, sizeof(cellindex));
660	listOfEntries.read((char *) &readl, sizeof(cellindex));
661
662	if ((readi == index) && (readl = length)) {
663	// we've found the phrase stored here
664	return true;
665	} else {
666	// go on to the next node
667	pointer = readp;
668	}
669	}
670	}
671	return false;
672	}
673
674
675	void deleteLongPhraseMemory() {
676	// remove the hash & other files
677
678	hashTableFile.close();
679	listOfEntries.close();
680	remove(hashTableFileName);
681	remove(listOfEntriesName);
682
683	}
684
685
686	// Read the collection statistics file
687	//
688	void readStatistics() {
689
690	// open the statistics file
691	char filename[FILENAME_MAX];
692	sprintf(filename, "%s/clauses.stats", collection);
693
694	// Open the file
695	ifstream inFile(filename, ios::in);
696	if (!inFile) {
697	cerr << "File " << filename << " could not be opened\n";
698	exit(1);
699	}
700
701	// Read the numbers file into the numbers array
702	char key[1000];
703	symbol value;
704	while (inFile >> key >> value){
705	if (strcmp(key, "first_stopword") == 0) {
706	firstStopSymbol = value;
707	} else if (strcmp(key, "last_stopword") == 0) {
708	lastStopSymbol = value;
709	} else if (strcmp(key, "first_contentword") == 0) {
710	firstContentSymbol = value;
711	} else if (strcmp(key, "last_contentword") == 0) {
712	lastContentSymbol = value;
713	}
714	}
715	inFile.close();
716
717	// Make sure we have the information we need
718	if (!(firstStopSymbol && lastStopSymbol && firstContentSymbol && lastContentSymbol)) {
719	cerr << "Statistics file incomplete" << endl;
720	exit(1);
721	}
722	}
723
724	cellcount getContentCount(symbol firstContent) {
725
726	cellcount content=0;
727	for (cellcount i=0; i<inputLength; ++i) {
728	if (symbols[i]>=firstContent) ++content;
729	}
730
731	return content;
732	}
733
734
735	int main (int argc, char * argv[]) {
736
737	// Command-line arguments
738	// argv[1] is the phindex directory
739	// argv[2] is the mode, where 1 is stopword mode
740	// argv[3] is the min_occurs, - minimum occurrence frequency for a phrase to be included in the hierarchy
741	// argv[4] is opitonal verbosity
742	if (argc < 4) {
743	cerr << "Usage: " << argv[0] << " phind-directory mode min-phrase-freq [verbosity]" << endl;
744	exit(1);
745	}
746
747	// collection directory
748	strcpy(collection, argv[1]);
749
750	// mode parameter
751	phraseMode = atoi(argv[2]);
752	assert((phraseMode == STOPWORDS) \|\| (phraseMode == ANYPHRASE));
753
754	minOccurs = atoi(argv[3]);
755	assert((minOccurs > 0));
756
757	// optional verbosity parameter
758	if (argc == 5) {
759	verbosity = atoi(argv[4]);
760	assert (verbosity >= 0);
761	}
762
763	if (verbosity) {
764	cout << "suffix: the phrase extraction program" << endl;
765	}
766	if (verbosity > 1) {
767	if (phraseMode == STOPWORDS) {
768	cout << "Stopwords mode: no phrase may begin or end with a stopword" << endl;
769	} else {
770	cout << "AllPhrase mode: extract every phrase that occurs more than once" << endl;
771	}
772	}
773
774	// Read the statistics file
775	readStatistics();
776	// Read the numbers file
777	readNumbers();
778
779	if (numberOfDocuments == 0) {
780	cerr << "There are no documents in this collection!" << endl;
781	exit(0); // not necessarily an error...
782	}
783
784	symbol firstContent;
785	if (phraseMode==STOPWORDS) firstContent=firstContentSymbol;
786	else firstContent = firstStopSymbol;
787
788	// Allocate memory for the suffix & prefix arrays
789	cellcount contentLength = 0;
790	contentLength = getContentCount(firstContent);
791	suffixArray = new symbol *[contentLength];
792	prefixArray = new symbol *[contentLength];
793	if (prefixArray == NULL) {
794	cerr << "Suffix: not enough memory to hold " << inputLength << " symbols." << endl;
795	exit(2);
796	}
797	allocateSuffixCheck(contentLength);
798	// Initialise prefix and suffix arrays, only use the needed suffixes
799	for (cellcount j = 0, here = 0; j < inputLength; ++j) {
800	if (symbols[j]>=firstContent) {
801	suffixArray[here] = &symbols[j];
802	prefixArray[here] = &symbols[j];
803	++here;
804	}
805	}
806	qsort(suffixArray, contentLength, sizeof(symbol *), suffixCompare);
807	qsort(prefixArray, contentLength, sizeof(symbol *), prefixCompare);
808	// Create the document arrays
809	if (verbosity > 1) {
810	cout << "Allocating document arrays for " << numberOfDocuments << " documents" << endl;
811	}
812
813	// The document frequecy array is used to count the number of times
814	// each phrase occurs in each document. The number of documents in
815	// which a phrase occurs is stored in df.
816	frequency *documentFrequency = new frequency[numberOfDocuments];
817	frequency df;
818
819	// documentArray will be searched in order to discover which document
820	// each phrase occurs in.
821	documentArray = new symbol *[numberOfDocuments];
822	if (documentArray == NULL) {
823	cerr << "Suffix: out of memory allocating document arrays." << endl;
824	exit(2);
825	}
826
827	// just scan through the input text to find the doc starts
828	cellindex d = 0;
829	for (cellindex i=0; i<inputLength; ++i) {
830	if (symbols[i] == DOCUMENTSTART) {
831	documentArray[d] = &symbols[i];
832	++d;
833	}
834	}
835
836	// the phrases stuff is expecting inputLength to be the length of the
837	// suffix array, so change it.
838	inputLength = contentLength;
839
840	// Extract phrases
841	//
842	// We will make several passesover the data, in each case considering
843	// a set of input phrases and generating a set of output phrases, which
844	// we will expancd in later passes.
845	//
846	// The input phrases in the first pass will be the vocabulary.
847	// In later passes, the input phrases will be the output phrases of the
848	// previous pass.
849	//
850	// In each pass we will consider each input phrase in turn. If we
851	// have seen it before, we will ignore it. Otherwise, we will expand
852	// it and add its expansions to the set of output phrases.
853
854	cout <<"\ngenerating the phrase hierarchy\n\n";
855
856	// Store the phrase data in the phrases file
857	char phraseDataName[FILENAME_MAX];
858	sprintf(phraseDataName, "%s/phrases", collection);
859	ofstream phraseData(phraseDataName, ios::out);
860	if (!phraseData) {
861	cout << "File " << phraseDataName << " could not be opened\n";
862	exit(1);
863	}
864
865	// Count the number of phrases output
866	mg_u_long phraseCounter = 0; //unsigned long int phraseCounter = 0;
867
868	// Set up the phrase expansion memory.
869	// We need this so that we don't expand a phrase more than once
870	initialisePhraseMemory();
871
872	// The current pass numebr
873	int phrasePass = 1;
874
875
876	// PASS NUMBER 1
877	if (verbosity > 1) {
878	cout << "Starting pass " << phrasePass << endl;
879	}
880
881	ofstream outPhrase;
882	char outPhraseName[FILENAME_MAX];
883	mg_u_long outPhraseCounter = 0; // unsigned long int outPhraseCounter = 0;
884
885	// On the first pass, simply work through the vocabulary
886	sprintf(outPhraseName, "%s/outPhrase.1", collection);
887	outPhrase.open(outPhraseName, ios::out);
888	if (!outPhrase) {
889	cerr << "File " << outPhraseName << " could not be opened\n";
890	exit(1);
891	}
892
893	// Iterate over the different symbols by working through the suffix array
894	vector<Phrase> result;
895	cellindex ij = 0;
896	char *tmpString;
897
898	Phrase p;
899	while (ij < inputLength) {
900
901	// make a new phrase of length 1
902	p = Phrase(suffixArray[ij], 1, SUFFIX);
903	p.findFirstAndLastSuffix(ij, inputLength-1);
904
905	// We ignore this symbol if it occurs only once, if it is a delimiter,
906	// of if we are in stopwords mode and it is a stopword
907	// - in this new version, only need to check freq
908	// We could imagine a new mode/command-line option, which is like
909	// STOPWORDS but without this restrictrion. This would let you browse
910	// from "the" to "the AGRIS" for example, but not from "AGRIS" to
911	// "the AGRIS" (where the is a stopword and AGRIS a content word).
912	// The system used to work like this; it is easy to implement, but
913	// it explodes the size of the indexes. So: would it be useful?
914	if (p.suffixFrequency >= minOccurs) {
915	// Get minimal expansions of the phrase
916	getExpansions(p, result);
917
918	if (!result.empty()) {
919
920	// Remember that we have expanded this phrase
921	rememberThisPhrase(ij, 1);
922
923	// write the phrase text
924	phraseData << ij << "-1:" << p << ":" << p.suffixFrequency << ":"
925	<< result.size() << ":";
926
927	// write the results
928	for (cellcount k = 0; k < result.size(); ++k) {
929	if (k) {
930	phraseData << ",";
931	}
932	phraseData << result[k].firstSuffixIndex << "-" << result[k].length;
933	outPhrase << result[k].firstSuffixIndex << " " << result[k].length << endl;
934	++outPhraseCounter;
935	}
936	result.clear();
937
938	// Write the documents in which this phrase occurs
939	df = getDocumentOccurrances(p, documentFrequency);
940	phraseData << ":" << df << ":";
941
942	// write the documents
943	for (cellcount m = 0, first = 1; m < numberOfDocuments; ++m) {
944	if (documentFrequency[m]) {
945	if (first) {
946	first = 0;
947	} else {
948	phraseData << ";";
949	}
950	// Output the document number. Note that here we've numbered the
951	// N documents from 0 to N-1, but later they'll be 1-N. Thus we
952	// add 1 to the document id when we output it.
953	phraseData << "d" << (m+1);
954	// Next, output the frequency with which the document occurs, but
955	// only if it is > 1.
956	if (documentFrequency[m] > 1) {
957	phraseData << "," << documentFrequency[m];
958	}
959	}
960	}
961
962	phraseData << endl;
963	++phraseCounter;
964
965	// feedback
966	if (verbosity) {
967	if (phraseCounter % 1000 == 0) {
968	cout << "phrase " << phraseCounter << ": "
969	<< "cell " << p.firstSuffixIndex << " - " << p << endl;
970	}
971	}
972	}
973	}
974	ij = p.lastSuffixIndex + 1;
975	}
976	outPhrase.close();
977
978	// REMAINING PASSES
979	// The previous outPhrase file forms the input to each new pass
980	cellcount start, length;
981	while (outPhraseCounter > 0) {
982
983	// Start a new pass
984	++phrasePass;
985	if (verbosity) {
986	cout << "Starting pass " << phrasePass << endl;
987	}
988
989	// Open the input file
990	char inPhraseName[FILENAME_MAX];
991	sprintf(inPhraseName, "%s/outPhrase.%d", collection, phrasePass - 1);
992	ifstream inPhrase (inPhraseName, ios::in);
993	if (!inPhrase) {
994	cerr << "File " << inPhraseName << " could not be opened\n";
995	exit(1);
996	}
997
998	// Open the output file
999	sprintf(outPhraseName, "%s/outPhrase.%d", collection, phrasePass);
1000	outPhrase.open(outPhraseName, ios::out);
1001	if (!outPhrase) {
1002	cerr << "File " << outPhraseName << " could not be opened\n";
1003	exit(1);
1004	}
1005	outPhraseCounter = 0;
1006
1007	// Process each phrase
1008	while(inPhrase >> start >> length) {
1009
1010	// Ignore the phrase if we have expanded it before
1011	if (isPhraseStored(start, length))
1012	continue;
1013
1014	// Remember that we have examined this phrase
1015	rememberThisPhrase(start, length);
1016
1017	// Find the phrase in the suffixarray
1018	p = Phrase(suffixArray[start], length, SUFFIX);
1019	p.findFirstAndLastSuffix(start, inputLength-1);
1020
1021	// Ignore the phrase if it only occurs once
1022	if (p.suffixFrequency < minOccurs)
1023	continue;
1024
1025	// Write the phrase text;
1026	phraseData << start << "-" << length << ":" << p << ":" << p.suffixFrequency << ":";
1027
1028	// Expand the phrase, if it is fewer than 8 words long
1029	if (length <= 8) {
1030
1031	// Get the minimal expansions for this phrase
1032	getExpansions(p, result);
1033
1034	// write the results
1035	phraseData << result.size() << ":";
1036
1037	for (cellcount i = 0; i < result.size(); ++i) {
1038	if (i) {
1039	phraseData << ",";
1040	}
1041	phraseData << result[i].firstSuffixIndex << "-" << result[i].length;
1042	outPhrase << result[i].firstSuffixIndex << " " << result[i].length << endl;
1043	++outPhraseCounter;
1044	}
1045	result.clear();
1046
1047	} else {
1048	// phrase is too long to expand further
1049	phraseData << "0:";
1050	}
1051
1052	// Write the documents in which this phrase occurs
1053	df = getDocumentOccurrances(p, documentFrequency);
1054	phraseData << ":" << df << ":";
1055
1056	// write the documents
1057	for (cellcount i = 0, first = 1; i < numberOfDocuments; ++i) {
1058	if (documentFrequency[i]) {
1059	if (first) {
1060	first = 0;
1061	} else {
1062	phraseData << ";";
1063	}
1064	// Output the document number. Note that here we've numbered the
1065	// N documents from 0 to N-1, but later they'll be 1-N. Thus we
1066	// add 1 to the document id when we output it.
1067	phraseData << "d" << (i+1);
1068	// Next, output the frequency with which the document occurs, but
1069	// only if it is > 1.
1070	if (documentFrequency[i] > 1) {
1071	phraseData << "," << documentFrequency[i];
1072	}
1073	}
1074	}
1075
1076	phraseData << endl;
1077	++phraseCounter;
1078
1079	// feedback
1080	if (verbosity) {
1081	if (phraseCounter % 1000 == 0) {
1082	cout << "phrase " << phraseCounter << ": "<< "start " << start
1083	<< ", length " << length << " - " << p << endl;
1084	}
1085	}
1086
1087	}
1088
1089	inPhrase.close();
1090	outPhrase.close();
1091	}
1092
1093	phraseData.close();
1094	deletePhraseMemory();
1095
1096	delete [] documentFrequency;
1097	delete [] symbols;
1098	delete [] suffixArray;
1099	delete [] prefixArray;
1100	delete [] suffixCheck;
1101	delete [] documentArray;
1102
1103
1104
1105	cout << endl << "Done: " << phraseCounter << " phrases in " << phraseDataName << endl;
1106	return 0;
1107	}
1108
1109
1110
1111
1112

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/trunk/greenstone2/build-src/src/phind/generate/suffix.cpp@ 26158

Download in other formats: