Context Navigation

source: trunk/gsdl/src/phind/generate/suffix.cpp@ 2807

Last change on this file since 2807 was 2807, checked in by kjm18, 23 years ago
changed suffixCheck to bit array.
Property svn:keywords set to `Author Date Id Revision`
File size: 30.7 KB

Rev	Line
[1562]	1	/**********************************************************************
	2	*
[2801]	3	* Suffix.cpp -- Extract the repeated phrases in the input with suffix
	4	* and prefix arrays (cgn & gwp's simpler algorithm,
	5	* and kjm's improvements).
[1562]	6	*
[1873]	7	* Copyright 2000 Gordon W. Paynter
[1631]	8	* Copyright 2000 The New Zealand Digital Library Project
[1562]	9	*
	10	* A component of the Greenstone digital library software
	11	* from the New Zealand Digital Library Project at the
	12	* University of Waikato, New Zealand.
	13	*
	14	* This program is free software; you can redistribute it and/or modify
	15	* it under the terms of the GNU General Public License as published by
	16	* the Free Software Foundation; either version 2 of the License, or
	17	* (at your option) any later version.
	18	*
	19	* This program is distributed in the hope that it will be useful,
	20	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	21	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	22	* GNU General Public License for more details.
	23	*
	24	* You should have received a copy of the GNU General Public License
	25	* along with this program; if not, write to the Free Software
	26	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	27	*
	28	*********************************************************************/
	29
	30	#include <assert.h>
	31	#include <math.h>
	32	#include <stdio.h>
	33	#include <stdlib.h>
	34	#include <string.h>
	35
[2487]	36	#if defined(GSDL_USE_IOS_H)
	37	# include <fstream.h>
	38	# include <iostream.h>
	39	#else
	40	# include <fstream>
	41	# include <iostream>
	42	#endif
[1562]	43
[2487]	44	#if defined(GSDL_USE_STL_H)
	45	# if defined(GSDL_USE_ALGO_H)
	46	# include <algo.h>
	47	# else
	48	# include <algorithm.h>
	49	# endif
	50	# include <vector.h>
	51	#else
	52	# include <algorithm>
	53	# include <vector>
	54	#endif
	55
[1562]	56	#include "suffix.h"
	57	#include "phrase.h"
	58
	59	// Global variables declared in suffix.h
	60	cellcount inputLength;
	61
	62	symbol *symbols;
	63	symbol **suffixArray;
[2801]	64	symbol **prefixArray;
[1562]	65	check *suffixCheck;
	66
[2807]	67	// the length of the check array
	68	cellcount checkLength;
[1562]	69
	70	// How many documents are in this collection?
	71	cellcount numberOfDocuments;
	72	symbol **documentArray;
	73
[2801]	74
[1562]	75	// Do we accept any phrase, or do we eliminate those ending with stopwords ?
	76	int phraseMode = ANYPHRASE; //STOPWORDS;
	77
[2801]	78
[1562]	79	// The filestem of the collection's phindex directory
	80	char collection[FILENAME_MAX];
	81
	82
	83	// The ranges of the stopword and content-word symbols for the collection
	84	symbol firstStopSymbol = 0;
	85	symbol lastStopSymbol = 0;
	86	symbol firstContentSymbol = 0;
	87	symbol lastContentSymbol = 0;
	88
	89
[2801]	90	// Some useful comparison functions, defined below.
	91	int suffixCompare(const void , const void );
	92	int prefixCompare(const void , const void );
	93	int pointerCompare(const void , const void );
[1562]	94
[2807]	95	// some bit manipulation functions for the check arrays, defined below
	96	int getSuffixCheck(cellindex suff);
	97	void setSuffixCheck(cellindex suff);
	98
[2801]	99	// Functions for implementing "phrase memory". These let us "remember"
	100	// each phrase that we've expanded without using too much memory.
[1562]	101	void initialisePhraseMemory();
	102	void rememberThisPhrase(cellindex index, cellcount length);
	103	bool isPhraseStored(cellindex index, cellcount length);
	104	void deletePhraseMemory();
	105
	106
[1618]	107	// how much output do we want?
	108	int verbosity = 1;
[1562]	109
[1618]	110
[2801]	111	// Get a phrase's expansions
[1562]	112	//
[2801]	113	// Get the set of "minimal", "maximal", non-unique expansions of a
	114	// phrase p, using the simpler algorithm that Craig and Gordon came up
	115	// with at Google.
[1562]	116	//
	117	// Returns a vector of Expansions.
	118
[2801]	119	void getExpansions(Phrase &p, vector<Phrase> &results) {
[1562]	120
[2801]	121	// 1. Get the initial candidates
[1562]	122	vector<Phrase> candidates;
	123	p.initialSuffixCandidates(candidates);
[2801]	124	int suffcands = candidates.size();
[1562]	125	p.initialPrefixCandidates(candidates);
	126
[2801]	127	if (candidates.size() == 0)
	128	return;
[1562]	129
[2801]	130	vector<Phrase>::iterator i;
	131	for (i = candidates.begin(); i != candidates.end(), suffcands>0; ++i, --suffcands) {
	132	i->expandWhileUniquePrefixExtension();
	133	i->ensureSuffixFound();
	134	}
	135	for (i; i != candidates.end(); ++i) {
	136	i->expandWhileUniqueSuffixExtension();
	137	}
[1562]	138
[2801]	139	// 3. Ensure minimality: ignore phrases whose subphrases are also found
	140	results.clear();
[1562]	141
[2801]	142	// Initialise the candidates, check array, and various variables.
	143	sort(candidates.begin(), candidates.end(), isShorter);
[2807]	144	memset(suffixCheck, 0, sizeof(check)*checkLength);
[2801]	145	unsigned minimum_length = candidates.begin()->length;
	146
	147	// Try to add each candidate to the results set, ignoring the non-minimal
	148	for (vector<Phrase>::iterator candidate = candidates.begin();
	149	candidate != candidates.end(); candidate++) {
[1562]	150
[2801]	151	// Make a copy of candidate to mutilate while performing sub-phrase checks
	152	Phrase temp_phrase(*candidate);
	153	bool shorter_found = false;
	154
	155	// Check for shorter and shorter versions of the tenporary phrase
	156	while (temp_phrase.length >= minimum_length && !shorter_found) {
	157	temp_phrase.ensureSuffixFound();
[2807]	158	//if (suffixCheck[temp_phrase.firstSuffixIndex] == 0)
	159	if (getSuffixCheck(temp_phrase.firstSuffixIndex)==0)
[2801]	160	temp_phrase.shortenByOneAtPrefix();
	161	else
	162	shorter_found = true;
[1562]	163
[2801]	164	// Possible efficiency here: we can finish if the prefix of c
	165	// and temp_phrase are the same for candidate->length symbols.
[1562]	166	}
	167
[2801]	168	if (!shorter_found) {
	169	results.push_back(*candidate);
	170	candidate->ensureSuffixFound();
	171	for (cellcount k = candidate->firstSuffixIndex; k <= candidate->lastSuffixIndex; ++k)
[2807]	172	//suffixCheck[k] = candidate->length;
	173	setSuffixCheck(k);
[1562]	174	}
	175	}
	176	}
	177
	178
	179	// suffixCompare
	180	//
	181	// Compare two pointers into a suffix array. We use this in the
	182	// qsort function, so the input are pointers to pointers.
	183	//
	184	// Return -1 if (a < b), otherwise (a > b) so return +1,
	185
	186	int suffixCompare(const void cpa, const void cpb) {
	187
	188	// Cast then dereference pointers to suffix array elements
	189	symbol pa = (symbol ) cpa;
	190	symbol pb = (symbol ) cpb;
	191	pa = (symbol ) pa;
	192	pb = (symbol ) pb;
	193
	194	// If the two elements are the same, examine the next one
	195	while (pa == pb) {
	196	*pa++;
	197	*pb++;
	198	}
	199
	200	// Make the copmparison and return
	201	if ( pa < pb) {
	202	return -1;
	203	} else {
	204	return +1;
	205	}
	206	}
	207
	208
	209	// prefixCompare
	210	//
	211	// Compare two pointers into a prefix array. We use this in the
	212	// qsort function, so the input are pointers to pointers.
	213	//
	214	// Return -1 if (a > b), otherwise (a < b) so return +1,
	215
	216	int prefixCompare(const void cpa, const void cpb) {
	217
	218	// Cast then dereference pointers to prefix array elements
	219	symbol pa = (symbol ) cpa;
	220	symbol pb = (symbol ) cpb;
	221	pa = (symbol ) pa;
	222	pb = (symbol ) pb;
	223
	224	// If the two elements are the same, examine the next one
	225	while (pa == pb) {
	226	*pa--;
	227	*pb--;
	228	}
	229
	230	// Make the copmparison and return
	231	if ( pa > pb) {
	232	return -1;
	233	} else {
	234	return +1;
	235	}
	236	}
	237
	238	// simpleCompare
	239	//
	240	// Compare two pointers based on the memory location they point to.
	241
	242	int pointerCompare( const void pa, const void pb ) {
	243
	244	symbol a = (symbol ) pa;
	245	symbol b = (symbol ) pb;
	246
	247	if (a < b) {
	248	return -1;
	249	} else if (a > b) {
	250	return 1;
	251	} else {
	252	return 0;
	253	}
	254	}
	255
[2807]	256	int getSuffixCheck(cellindex suff) {
	257	cellindex cell = suff >> 3;
	258	check remainder = suff & 0x07; // the last 3 bits
	259	if (suffixCheck[cell]& (1 << remainder)) {
	260	return 1;
	261	}
	262	return 0;
	263	}
	264	void setSuffixCheck(cellindex suff) {
	265	cellindex cell = suff >> 3;
	266	check remainder = suff & 0x07; // the last 3 bits
	267	suffixCheck[cell] \|= (1 << remainder);
[1562]	268
[2807]	269	}
	270
[1882]	271	// Read the clauses.numbers file into the "symbols" array.
[1562]	272	//
[1882]	273	// Each number in the file is a symbol number; it is essential that
	274	// the first symbol (and no others) be COLLECTIONSTART and the last
	275	// symbol (and no others) be COLLECTIONEND.
[1562]	276	//
	277	// Return the number of numbers in the array.
	278
[1882]	279	int readNumbers() {
[1562]	280
	281	char filename[FILENAME_MAX];
	282	sprintf(filename, "%s/clauses.numbers", collection);
[1618]	283	if (verbosity) {
[1882]	284	cout << "Reading numbers file: " << filename << endl;
[1618]	285	}
[1562]	286
	287	// Open the numbers file
[2498]	288	ifstream inFile1(filename, ios::in);
	289	if (!inFile1) {
[1562]	290	cerr << "File " << filename << " could not be opened\n";
	291	exit(1);
	292	}
	293
[1882]	294	// Count the number of symbols
	295	inputLength = 0;
	296	symbol word;
[2498]	297	while (inFile1 >> word) {
[1882]	298	inputLength++;
	299	}
[2498]	300	inFile1.close();
[1882]	301
	302	// Allocate the symbbols array
	303	if (verbosity > 1) {
	304	cout << "Allocating symbol arrays for " << inputLength << " symbols" << endl;
	305	}
[2487]	306	symbols = new symbol[inputLength];
[1882]	307	if (symbols == NULL) {
	308	cerr << "Suffix error: not enough memory to hold " << inputLength
	309	<< " symbols." << endl;
	310	exit(2);
	311	}
	312
[1562]	313	// Read the numbers file into the numbers array
[1882]	314	if (verbosity > 2) {
	315	cout << "Reading the numbers" << endl;
	316	}
[2498]	317
	318	ifstream inFile2(filename, ios::in);
	319	if (!inFile2) {
	320	cerr << "File " << filename << " could not be opened\n";
	321	exit(1);
	322	}
[1882]	323	cellcount next = 0;
[1562]	324	numberOfDocuments = 0;
[2498]	325	while (inFile2 >> word) {
[1882]	326	symbols[next++] = word;
[1562]	327	if (word == DOCUMENTSTART) {
	328	numberOfDocuments++;
	329	}
	330	}
[2498]	331	inFile2.close();
[1562]	332
	333	// Make sure the numbers file is intact
[1882]	334	assert(symbols[0] == COLLECTIONSTART);
	335	assert(symbols[next-1] == COLLECTIONEND);
[1562]	336
[1882]	337	return inputLength;
[1562]	338	}
	339
	340
	341
	342	// Get Document Occurrance statistics
	343	//
	344	// Given a phrase, what documents does it occur in?
	345
[2801]	346	cellcount getDocumentOccurrances(const Phrase &p, cellcount *frequency) {
[1562]	347
[2801]	348	// cout << "searching for \""<< p << "\" in documents "
[1562]	349	// << 0 << "-" << numberOfDocuments - 1 << endl;
	350
	351	// The number of documents in which this phrase occurs
	352	cellcount df = 0;
	353
	354	// Initialise the document frequency array
[2806]	355	// for (cellindex i = 0; i < numberOfDocuments; i++) {
	356	// frequency[i] = 0;
	357	//}
	358	memset(frequency, 0, sizeof(cellcount)*numberOfDocuments);
[1562]	359
	360	// variables used to facilitate the search
	361	cellindex begin;
	362	cellindex end;
	363	cellindex d;
	364	symbol *target;
	365	bool found;
	366
	367	// search for the document in which each occurence of the phrase is found
[2487]	368	for (cellcount j = p.firstSuffixIndex; j <= p.lastSuffixIndex; j++) {
[1562]	369
[2487]	370	// cout << "looking for phrase at suffixArray[" << j << "]\n";
[1562]	371
[2487]	372	target = suffixArray[j];
[1562]	373	begin = 0;
	374	end = numberOfDocuments - 1;
	375	found = false;
	376
	377	// Search for the occurence of a document delimiter that target
	378	// occurs immediately after.
	379	// We do this by performing a binary chop search on documentArray.
	380	while (!found) {
	381
	382	// cout << "searching for " << (cellindex) target << " in "
	383	// << begin << " - " << end << endl;
	384
	385	assert (begin <= end);
	386
	387	// If the beginning and end of the interval are the same,
[1618]	388	// then we've found the correct document
[1562]	389	if (begin == end) {
	390	if (frequency[begin] == 0) {
	391	df++;
	392	}
	393	frequency[begin]++;
	394	found = true;
	395	}
	396
	397	// Otherwise, examine a new document midway through the begin-end
	398	// interval and see if it is the one.
	399	else {
	400	d = (begin + end) / 2;
	401	if (target > documentArray[d]) {
	402	// If target addrss is greater than this, but thisi sthe last document,
	403	// then this must be the one we want. Or, if target is greater than
	404	// this one but less then the next, this must be the one we wnat.
	405	if ((d == numberOfDocuments - 1) \|\| (target < documentArray[d+1])) {
	406	if (frequency[d] == 0) {
	407	df++;
	408	}
	409	frequency[d]++;
	410	found = true;
	411	} else {
	412	// otherwise we know to search later in the document set
	413	begin = d + 1;
	414	}
	415	} else {
	416	// search earlier in the document set
	417	end = d - 1;
	418	}
	419	}
	420	}
	421	}
	422	return df;
	423	}
	424
	425
	426
	427
	428
	429
	430	// phraseExpansionMemory : Which phrases have we expanded?
	431	//
	432	// A set of utilities for keeping track of which phrases we have expanded.
	433	// We don't want to expand a phrase more than once, after all.
	434	//
	435	// This REALLY ought to be in its own class, but it works so that's okay.
	436	//
	437	// Phrases are identified by their firstSuffixPosition and length.
	438	//
	439	// Functions provided are:
	440	// void initialisePhraseMemory()
	441	// void rememberThisPhrase(index, length)
	442	// bool isPhraseStored(index, length)
	443	// void deletePhraseMemory()
	444	//
	445	// Internally, we will have two separate cases:
	446	//
	447	// Phrases of length 1-8:
	448	// unsigned char phraseMemory[inputLength]
	449	// is an array where each cell "remembers" the corresponding index in the
	450	// suffixArray, and each of the 8 bits of the cell correspond to the phrases
	451	// of length 1, 2... 8.
	452	// Eventually, we will make this disk-based (i.e. store the array in a file).
	453	//
	454	// Phrases of length 9+:
	455	// file hashTableFile
	456	// file listOfEntries
	457	// The first file is a hash table; each phrase maps to one of its cells, which
	458	// contains either 0 (empty, no occurence) or a number which is an entry number
	459	// in the second file. This file contains a "list" of entries. Each consists of
	460	// three numbers: the suffixArray index of the phrase, the length of the phrase,
	461	// and the entry number of the next phrase with the same hash.
	462	//
	463
	464
	465	unsigned char *phraseMemory;
	466
	467	void initialiseLongPhraseMemory();
	468	void rememberThisLongPhrase(cellindex index, cellcount length);
	469	bool isLongPhraseStored(cellindex index, cellcount length);
	470	void deleteLongPhraseMemory();
	471
	472
	473	void initialisePhraseMemory() {
	474
[2487]	475	phraseMemory = new unsigned char[inputLength];
[1562]	476
	477	// to begin with, everything is empty
[2806]	478	// for (cellcount i = 0; i < inputLength; i++) {
	479	// phraseMemory[i] = 0;
	480	//}
	481	memset(phraseMemory, 0, sizeof(char)*inputLength);
	482
[1562]	483	// intialise the hashTable of long phrases
	484	initialiseLongPhraseMemory();
	485
	486	}
	487
	488	void rememberThisPhrase(cellindex index, cellcount length) {
	489
	490	// if the phrase is very long, use the file-based system
	491	if (length > 8) {
	492	rememberThisLongPhrase(index, length);
	493	return;
	494	}
	495
	496	// create a char with just the bit corresponding to length set
	497	unsigned char newbit = 1;
	498	for (cellcount i = 1; i < length; i++) {
	499	newbit <<= 1;
	500	}
	501
	502	// set that bit in the memory array at position index
	503	phraseMemory[index] \|= newbit;
	504	}
	505
	506
	507	bool isPhraseStored(cellindex index, cellcount length) {
	508
	509	// if the phrase is very long, use the file-based system
	510	if (length > 8) {
	511	return isLongPhraseStored(index, length);
	512	}
	513
	514	// create a char with just the bit corresponding to length set
	515	unsigned char newbit = 1;
	516	for (cellcount i = 1; i < length; i++) {
	517	newbit <<= 1;
	518	}
	519
	520	// retrurn true if that bit is set in memory arrayat position index
	521	return (phraseMemory[index] & newbit);
	522	}
	523
	524	void deletePhraseMemory() {
	525	delete phraseMemory;
	526	deleteLongPhraseMemory();
	527	}
	528
	529
	530
	531	// Files etc used to store "long" equavlents of the above
	532
	533	fstream hashTableFile;
	534	char hashTableFileName[FILENAME_MAX];
	535	fstream listOfEntries;
	536	char listOfEntriesName[FILENAME_MAX];
	537	cellindex nextEntryNumber;
	538
	539	const cellcount bigPrime = 7919;
	540
	541
	542	void initialiseLongPhraseMemory() {
	543
	544	cellindex example = 0;
	545
	546	sprintf(hashTableFileName, "%s/hashTable", collection);
	547	sprintf(listOfEntriesName, "%s/hashLists", collection);
	548
	549
	550	// create the new hashtable
[1618]	551	if (verbosity > 1) {
	552	cout << "Initialising hashTable: " << hashTableFileName << endl;
	553	}
[1562]	554	hashTableFile.open(hashTableFileName, ios::in \| ios::out);
	555	for (cellcount i = 0; i < bigPrime; i++) {
	556	hashTableFile.write((char *) &example, sizeof(example));
	557	}
	558
	559	// create the list of phrases
[1618]	560	if (verbosity > 1) {
	561	cout << "Initialising list of hashtable entries: " << listOfEntriesName << endl;
	562	}
[1562]	563	listOfEntries.open(listOfEntriesName, ios::in \| ios::out);
	564	listOfEntries.write((char *) &example, sizeof(example));
	565	listOfEntries.write((char *) &example, sizeof(example));
	566	listOfEntries.write((char *) &example, sizeof(example));
	567	nextEntryNumber = 1;
	568	}
	569
	570
	571	void rememberThisLongPhrase(cellindex index, cellcount length) {
	572
	573	// cout << "rememberThisLongPhrase(" << index << ", " << length << ")\n";
	574
	575	cellindex hashOffset = ((index + length) % bigPrime) * sizeof(cellindex);
	576	cellindex pointer;
	577	cellindex zero = 0;
	578	cellindex readp = 0;
	579	cellindex readi = 0;
	580	cellindex readl = 0;
	581
	582	hashTableFile.seekg(hashOffset);
	583	hashTableFile.read((char *) &pointer, sizeof(cellindex));
	584
	585	if (pointer == 0) {
	586	// There is no entry at all in the hash table for this entry
	587	// so create one
	588
	589	pointer = nextEntryNumber++;
	590	hashTableFile.seekg(hashOffset);
	591	hashTableFile.write((char *) &pointer, sizeof(cellindex));
	592
	593	listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
	594	listOfEntries.write((char *) &zero, sizeof(cellindex));
	595	listOfEntries.write((char *) &index, sizeof(cellindex));
	596	listOfEntries.write((char *) &length, sizeof(cellindex));
	597
	598	} else {
	599	// There is a list starting at this hash value, so the phrase may
	600	// be already remembered, or it might need to be appended
	601
	602	while (pointer != 0) {
	603	// Read the entry pointed to by pointer
	604	listOfEntries.seekg(pointer * sizeof(cellindex) * 3);
	605	listOfEntries.read((char *) &readp, sizeof(cellindex));
	606	listOfEntries.read((char *) &readi, sizeof(cellindex));
	607	listOfEntries.read((char *) &readl, sizeof(cellindex));
	608
	609	// cout << "read " << pointer << ", " << readp << ", " << readi << ", " << readl << endl;
	610
	611	if ((readi == index) && (readl = length)) {
	612	// we've found that we've already stored it
	613	return;
	614	} else if (readp == 0) {
	615	// we're reached the end of the list. Add a new entry.
	616	listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
	617	listOfEntries.write((char *) &nextEntryNumber, sizeof(cellindex));
	618	pointer = nextEntryNumber++;
	619
	620	listOfEntries.seekp(pointer * sizeof(cellindex) * 3);
	621	listOfEntries.write((char *) &zero, sizeof(cellindex));
	622	listOfEntries.write((char *) &index, sizeof(cellindex));
	623	listOfEntries.write((char *) &length, sizeof(cellindex));
	624	return;
	625	} else {
	626	// go on to the next node
	627	pointer = readp;
	628	}
	629	}
	630	}
	631
	632
	633	}
	634
[2801]	635
[1562]	636	bool isLongPhraseStored(cellindex index, cellcount length) {
	637
	638	// cout << "isLongPhraseExpanded(" << index << ", " << length << ")\n";
	639
	640	cellindex hashOffset = ((index + length) % bigPrime) * sizeof(cellindex);
	641	cellindex pointer;
	642	cellindex readp = 0;
	643	cellindex readi = 0;
	644	cellindex readl = 0;
	645
	646	// Find the phrase in the hashFile
	647	hashTableFile.seekg(hashOffset);
	648	hashTableFile.read((char *) &pointer, sizeof(cellindex));
	649
	650	if (pointer == 0) {
	651	// There is no entry at all in the hash table for this entry
	652	// so nothing is stored
	653	return false;
	654
	655	} else {
	656	// There is a list starting at this hash value, so the phrase may
	657	// be already remembered in that list
	658	while (pointer != 0) {
	659	// Read the entry pointed to by pointer
	660	listOfEntries.seekg(pointer * sizeof(cellindex) * 3);
	661	listOfEntries.read((char *) &readp, sizeof(cellindex));
	662	listOfEntries.read((char *) &readi, sizeof(cellindex));
	663	listOfEntries.read((char *) &readl, sizeof(cellindex));
	664
	665	if ((readi == index) && (readl = length)) {
	666	// we've found the phrase stored here
	667	return true;
	668	} else {
	669	// go on to the next node
	670	pointer = readp;
	671	}
	672	}
	673	}
	674	return false;
	675	}
	676
[2801]	677
[1562]	678	void deleteLongPhraseMemory() {
	679	// remove the hash & other files
	680
	681	hashTableFile.close();
	682	listOfEntries.close();
	683	remove(hashTableFileName);
	684	remove(listOfEntriesName);
	685
	686	}
	687
	688
	689	// Read the collection statistics file
[2801]	690	//
[1562]	691	void readStatistics() {
	692
	693	// open the statistics file
	694	char filename[FILENAME_MAX];
	695	sprintf(filename, "%s/clauses.stats", collection);
	696
	697	// Open the file
	698	ifstream inFile(filename, ios::in);
	699	if (!inFile) {
	700	cerr << "File " << filename << " could not be opened\n";
	701	exit(1);
	702	}
	703
	704	// Read the numbers file into the numbers array
	705	char key[1000];
	706	symbol value;
	707	while (inFile >> key >> value){
	708	if (strcmp(key, "first_stopword") == 0) {
	709	firstStopSymbol = value;
	710	} else if (strcmp(key, "last_stopword") == 0) {
	711	lastStopSymbol = value;
	712	} else if (strcmp(key, "first_contentword") == 0) {
	713	firstContentSymbol = value;
	714	} else if (strcmp(key, "last_contentword") == 0) {
	715	lastContentSymbol = value;
	716	}
	717	}
	718	inFile.close();
	719
	720	// Make sure we have the information we need
	721	if (!(firstStopSymbol && lastStopSymbol && firstContentSymbol && lastContentSymbol)) {
	722	cerr << "Statistics file incomplete" << endl;
	723	exit(1);
	724	}
	725	}
	726
[2801]	727	cellcount getContentCount(symbol firstContent) {
[1562]	728
[2801]	729	cellcount content=0;
	730	for (cellcount i=0; i<inputLength; i++) {
	731	if (symbols[i]>=firstContent) content++;
	732	}
[1562]	733
[2801]	734	return content;
	735	}
[1562]	736
[2801]	737	int main (int argc, char * argv[]) {
[1562]	738
[2801]	739	// Command-line arguments
	740	// argv[1] is the phindex directory
	741	// argv[2] is the maximum array symbol length (optional)
	742	// argv[3] is the mode, where 1 is stopword mode (optional)
	743	if (argc < 2) {
	744	cerr << "Usage: " << argv[0] << " phind-directory mode [verbosity]" << endl;
	745	exit(1);
	746	}
	747
	748	// collection directory
	749	strcpy(collection, argv[1]);
	750
	751	// mode parameter
	752	phraseMode = atoi(argv[2]);
	753	assert((phraseMode == STOPWORDS) \|\| (phraseMode == ANYPHRASE));
	754
	755	// optional verbosity parameter
	756	if (argc == 4) {
	757	verbosity = atoi(argv[3]);
	758	assert (verbosity >= 0);
	759	}
	760
	761	if (verbosity) {
	762	cout << "suffix: the phrase extraction program" << endl;
	763	}
	764
	765	if (verbosity > 1) {
	766	if (phraseMode == STOPWORDS) {
	767	cout << "Stopwords mode: no phrase may begin or end with a stopword" << endl;
	768	} else {
	769	cout << "AllPhrase mode: extract every phrase that occurs more than once" << endl;
	770	}
	771	}
	772
	773	// Read the statistics file
	774	readStatistics();
	775
	776	// Read the numbers file
	777	readNumbers();
	778
	779	if (numberOfDocuments == 0) {
	780	cerr << "There are no documents in this collection!" << endl;
	781	exit(1);
	782	}
	783
	784	symbol firstContent;
	785	if (phraseMode==STOPWORDS) firstContent=firstContentSymbol;
	786	else firstContent = firstStopSymbol;
	787
	788	cellcount contentLength = 0;
	789	contentLength = getContentCount(firstContent);
	790
	791	// Create the suffix & prefix arrays
	792	suffixArray = new symbol *[contentLength];
	793	prefixArray = new symbol *[contentLength];
	794
	795	cellcount here=0;
	796	// Initialise prefix and suffix arrays, only use the needed suffixes
	797	for (cellcount j = 0; j < inputLength; j++) {
	798	if (symbols[j]>=firstContent) {
	799	suffixArray[here] = &symbols[j];
	800	prefixArray[here] = &symbols[j];
	801	here++;
	802	}
	803	}
	804	qsort(suffixArray, contentLength, sizeof(symbol *), suffixCompare);
	805	qsort(prefixArray, contentLength, sizeof(symbol *), prefixCompare);
	806
[2807]	807	checkLength = contentLength/8 + 1;
	808	suffixCheck = new check[checkLength];
[2801]	809	if (suffixCheck == NULL) {
	810	cerr << "Suffix error: not enough memory to hold " << inputLength << " symbols." << endl;
	811	exit(2);
	812	}
[2807]	813	memset(suffixCheck, 0, sizeof(check)*checkLength);
[2801]	814
	815	cout <<"\ngenerating the phrase hierarchy\n\n";
	816
	817	// Create the document arrays
	818	if (verbosity > 1) {
	819	cout << "Allocating document arrays for " << numberOfDocuments << " documents" << endl;
	820	}
	821
	822	// The document frequecy array is used to count the number of times
	823	// each phrase occurs in each document. The number of documents in
	824	// which a phrase occurs is stored in df.
	825	frequency *documentFrequency = new frequency[numberOfDocuments];
	826	frequency df;
	827
	828	// documentArray will be searched in order to discover which document
	829	// each phrase occurs in.
	830	documentArray = new symbol *[numberOfDocuments];
	831
	832	// just scan through the input text to find the doc starts
	833	cellindex d = 0;
	834	for (cellindex i=0; i<inputLength; i++) {
	835	if (symbols[i] == DOCUMENTSTART) {
	836	documentArray[d] = &symbols[i];
	837	d++;
	838	}
	839	}
	840
	841	// the phrases stuff is expecting inputLength to be the length of the
	842	// suffix array, so change it.
	843	inputLength = contentLength;
	844
	845	// Extract phrases
	846	//
	847	// We will make several passesover the data, in each case considering
	848	// a set of input phrases and generating a set of output phrases, which
	849	// we will expancd in later passes.
	850	//
	851	// The input phrases in the first pass will be the vocabulary.
	852	// In later passes, the input phrases will be the output phrases of the
	853	// previous pass.
	854	//
	855	// In each pass we will consider each input phrase in turn. If we
	856	// have seen it before, we will ignore it. Otherwise, we will expand
	857	// it and add its expansions to the set of output phrases.
	858
	859	// Store the phrase data in the phrases file
	860	char phraseDataName[FILENAME_MAX];
	861	sprintf(phraseDataName, "%s/phrases", collection);
	862	ofstream phraseData(phraseDataName, ios::out);
	863	if (!phraseData) {
	864	cout << "File " << phraseDataName << " could not be opened\n";
	865	exit(1);
	866	}
	867
	868	// Count the number of phrases output
	869	unsigned long int phraseCounter = 0;
	870
	871	// Set up the phrase expansion memory.
	872	// We need this so that we don't expand a phrase more than once
	873	initialisePhraseMemory();
	874
	875	// The current pass numebr
	876	int phrasePass = 1;
	877
	878
	879	// PASS NUMBER 1
	880	if (verbosity > 1) {
	881	cout << "Starting pass " << phrasePass << endl;
	882	}
	883
	884	ofstream outPhrase;
	885	char outPhraseName[FILENAME_MAX];
	886	unsigned long int outPhraseCounter = 0;
	887
	888	// On the first pass, simply work through the vocabulary
	889	sprintf(outPhraseName, "%s/outPhrase.1", collection);
	890	outPhrase.open(outPhraseName, ios::out);
	891	if (!outPhrase) {
	892	cerr << "File " << outPhraseName << " could not be opened\n";
	893	exit(1);
	894	}
	895
	896	// Iterate over the different symbols by working through the suffix array
	897	vector<Phrase> result;
	898	cellindex ij = 0;
	899	char *tmpString;
	900
	901	Phrase p;
	902	while (ij < inputLength) {
	903
	904	// make a new phrase of length 1
	905	p = Phrase(suffixArray[ij], 1, SUFFIX);
	906	p.findFirstAndLastSuffix(ij, inputLength-1);
	907
	908	// We ignore this symbol if it occurs only once, if it is a delimiter,
	909	// of if we are in stopwords mode and it is a stopword
	910	// - in this new version, only need to check freq
	911	// We could imagine a new mode/command-line option, which is like
	912	// STOPWORDS but without this restrictrion. This would let you browse
	913	// from "the" to "the AGRIS" for example, but not from "AGRIS" to
	914	// "the AGRIS" (where the is a stopword and AGRIS a content word).
	915	// The system used to work like this; it is easy to implement, but
	916	// it explodes the size of the indexes. So: would it be useful?
	917	if (p.suffixFrequency > 1) {
	918	// Get minimal expansions of the phrase
	919	getExpansions(p, result);
	920
	921	if (!result.empty()) {
	922
	923	// Remember that we have expanded this phrase
	924	rememberThisPhrase(ij, 1);
	925
	926	// write the phrase text
	927	phraseData << ij << "-1:" << p << ":" << p.suffixFrequency << ":"
	928	<< result.size() << ":";
	929
	930	// write the results
	931	for (cellcount k = 0; k < result.size(); k++) {
	932	if (k) {
	933	phraseData << ",";
	934	}
	935	phraseData << result[k].firstSuffixIndex << "-" << result[k].length;
	936	outPhrase << result[k].firstSuffixIndex << " " << result[k].length << endl;
	937	outPhraseCounter++;
	938	}
	939	result.clear();
	940
	941	// Write the documents in which this phrase occurs
	942	df = getDocumentOccurrances(p, documentFrequency);
	943	phraseData << ":" << df << ":";
	944
	945	// write the documents
	946	for (cellcount m = 0, first = 1; m < numberOfDocuments; m++) {
	947	if (documentFrequency[m]) {
	948	if (first) {
	949	first = 0;
	950	} else {
	951	phraseData << ";";
	952	}
	953	// Output the document number. Note that here we've numbered the
	954	// N documents from 0 to N-1, but later they'll be 1-N. Thus we
	955	// add 1 to the document id when we output it.
	956	phraseData << "d" << (m+1);
	957	// Next, output the frequency with which the document occurs, but
	958	// only if it is > 1.
	959	if (documentFrequency[m] > 1) {
	960	phraseData << "," << documentFrequency[m];
	961	}
	962	}
	963	}
	964
	965	phraseData << endl;
	966	phraseCounter++;
	967
	968	// feedback
	969	if (verbosity) {
	970	if (phraseCounter % 1000 == 0) {
	971	cout << "phrase " << phraseCounter << ": "
	972	<< "cell " << p.firstSuffixIndex << " - " << p << endl;
	973	}
	974	}
	975	}
	976	}
	977	ij = p.lastSuffixIndex + 1;
	978	}
	979	outPhrase.close();
	980
	981	// REMAINING PASSES
	982	// The previous outPhrase file forms the input to each new pass
	983	cellcount start, length;
	984	while (outPhraseCounter > 0) {
	985
	986	// Start a new pass
	987	phrasePass++;
	988	if (verbosity) {
	989	cout << "Starting pass " << phrasePass << endl;
	990	}
	991
	992	// Open the input file
	993	char inPhraseName[FILENAME_MAX];
	994	sprintf(inPhraseName, "%s/outPhrase.%d", collection, phrasePass - 1);
	995	ifstream inPhrase (inPhraseName, ios::in);
	996	if (!inPhrase) {
	997	cerr << "File " << inPhraseName << " could not be opened\n";
	998	exit(1);
	999	}
	1000
	1001	// Open the output file
	1002	sprintf(outPhraseName, "%s/outPhrase.%d", collection, phrasePass);
	1003	outPhrase.open(outPhraseName, ios::out);
	1004	if (!outPhrase) {
	1005	cerr << "File " << outPhraseName << " could not be opened\n";
	1006	exit(1);
	1007	}
	1008	outPhraseCounter = 0;
	1009
	1010	// Process each phrase
	1011	while(inPhrase >> start >> length) {
	1012
	1013	// Ignore the phrase if we have expanded it before
	1014	if (isPhraseStored(start, length))
	1015	continue;
	1016
	1017	// Remember that we have examined this phrase
	1018	rememberThisPhrase(start, length);
	1019
	1020	// Find the phrase in the suffixarray
	1021	p = Phrase(suffixArray[start], length, SUFFIX);
	1022	p.findFirstAndLastSuffix(start, inputLength-1);
	1023
	1024	// Ignore the phrase if it only occurs once
	1025	if (p.suffixFrequency < 2)
	1026	continue;
	1027
	1028	// Write the phrase text;
	1029	phraseData << start << "-" << length << ":" << p << ":" << p.suffixFrequency << ":";
	1030
	1031	// Expand the phrase, if it is fewer than 8 words long
	1032	if (length <= 8) {
	1033
	1034	// Get the minimal expansions for this phrase
	1035	getExpansions(p, result);
	1036
	1037	// write the results
	1038	phraseData << result.size() << ":";
	1039
	1040	for (cellcount i = 0; i < result.size(); i++) {
	1041	if (i) {
	1042	phraseData << ",";
	1043	}
	1044	phraseData << result[i].firstSuffixIndex << "-" << result[i].length;
	1045	outPhrase << result[i].firstSuffixIndex << " " << result[i].length << endl;
	1046	outPhraseCounter++;
	1047	}
	1048	result.clear();
	1049
	1050	} else {
	1051	// phrase is too long to expand further
	1052	phraseData << "0:";
	1053	}
	1054
	1055	// Write the documents in which this phrase occurs
	1056	df = getDocumentOccurrances(p, documentFrequency);
	1057	phraseData << ":" << df << ":";
	1058
	1059	// write the documents
	1060	for (cellcount i = 0, first = 1; i < numberOfDocuments; i++) {
	1061	if (documentFrequency[i]) {
	1062	if (first) {
	1063	first = 0;
	1064	} else {
	1065	phraseData << ";";
	1066	}
	1067	// Output the document number. Note that here we've numbered the
	1068	// N documents from 0 to N-1, but later they'll be 1-N. Thus we
	1069	// add 1 to the document id when we output it.
	1070	phraseData << "d" << (i+1);
	1071	// Next, output the frequency with which the document occurs, but
	1072	// only if it is > 1.
	1073	if (documentFrequency[i] > 1) {
	1074	phraseData << "," << documentFrequency[i];
	1075	}
	1076	}
	1077	}
	1078
	1079	phraseData << endl;
	1080	phraseCounter++;
	1081
	1082	// feedback
	1083	if (verbosity) {
	1084	if (phraseCounter % 1000 == 0) {
	1085	cout << "phrase " << phraseCounter << ": "<< "start " << start
	1086	<< ", length " << length << " - " << p << endl;
	1087	}
	1088	}
	1089
	1090	}
	1091
	1092	inPhrase.close();
	1093	outPhrase.close();
	1094	}
	1095
	1096	phraseData.close();
	1097	deletePhraseMemory();
	1098
	1099	delete [] documentFrequency;
	1100	delete [] symbols;
	1101	delete [] suffixArray;
	1102	delete [] prefixArray;
	1103	delete [] suffixCheck;
	1104	delete [] documentArray;
	1105
	1106
	1107
	1108	cout << endl << "Done: " << phraseCounter << " phrases in " << phraseDataName << endl;
	1109	return 0;
	1110	}
	1111
	1112
	1113
	1114
	1115

Note: See TracBrowser for help on using the repository browser.

Download in other formats: