Context Navigation

source: trunk/gsdl/src/mgpp/text/ivf.pass2.cpp@ 2468

Last change on this file since 2468 was 2468, checked in by sjboddie, 23 years ago

Fiddled about with mgpp to get it compiling on Windows under VC++ 6.0. I
still can't get it to compile under VC++ 4.2 because of some weird
behaviour in STLport.

Also tidied up a little and removed some of the old log information
that was scattered about in some of the files.

Property svn:executable set to *
Property svn:keywords set to Author Date Id Revision

File size: 31.3 KB

Rev	Line
[856]	1	/**************************************************************************
	2	*
	3	* ivf.pass2.cpp -- Memory efficient pass 2 inversion
	4	* Copyright (C) 1999 Rodger McNab
	5	*
	6	* This program is free software; you can redistribute it and/or modify
	7	* it under the terms of the GNU General Public License as published by
	8	* the Free Software Foundation; either version 2 of the License, or
	9	* (at your option) any later version.
	10	*
	11	* This program is distributed in the hope that it will be useful,
	12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	14	* GNU General Public License for more details.
	15	*
	16	* You should have received a copy of the GNU General Public License
	17	* along with this program; if not, write to the Free Software
	18	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	19	*
	20	**************************************************************************/
	21
[2443]	22	#define _XOPEN_SOURCE 1
	23	#define _XOPEN_SOURCE_EXTENDED 1
[2468]	24
[856]	25	#include <stdio.h>
	26
[2468]	27	#if defined __WIN32__
	28	# include <process.h>
	29	# include <io.h>
	30	# define getpid _getpid
	31	# define unlink _unlink
	32	#else
	33	# include <unistd.h>
	34	# include "non_ansi.h"
	35	#endif
[856]	36
[2468]	37	#include "UCArray.h"
[856]	38	#include "sysfuncs.h"
	39	#include "mg_files.h"
	40	#include "invf.h"
	41	#include "mg.h"
	42	#include "build.h"
	43	#include "locallib.h"
	44	#include "bitio_m_random.h"
	45	#include "bitio_m_stdio.h"
	46	#include "bitio_m_mems.h"
	47	#include "bitio_gen.h"
	48	#include <stdio.h>
	49	#include "words.h"
	50	#include "messages.h"
	51	#include "netorder.h"
	52	#include "FIvfLevelInfo.h"
	53	#include "perf_hash.h"
	54	#include "string.h"
	55
	56	#include "longlong.h"
	57
	58	#if defined(GSDL_USE_OBJECTSPACE)
	59	# include <ospace\std\map>
	60	#elif defined(GSDL_USE_STL_H)
	61	# include <map.h>
	62	#else
	63	# include <map>
	64	#endif
	65
	66
	67	#ifdef USE_LONG_LONG
	68	#define SEEK_X seek_LL
	69	#define TELL_X tell_LL
	70	#else
	71	#define SEEK_X seek
	72	#define TELL_X tell
	73	#endif
	74
	75	#ifndef RND_BUF_SIZE
	76	#define RND_BUF_SIZE 8*1024
	77	#endif
	78
	79
	80	static unsigned long numDocs = 0; // documents processed so far (incremented once per process_ivf_2 call)
	81	static unsigned long numChunkDocs = 0; // documents processed in the current chunk (reset by ReadChunk)
	82	static unsigned long numDocsInChunk = 0; // document count decoded from the current chunk's header
	83
	84	static unsigned long numFrags = 0; // running fragment number (bumped per word or per index-level tag)
	85	static unsigned long numFragsInChunk = 0; // fragment count decoded from the current chunk's header
	86	static unsigned long chunkStartFragNum = 0; // value of numFrags at the moment the current chunk was read
	87
	88
	89
	90
	91	struct BitPtr {
	92	unsigned long start;
	93	unsigned long here;
	94	unsigned long lastFragNum;
	95	unsigned long lgB;
	96
	97	void Clear () { start = here = lastFragNum = lgB = 0; }
	98	BitPtr () { Clear(); }
	99	};
	100
	101	class WordBitPtrs {
	102	protected:
	103	unsigned long numWords;
	104	unsigned long numTags;
	105	unsigned long size;
	106	BitPtr *wordBitPtrs;
	107
	108	void CheckBufOverrun (unsigned long num) {
	109	if (wordBitPtrs[num].here > wordBitPtrs[num+1].start) {
	110	cerr << "numDocs: " << numDocs << "\n";
	111	cerr << "numChunkDocs: " << numChunkDocs << "\n";
	112	cerr << "numDocsInChunk: " << numDocsInChunk << "\n";
	113	cerr << "numFrags: " << numFrags << "\n";
	114	cerr << "numFragsInChunk: " << numFragsInChunk << "\n";
	115	cerr << "chunkStartFragNum: " << chunkStartFragNum << "\n";
	116	cerr << "num: " << num << "\n";
	117	cerr << "[num].start: " << wordBitPtrs[num].start << "\n";
	118	cerr << "[num].here: " << wordBitPtrs[num].here << "\n";
	119	cerr << "[num+1].start: " << wordBitPtrs[num+1].start << "\n";
	120	FatalError (1, "Bit buffer overrun");
	121	}
	122	}
	123
	124	public:
	125	void Clear ();
	126	WordBitPtrs () { wordBitPtrs = NULL; Clear(); }
	127	~WordBitPtrs ();
	128	void SetSize (unsigned long _numWords,
	129	unsigned long _numTags);
	130
	131	void ResetPtrs () {
	132	if (wordBitPtrs == NULL) return;
	133	unsigned long i;
	134	for (i=0; i<size; i++) wordBitPtrs[i].Clear();
	135	}
	136
	137	BitPtr &GetWordBitPtr (unsigned long wordNum)
	138	{ return wordBitPtrs[wordNum]; }
	139	unsigned long &GetWordStart (unsigned long wordNum)
	140	{ return wordBitPtrs[wordNum].start; }
	141	unsigned long &GetWordHere (unsigned long wordNum)
	142	{ return wordBitPtrs[wordNum].here; }
	143	void CheckWordBufOverrun (unsigned long wordNum)
	144	{ CheckBufOverrun (wordNum); }
	145
	146	BitPtr &GetTagBitPtr (unsigned long tagNum)
	147	{ return wordBitPtrs[tagNum + numWords]; }
	148	unsigned long &GetTagStart (unsigned long tagNum)
	149	{ return wordBitPtrs[tagNum + numWords].start; }
	150	unsigned long &GetTagHere (unsigned long tagNum)
	151	{ return wordBitPtrs[tagNum + numWords].here; }
	152	void CheckTagBufOverrun (unsigned long tagNum)
	153	{ CheckBufOverrun (tagNum + numWords); }
	154
	155	BitPtr &GetEndBitPtr ()
	156	{ return wordBitPtrs[size-1]; }
	157	unsigned long &GetEndStart ()
	158	{ return wordBitPtrs[size-1].start; }
	159	unsigned long &GetEndHere ()
	160	{ return wordBitPtrs[size-1].here; }
	161	};
	162
	163
	164	struct IP2TagInfo {
	165	bool inTag;
	166	unsigned long startFrag;
	167	unsigned long tagNum;
	168
	169	IP2TagInfo () {
	170	inTag = false;
	171	startFrag = 0;
	172	tagNum = 0;
	173	}
	174	};
	175
	176	// maps tags to tag information
	177	typedef map<UCArray, IP2TagInfo, DictLTUCArray> TagMapDict;
	178
	179
	180	// class to handle the translation of occurrence order
	181	// to dictionary order for words and tags
	182	class OccurToDictConverter {
	183	protected:
	184	unsigned long pos;
	185	unsigned long val;
	186	FILE *transFile;
	187	random_bitio_buffer rbs;
	188
	189	unsigned long wordDictSize;
	190	unsigned long tagDictSize;
	191
	192	void SeekStart ();
	193	unsigned long TranslateNum (unsigned long num);
	194
	195	public:
	196	OccurToDictConverter ();
	197	~OccurToDictConverter ();
	198
	199	void Open (char *filename, unsigned long _wordDictSize,
	200	unsigned long _tagDictSize);
	201
	202	// Close frees all allocated memory
	203	void Close ();
	204
	205	unsigned long TranslateWord (unsigned long occurNum)
	206	{ return TranslateNum (occurNum); }
	207	unsigned long TranslateTag (unsigned long occurNum)
	208	{ return TranslateNum (occurNum+wordDictSize); }
	209	};
	210
	211
	212	struct InvfStateRec {
	213	mg_ullong start;
	214	mg_ullong here;
	215	unsigned long lastFragNum;
	216	unsigned long B;
	217
	218	void Clear () {
	219	start = here = 0;
	220	lastFragNum = B = 0;
	221	}
	222	InvfStateRec () { Clear (); }
	223	};
	224
	225
	226	#define ISR_SIZE 1024
	227
	228	class InvfStateCache {
	229	protected:
	230	InvfStateRec recCache [ISR_SIZE];
	231	unsigned long startNum;
	232
	233	FILE *stateFile;
	234
	235	void ClearCache () {
	236	unsigned int i = 0;
	237	for (i=0; i<ISR_SIZE; i++) recCache[i].Clear();
	238	}
	239
	240	public:
	241	InvfStateCache ();
	242	~InvfStateCache ();
	243
	244	void Open (char *filename);
	245	void Close ();
	246
	247	// previous references to state records may be
	248	// invalidated calling GetRec
	249	InvfStateRec &GetRec (unsigned long num);
	250	};
	251
	252
	253	static invf_dict_header idh; // dictionary header, read from the dictionary file in init_ivf_2
	254	static WordBitPtrs bitPtrs; // per-word/per-tag bit positions within ivfMemBuf
	255
	256	static FILE *chunkFile = NULL; // chunk description file, opened in init_ivf_2
	257	static stdio_bitio_buffer chunkBuf; // bit reader attached to chunkFile
	258
	259	static unsigned long ivfMemBufSize = 0; // size in bytes of ivfMemBuf, read from the chunk file
	260	static char *ivfMemBuf = NULL; // in-memory inverted buffer for the current chunk
	261
	262	// word and tag dictionaries. a map is used for the tag dictionary
	263	// as it should never be very big (and the perfect hash function
	264	// sometimes has trouble with small values).
	265	static perf_hash_data *wordHashDict = NULL;
	266	static TagMapDict tagMapDict;
	267
	268	// information about all the different levels
	269	static FIvfLevel ivfLevel;
	270
	271	static OccurToDictConverter occurConvert; // occurrence-order to dictionary-order translation
	272
	273	// information about the state of the inverted file
	274	static InvfStateCache invfState;
	275
	276	static char collectFilename[512]; // copy of the filename given to init_ivf_2, reused by process_ivf_2
	277
	278
	279	void WordBitPtrs::Clear () {
	280	numWords = 0;
	281	numTags = 0;
	282	size=0;
	283	if (wordBitPtrs != NULL) delete [] wordBitPtrs;
	284	wordBitPtrs = NULL;
	285	}
	286
	287	WordBitPtrs::~WordBitPtrs () {
	288	if (wordBitPtrs != NULL) delete [] wordBitPtrs;
	289	}
	290
	291	void WordBitPtrs::SetSize (unsigned long _numWords,
	292	unsigned long _numTags){
	293	Clear();
	294	numWords = _numWords;
	295	numTags = _numTags;
	296	size = numWords + numTags + 1;
	297	wordBitPtrs = new BitPtr [size];
	298	}
	299
	300
	301	void OccurToDictConverter::SeekStart () {
	302	if (transFile == NULL) return;
	303	rbs.SEEK_X (sizeof (unsigned long) * 8);
	304	pos = 0;
	305	}
	306
	307	unsigned long OccurToDictConverter::TranslateNum (unsigned long num) {
	308	if (num < pos) SeekStart ();
	309	while (pos <= num) {
	310	if (pos < wordDictSize)
	311	val = rbs.binary_decode (wordDictSize + 1, NULL) - 1;
	312	else
	313	val = rbs.binary_decode (tagDictSize + 1, NULL) - 1;
	314	pos++;
	315	}
	316	return val;
	317	}
	318
	319	OccurToDictConverter::OccurToDictConverter () {
	320	pos = 0;
	321	val = 0;
	322	transFile = NULL;
	323	wordDictSize = 0;
	324	tagDictSize = 0;
	325	}
	326
	327	OccurToDictConverter::~OccurToDictConverter () {
	328	if (transFile != NULL) Close ();
	329	}
	330
	331	void OccurToDictConverter::Open (char *filename, unsigned long _wordDictSize,
	332	unsigned long _tagDictSize) {
	333	if (transFile != NULL) Close ();
	334
	335	wordDictSize = _wordDictSize;
	336	tagDictSize = _tagDictSize;
	337
	338	transFile = open_file (filename, INVF_CHUNK_TRANS_SUFFIX, "rb",
	339	MAGIC_CHUNK_TRANS, MG_ABORT);
	340	rbs.attachFile (transFile, RND_BUF_SIZE);
	341	SeekStart ();
	342	val = 0;
	343	}
	344
	345	void OccurToDictConverter::Close () {
	346	if (transFile == NULL) return;
	347
	348	rbs.done ();
	349	fclose (transFile);
	350	transFile = NULL;
	351	pos = 0;
	352	val = 0;
	353
	354	wordDictSize = 0;
	355	tagDictSize = 0;
	356	}
	357
	358
	359
	360
	361	InvfStateCache::InvfStateCache () {
	362	startNum = 0;
	363	stateFile = NULL;
	364	}
	365
	366	InvfStateCache::~InvfStateCache () {
	367	if (stateFile != NULL) Close ();
	368	}
	369
	370	void InvfStateCache::Open (char *filename) {
	371	if (stateFile != NULL) Close();
	372
	373	// open the state file
	374	char path[512];
	375	sprintf (path, FILE_NAME_FORMAT ".%ld", get_basepath (), filename,
[2468]	376	".invf.state", (long) getpid ());
[856]	377	if (!(stateFile = fopen (path, "wb+"))) {
	378	Message ("Unable to create \"%s\"", path);
	379	exit (1);
	380	}
	381	unlink (path); // file will be deleted after it is closed
	382	// reset the buffer
	383	startNum = 0;
	384	ClearCache();
	385	}
	386
	387	void InvfStateCache::Close () {
	388	if (stateFile == NULL) return;
	389	fclose (stateFile);
	390	stateFile = NULL;
	391	startNum = 0;
	392	}
	393
	394	InvfStateRec &InvfStateCache::GetRec (unsigned long num) {
	395	// see if cached
	396	if ((num >= startNum) && (num < startNum + ISR_SIZE))
	397	return recCache[num-startNum];
	398
	399	// not cached, write out this lot of records and read in
	400	fseek (stateFile, startNum*sizeof (InvfStateRec), SEEK_SET);
	401	fwrite ((char *) recCache, sizeof (InvfStateRec), ISR_SIZE, stateFile);
	402
	403	// read in the new set of records
	404	ClearCache ();
	405	startNum = num - (num % ISR_SIZE);
	406	fseek (stateFile, startNum*sizeof (InvfStateRec), SEEK_SET);
	407	fread ((char *) recCache, sizeof (InvfStateRec), ISR_SIZE, stateFile);
	408
	409	return recCache[num-startNum];
	410	}
	411
	412
	413
	414	static void ClearCharBuf (char *buf, unsigned long size) {
	415	char *end = buf + size;
	416	while (buf != end) *buf++ = 0;
	417	}
	418
	419	static void ReadWordDict (char *filename) {
	420	// read in the perfect hash function for the word dictionary
	421	FILE *wordHashFile = open_file (filename, INVF_DICT_HASH_SUFFIX, "rb",
	422	MAGIC_HASH, MG_ABORT);
	423	if (!(wordHashDict = read_perf_hash_data (wordHashFile))) {
	424	FatalError (1, "Unable to read in hash data for word dictionary");
	425	}
	426	fclose (wordHashFile);
	427	}
	428
	429	static void ReadTagDict (char *filename, invf_dict_header &_idh) {
	430	// open the file
	431	FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
	432	MAGIC_STEM_BUILD, MG_ABORT);
	433
	434	// seek to the start of the tag dictionary
	435	fseek (dictFile, _idh.tag_dict_start, SEEK_SET);
	436
	437	unsigned long tagNum;
	438	dict_el thisEl;
	439	for (tagNum = 0; tagNum < _idh.tag_dict_size; tagNum++) {
	440	thisEl.Read (dictFile);
	441	tagMapDict[thisEl.el].tagNum = tagNum;
	442	}
	443
	444	fclose (dictFile);
	445	}
	446
	447	static void ReadLevelFile (char *filename) {
	448	FILE *f;
	449	f = open_file (filename, INVF_LEVEL_SUFFIX, "rb",
	450	MAGIC_INVF_LEVELS, MG_ABORT);
	451	ivfLevel.Read (f);
	452	fclose (f);
	453	}
	454
	455	void CheckIntOverflow (mg_ullong totalIBits, mg_ullong lastTotalIBits) {
	456	if (totalIBits < lastTotalIBits) {
	457	fprintf(stderr, "ERROR: The totalIBits counter (%d byte unsigned integer) has overflowed.\n", sizeof (mg_ullong));
	458	if (sizeof (mg_ullong) < 8) {
	459	fprintf(stderr, " Try compiling with GCC to enable use of 8 bytes for this counter.\n");
	460	}
	461	fprintf(stderr, " Build aborted.\n");
	462	exit(1);
	463	}
	464	}
	465
	466	// assumes the inverted file state file has been opened
	467	static void InitInvfState (char *filename,
	468	invf_dict_header &_idh,
	469	InvfStateCache &_invfState,
	470	mg_ullong &totalIBits,
	471	bool wordLevelIndex) {
	472	// read in the dictionary, setting inverted state information
	473	FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
	474	MAGIC_STEM_BUILD, MG_ABORT);
	475
	476	// seek to the start of the word dictionary
	477	fseek (dictFile, _idh.word_dict_start, SEEK_SET);
	478
	479	// add the word entries
	480	word_dict_el wordEl;
	481	wordEl.SetNumLevels (_idh.num_levels);
	482	unsigned long dictWordNum, p;
	483	mg_ullong lastTotalIBits;
	484	unsigned long N = _idh.num_frags;
	485	for (dictWordNum=0; dictWordNum<_idh.word_dict_size; dictWordNum++) {
	486	// lastTotalIBits is used to detect integer overflow
	487	lastTotalIBits = totalIBits;
	488
	489	// read the next word and associated information
	490	wordEl.Read (dictFile, _idh.num_levels);
	491
	492	// update the state record
	493	p = wordEl.frag_occur;
	494	InvfStateRec &wisr = _invfState.GetRec (dictWordNum);
	495	wisr.start = totalIBits;
	496	wisr.here = totalIBits;
	497	wisr.B = BIO_Bblock_Init (N, p);
	498
	499	// add the length of the fragment numbers
	500	totalIBits += BIO_Bblock_Bound_b (N, p, wisr.B);
	501
	502	// if needed, add the length of the fragment frequency information
	503	if (!wordLevelIndex)
	504	totalIBits += BIO_Gamma_Bound (wordEl.freq, wordEl.frag_occur);
	505
	506	// align next byte
	507	#ifdef USE_LONG_LONG
	508	totalIBits = (totalIBits + 7ull) & 0xfffffffffffffff8ull;
	509	#else
	510	totalIBits = (totalIBits + 7ul) & 0xfffffff8ul;
	511	#endif
	512
	513	CheckIntOverflow (totalIBits, lastTotalIBits);
	514	}
	515
	516
	517	// seek to the start of the tag dictionary
	518	fseek (dictFile, _idh.tag_dict_start, SEEK_SET);
	519
	520	// add the tag entries
	521	dict_el tagEl;
	522	unsigned long dictTagNum;
	523	N = _idh.num_frags;
	524	for (dictTagNum=0; dictTagNum<_idh.tag_dict_size; dictTagNum++) {
	525	// lastTotalIBits is used to detect integer overflow
	526	lastTotalIBits = totalIBits;
	527
	528	// read the next tag and associated information
	529	tagEl.Read (dictFile);
	530
	531	// update the state record
	532	p = tagEl.frag_occur*2;
	533	InvfStateRec &tisr = _invfState.GetRec (dictTagNum + _idh.word_dict_size);
	534	tisr.start = totalIBits;
	535	tisr.here = totalIBits;
	536	tisr.B = BIO_Bblock_Init (N+p, p);
	537
	538	// add the length of the fragment numbers (two numbers for each
	539	// tag, one for start and one for end)
	540	totalIBits += BIO_Bblock_Bound_b (N+p, p, tisr.B);
	541
	542	// align next byte
	543	#ifdef USE_LONG_LONG
	544	totalIBits = (totalIBits + 7ull) & 0xfffffffffffffff8ull;
	545	#else
	546	totalIBits = (totalIBits + 7ul) & 0xfffffff8ul;
	547	#endif
	548
	549	CheckIntOverflow (totalIBits, lastTotalIBits);
	550	}
	551
	552	fclose (dictFile);
	553	}
	554
	555	/*
	556	// assumes the chunk tag information has been placed in .first
	557	static void PrintChunkInfo (unsigned long chunkMem,
	558	unsigned long numChunkWords,
	559	unsigned long numChunkTags) {
	560	static unsigned long chunksRead = 0;
	561	chunksRead++;
	562	cout << "Chunk Number: " << chunksRead << "\n";
	563	cout << "numChunkDocs " << numDocsInChunk << "\n";
	564	cout << "numChunkFrags " << numFragsInChunk << "\n";
	565	cout << "mem " << chunkMem << "\n";
	566	cout << "numWords " << numChunkWords << "\n";
	567	cout << "numTags " << numChunkTags << "\n\n";
	568
	569	TagMapDict::iterator tagMapHere = tagMapDict.begin();
	570	TagMapDict::iterator tagMapEnd = tagMapDict.end();
	571	while (tagMapHere != tagMapEnd) {
	572	unsigned long tagMapNum = (*tagMapHere).second.tagNum;
	573	cout << (*tagMapHere).first << " " << tagMapNum << " "
	574	<< bitPtrs.GetTagBitPtr(tagMapNum).here << "\n";
	575	tagMapHere++;
	576	}
	577	}
	578	*/
	579
	580	void ReadChunk (invf_dict_header &_idh, bool wordLevelIndex) {
	581	// reset globals
	582	numChunkDocs = 0;
	583	chunkStartFragNum = numFrags;
	584
	585	// read in information about this chunk
	586	numDocsInChunk = chunkBuf.gamma_decode (NULL) - 1;
	587	if (numDocsInChunk == 0)
	588	FatalError (1, "The number of docs in the current chunk is 0");
	589
	590	numFragsInChunk = chunkBuf.gamma_decode (NULL) - 1;
	591	unsigned long chunkMem = chunkBuf.gamma_decode (NULL) - 1;
	592
	593	if (chunkMem > ivfMemBufSize)
	594	FatalError (1, "Chunk memory size is greater than maximum");
	595
	596	unsigned long numChunkWords = chunkBuf.gamma_decode (NULL) - 1;
	597	unsigned long numChunkTags = chunkBuf.gamma_decode (NULL) - 1;
	598
	599
	600	// reset stuff
	601	ClearCharBuf (ivfMemBuf, ivfMemBufSize);
	602	bitPtrs.ResetPtrs();
	603
	604	// read in the entries in occurrence order storing the
	605	// "chunkWordCount" in "start" and the "chunkFragCount"
	606	// in "here"
	607	unsigned long numOccur;
	608	unsigned long wordNum;
	609	for (numOccur=0; numOccur<numChunkWords; numOccur++) {
	610	wordNum = occurConvert.TranslateWord (numOccur);
	611	BitPtr &wordPtr = bitPtrs.GetWordBitPtr (wordNum);
	612	wordPtr.start = chunkBuf.gamma_decode (NULL) - 1;
	613	if (wordPtr.start >= 2)
	614	wordPtr.here = chunkBuf.gamma_decode (NULL);
	615	else wordPtr.here = wordPtr.start;
	616	}
	617	unsigned long tagNum;
	618	for (numOccur=0; numOccur<numChunkTags; numOccur++) {
	619	tagNum = occurConvert.TranslateTag (numOccur);
	620	BitPtr &tagPtr = bitPtrs.GetTagBitPtr (tagNum);
	621	// only chunkFragCount is encoded for tags
	622	tagPtr.start = chunkBuf.gamma_decode (NULL) - 1;
	623	tagPtr.here = tagPtr.start;
	624	}
	625
	626	/* PrintChunkInfo (chunkMem, numChunkWords, numChunkTags);*/
	627
	628	// create the bit ptrs in dictionary order
	629	unsigned long totalIBits = 0; // only dealing with memory
	630	unsigned long chunkWordCount, chunkFragCount;
	631	for (wordNum=0; wordNum<_idh.word_dict_size; wordNum++) {
	632	BitPtr &wordPtr = bitPtrs.GetWordBitPtr (wordNum);
	633	chunkWordCount = wordPtr.start;
	634	chunkFragCount = wordPtr.here;
	635	wordPtr.start = totalIBits;
	636	wordPtr.here = totalIBits;
	637	wordPtr.lastFragNum = chunkStartFragNum;
	638	wordPtr.lgB = 0;
	639	if (chunkWordCount > 0) {
	640	wordPtr.lgB = floorlog_2 (BIO_Bblock_Init_W (numFragsInChunk,
	641	chunkFragCount));
	642	totalIBits += BIO_Bblock_Bound (numFragsInChunk, chunkFragCount);
	643	// use unary encoding for memory buffer encoding of fragment freq
	644	if (!wordLevelIndex) {
	645	totalIBits += chunkWordCount;
	646	}
	647	}
	648	}
	649	for (tagNum=0; tagNum<_idh.tag_dict_size; tagNum++) {
	650	BitPtr &tagPtr = bitPtrs.GetTagBitPtr (tagNum);
	651	chunkFragCount = tagPtr.here;
	652	tagPtr.start = totalIBits;
	653	tagPtr.here = totalIBits;
	654	tagPtr.lastFragNum = chunkStartFragNum;
	655	tagPtr.lgB = 0;
	656	if (chunkFragCount > 0) {
	657	unsigned long pTag = chunkFragCount*2;
	658	tagPtr.lgB = floorlog_2 (BIO_Bblock_Init_W (numFragsInChunk+pTag,
	659	pTag));
	660	unsigned long bLen = BIO_Bblock_Bound (numFragsInChunk+pTag,
	661	pTag);
	662	// cout << tagNum + _idh.word_dict_size << " ";
	663	// cout << "numFrags: " << numFragsInChunk
	664	// << " chunkFragCount: " << chunkFragCount
	665	// << " B: " << 1 << tagPtr.lgB
	666	// << " blen: " << blen << "\n";
	667	totalIBits += bLen;
	668	}
	669	}
	670	bitPtrs.GetEndStart() = totalIBits;
	671	bitPtrs.GetEndHere() = totalIBits;
	672
	673	if ((totalIBits + 7ul) >> 3ul > chunkMem) {
	674	cerr << "totalIBits: " << totalIBits << "\n";
	675	cerr << "bytes: " << ((totalIBits + 7ul) >> 3ul) << "\n";
	676	cerr << "chunkMem: " << chunkMem << "\n";
	677	FatalError (1, "Pointers exceed buffer size");
	678	}
	679	}
	680
	681
	682
	683
	684	int init_ivf_2 (const TagInfo &/tagInfo/, char *filename) {
	685	// read in compressed dictionary header
	686	FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
	687	MAGIC_STEM_BUILD, MG_ABORT);
	688	idh.Read (dictFile);
	689	fclose (dictFile);
	690
	691	// set the size of the bit ptrs
	692	bitPtrs.SetSize (idh.word_dict_size, idh.tag_dict_size);
	693
	694	// open the chunk file and read in the maximum memory needed
	695	// for the inverted memory buffer
	696	chunkFile = open_file (filename, INVF_CHUNK_SUFFIX, "rb",
	697	MAGIC_CHUNK, MG_ABORT);
	698	ReadUL (chunkFile, ivfMemBufSize);
	699	chunkBuf.attachFile (chunkFile);
	700
	701	// allocate memory for the inverted buffer
	702	ivfMemBuf = new char [ivfMemBufSize];
	703	ClearCharBuf (ivfMemBuf, ivfMemBufSize);
	704
	705	// read in the word dictionary
	706	ReadWordDict (filename);
	707
	708	// read in the tag dictionary
	709	ReadTagDict (filename, idh);
	710
	711	// read in the level information
	712	ReadLevelFile (filename);
	713	bool wordLevelIndex = ivfLevel.indexLevel.empty();
	714
	715	// set up the translation table file
	716	occurConvert.Open (filename, idh.word_dict_size, idh.tag_dict_size);
	717
	718	// reset some globals
	719	numDocs = 0;
	720	numChunkDocs = 0;
	721	numDocsInChunk = 0;
	722	numFrags = 0;
	723	numFragsInChunk = 0;
	724	chunkStartFragNum = 0;
	725
	726	strcpy (collectFilename, filename);
	727
	728
	729	// create the inverted file
	730	mg_ullong totalIBits = 0;
	731	FILE *invfFile = create_file (filename, INVF_SUFFIX, "wb",
	732	MAGIC_INVF, MG_ABORT);
	733	totalIBits += sizeof (unsigned long) * 8; // magic number
	734	totalIBits += 8 * 200; // 200 byte gap -- why??????
	735	fclose (invfFile);
	736
	737	// init the inverted file state cache
	738	invfState.Open (filename);
	739	InitInvfState (filename, idh, invfState, totalIBits, wordLevelIndex);
	740
	741	return COMPALLOK;
	742	}
	743
	744	static void CloseTextTag (IP2TagInfo &tInfo, const UCArray &/tagName/) {
	745	if (!tInfo.inTag) return;
	746
	747	// add this tag to the inverted list
	748	BitPtr &tagBitPtr = bitPtrs.GetTagBitPtr (tInfo.tagNum);
	749	unsigned long endFrag = numFrags;
	750	int b = 1 << tagBitPtr.lgB;
	751
	752	/*
	753	cout << (tInfo.tagNum+idh.word_dict_size) << " \"<" << tagName << ">\" "
	754	<< tInfo.startFrag << " " << endFrag << "\n";
	755	*/
	756
	757	mems_bitio_buffer buffer ((u_char *) ivfMemBuf, tagBitPtr.here);
	758	buffer.bblock_encode (tInfo.startFrag - tagBitPtr.lastFragNum + 1,
	759	b, NULL);
	760	buffer.bblock_encode (endFrag - tInfo.startFrag + 1, b, NULL);
	761	tagBitPtr.lastFragNum = endFrag;
	762	tagBitPtr.here = buffer.position();
	763	buffer.encodeDone();
	764
	765	// check for buffer overrun
	766	bitPtrs.CheckTagBufOverrun (tInfo.tagNum);
	767
	768	// reset information about this tag
	769	tInfo.inTag = false;
	770	tInfo.startFrag = 0;
	771	}
	772
	773	static void ProcessOpenTag (const TextEl &el, bool &inFrag) {
	774	// close tag if open
	775	IP2TagInfo &tInfo = tagMapDict[el.tagName];
	776	if (tInfo.inTag) CloseTextTag (tInfo, el.tagName);
	777
	778	// open this tag
	779	tInfo.inTag = true;
	780	tInfo.startFrag = numFrags;
	781
	782	// check for start of next fragment
	783	bool wordLevelIndex = ivfLevel.indexLevel.empty();
	784	if (!wordLevelIndex && el.tagName == ivfLevel.indexLevel) {
	785	numFrags++;
	786	inFrag = true;
	787	}
	788	}
	789
	790	static void ProcessCloseTag (const TextEl &el, bool &inFrag) {
	791	// check for end of fragment
	792	bool wordLevelIndex = ivfLevel.indexLevel.empty();
	793	if (!wordLevelIndex && el.tagName == ivfLevel.indexLevel) {
	794	inFrag = false;
	795	}
	796
	797	IP2TagInfo &tInfo = tagMapDict[el.tagName];
	798	CloseTextTag (tInfo, el.tagName);
	799	}
	800
	801	static void ProcessText (const TextEl &el, bool &inFrag) {
	802	// make sure this text is to be indexed
	803	bool wordLevelIndex = ivfLevel.indexLevel.empty();
	804	if (!wordLevelIndex && !inFrag) return;
	805
	806	const unsigned char *textHere = el.text.begin();
	807	const unsigned char *textEnd = el.text.end() - 1;
	808	unsigned char mgWord[MAXSTEMLEN + 1];
	809
	810	if (!inaword (textHere, textEnd))
	811	ParseNonindexWord (textHere, textEnd);
	812
	813
	814	// Alternately parse off words and non-words from the input
	815
	816	while (textHere <= textEnd) {
	817	textHere = ParseIndexMGWord (textHere, textEnd, mgWord);
	818	textHere = ParseNonindexWord (textHere, textEnd);
	819
	820	if (mgWord[0] > 0) {
	821	if (wordLevelIndex) numFrags++;
	822
	823	unsigned long wordNum = perf_hash (wordHashDict, mgWord);
	824
	825	/*
	826	cout << wordNum << " \"";
	827	cout.write (mgWord+1, *mgWord);
	828	cout << "\" " << numFrags << "\n";
	829	*/
	830
	831	// add this word to the inverted list
	832	BitPtr &wordBitPtr = bitPtrs.GetWordBitPtr (wordNum);
	833	unsigned long fragNum = numFrags;
	834	int b = 1 << wordBitPtr.lgB;
	835
	836	mems_bitio_buffer buffer ((u_char *) ivfMemBuf, wordBitPtr.here);
	837
	838	// note: this assumes that fragments don't carry over between
	839	// chunks (which they don't because all tags are closed at the
	840	// end of each document and chunks are based on document
	841	// boundaries), i.e. the first fragment number must be greater
	842	// than the starting fragment number of the chunk.
	843	if (fragNum > wordBitPtr.lastFragNum) {
	844	buffer.bblock_encode ((fragNum - wordBitPtr.lastFragNum - 1) + 1,
	845	b, NULL);
	846	if (!wordLevelIndex) buffer.encodeBit (1); // freq = 1
	847
	848	} else if (!wordLevelIndex) {
	849	// add one to the frequency count for this word
	850	buffer.seek (buffer.position()-1);
	851	buffer.encodeBit (0); // unary encoding -- last = 1
	852	buffer.encodeBit (1);
	853	}
	854
	855	wordBitPtr.lastFragNum = fragNum;
	856	wordBitPtr.here = buffer.position();
	857	buffer.encodeDone();
	858
	859	// check for buffer overrun
	860	bitPtrs.CheckWordBufOverrun (wordNum);
	861	}
	862	}
	863	}
	864
	865	// combine the in memory inverted buffer with the disk
	866	// based inverted file
	867	static void DiskMerge (char *filename) {
	868	bool wordLevelIndex = ivfLevel.indexLevel.empty();
	869
	870	// make sure we have something to process
	871	if (numChunkDocs <= 0) return;
	872
	873	// open the inverted file
	874	FILE *invfFile = open_file (filename, INVF_SUFFIX, "rb+",
	875	MAGIC_INVF, MG_ABORT);
	876	random_bitio_buffer invfOutBuf (invfFile);
	877
	878	// set up to decode the entries in memory
	879	mems_bitio_buffer memInBuf ((u_char *) ivfMemBuf, 0);
	880
	881	// write out the word information
	882	unsigned long wordNum;
	883	int b;
	884	unsigned long currFragNum;
	885	unsigned long delta;
	886	unsigned long currFreq;
	887	for (wordNum=0; wordNum<idh.word_dict_size; wordNum++) {
	888	// go to the end of the last inverted file entry
	889	InvfStateRec &wordDiskState = invfState.GetRec (wordNum);
	890	invfOutBuf.SEEK_X (wordDiskState.here);
	891
	892	// go to the start of the inverted chunk info in memory
	893	BitPtr &wordBitPtr = bitPtrs.GetWordBitPtr (wordNum);
	894	memInBuf.seek(wordBitPtr.start);
	895
	896	// decode each entry and re-write to disk
	897	currFragNum = chunkStartFragNum;
	898	while (memInBuf.position() < wordBitPtr.here) {
	899	// decode word entry
	900	b = 1 << wordBitPtr.lgB;
	901	delta = memInBuf.bblock_decode (b, NULL);
	902	currFragNum += delta;
	903	if (!wordLevelIndex) currFreq = memInBuf.unary_decode (NULL);
	904	else currFreq = 1;
	905
	906	// recode on disk
	907	invfOutBuf.bblock_encode (currFragNum-wordDiskState.lastFragNum,
	908	wordDiskState.B, NULL);
	909	if (!wordLevelIndex) invfOutBuf.gamma_encode (currFreq, NULL);
	910	wordDiskState.lastFragNum = currFragNum;
	911	}
	912
	913	wordDiskState.here = invfOutBuf.TELL_X();
	914	}
	915
	916	// write out the tag information
	917	unsigned long tagNum;
	918	unsigned long currTagStart;
	919	unsigned long currTagEnd;
	920	for (tagNum=0; tagNum<idh.tag_dict_size; tagNum++) {
	921	// go to the end of the last inverted file entry
	922	InvfStateRec &tagDiskState = invfState.GetRec (tagNum+idh.word_dict_size);
	923	invfOutBuf.SEEK_X (tagDiskState.here);
	924
	925	// go to the start of the inverted chunk info in memory
	926	BitPtr &tagBitPtr = bitPtrs.GetTagBitPtr (tagNum);
	927	memInBuf.seek(tagBitPtr.start);
	928
	929	// decode each entry and re-write to disk
	930	currTagEnd = chunkStartFragNum;
	931	while (memInBuf.position() < tagBitPtr.here) {
	932	// decode tag entry
	933	b = 1 << tagBitPtr.lgB;
	934	delta = memInBuf.bblock_decode (b, NULL) - 1;
	935	currTagStart = currTagEnd + delta;
	936	delta = memInBuf.bblock_decode (b, NULL) - 1;
	937	currTagEnd = currTagStart + delta;
	938
	939	// recode on disk
	940	invfOutBuf.bblock_encode (currTagStart-tagDiskState.lastFragNum+1,
	941	tagDiskState.B, NULL);
	942	invfOutBuf.bblock_encode (currTagEnd-currTagStart+1,
	943	tagDiskState.B, NULL);
	944
	945	tagDiskState.lastFragNum = currTagEnd;
	946	}
	947
	948	tagDiskState.here = invfOutBuf.TELL_X();
	949	}
	950
	951	memInBuf.done();
	952
	953	invfOutBuf.encodeDone();
	954	fclose (invfFile);
	955	}
	956
	957
	958	int process_ivf_2 (const TagInfo &/tagInfo/, const TextElArray &doc) {
	959	bool wordLevelIndex = ivfLevel.indexLevel.empty();
	960	bool inFrag = false;
	961	if (wordLevelIndex) inFrag = true; // unconditional
	962
	963	// get next chunk information if need to. the chunk information
	964	// is needed before the first document is processed
	965	if (numChunkDocs >= numDocsInChunk) ReadChunk (idh, wordLevelIndex);
	966
	967	// process each text element
	968	TextElArray::const_iterator here = doc.begin();
	969	TextElArray::const_iterator end = doc.end();
	970	while (here != end) {
	971	// process this element
	972	if ((here).elType == OpenTagE) ProcessOpenTag (here, inFrag);
	973	else if ((here).elType == CloseTagE) ProcessCloseTag (here, inFrag);
	974	else ProcessText (*here, inFrag);
	975
	976	here++;
	977	}
	978
	979	// close off any unclosed tags
	980	TagMapDict::iterator tdHere = tagMapDict.begin();
	981	TagMapDict::iterator tdEnd = tagMapDict.end();
	982	while (tdHere != tdEnd) {
	983	CloseTextTag ((tdHere).second, (tdHere).first);
	984	tdHere++;
	985	}
	986
	987	// we've processed one more document
	988	numDocs++;
	989	numChunkDocs++;
	990
	991	// merge the memory based inverted file with the one on
	992	// disk if this is the end of this chunk
	993	if (numChunkDocs >= numDocsInChunk) DiskMerge (collectFilename);
	994
	995	return COMPALLOK;
	996	}
	997
	998
	999	static void CondenseInvfFile (char *filename, unsigned long &bytesOutput) {
	1000	FILE *inInvfFile = open_file (filename, INVF_SUFFIX, "rb",
	1001	MAGIC_INVF, MG_ABORT);
	1002	FILE *outInvfFile = open_file (filename, INVF_SUFFIX, "rb+",
	1003	MAGIC_INVF, MG_ABORT);
	1004
	1005	// skip the magic number
	1006	fseek (outInvfFile, sizeof (unsigned long), SEEK_SET);
	1007
	1008	// write the inverted file header -- use defaults for most things
	1009	invf_file_header ifh;
	1010	ifh.no_of_words = idh.word_dict_size;
	1011	ifh.no_of_tags = idh.tag_dict_size;
	1012	ifh.word_level_index = (ivfLevel.indexLevel.empty()) ? 1 : 0;
	1013	ifh.Write (outInvfFile);
	1014
	1015	bytesOutput = ftell (outInvfFile);
	1016
	1017	// process each meaningful byte in the file
	1018	unsigned long numEntries = ifh.no_of_words + ifh.no_of_tags;
	1019	unsigned long entryNum;
	1020	mg_ullong lastStart = 0;
	1021	for (entryNum = 0; entryNum < numEntries; entryNum++) {
	1022	InvfStateRec &stateRec = invfState.GetRec (entryNum);
	1023
	1024	// overrun check
	1025	if (stateRec.start < lastStart)
	1026	FatalError (1, "Inverted file Buffer overrun");
	1027	lastStart = stateRec.start;
	1028
	1029	unsigned long oldEntryStart = stateRec.start >> 3;
	1030	unsigned long oldEntryStartOver = stateRec.start & 7; // should be 0
	1031	unsigned long oldEntryEnd = (stateRec.here + 7) >> 3; // byte after end
	1032	unsigned long oldEntryEndOver = stateRec.here & 7;
	1033
	1034	fseek (inInvfFile, oldEntryStart, SEEK_SET);
	1035
	1036	stateRec.here -= stateRec.start;
[2448]	1037	stateRec.start = (mg_ullong)bytesOutput * 8 + oldEntryStartOver;
[856]	1038	stateRec.here += stateRec.start;
	1039	while (oldEntryStart < oldEntryEnd) {
	1040	unsigned char c = getc (inInvfFile);
	1041	if (oldEntryStart == oldEntryEnd - 1) {
	1042	u_char ands[8] =
	1043	{0xff, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe};
	1044	c &= ands[oldEntryEndOver];
	1045	}
	1046	putc (c, outInvfFile);
	1047	bytesOutput++;
	1048	oldEntryStart++;
	1049	}
	1050	}
	1051
	1052	fclose (inInvfFile);
	1053
	1054	#ifdef __WIN32__
[2468]	1055	if (_chsize (_fileno (outInvfFile), bytesOutput) != 0)
[856]	1056	Message ("Could not truncate invf.");
	1057	#else
	1058	ftruncate (fileno (outInvfFile), bytesOutput);
	1059	#endif
	1060
	1061	fclose (outInvfFile);
	1062	}
	1063
	1064	static void OutputInvfIdx (char *filename, unsigned long invfNumBytes) {
	1065	FILE *invfIdxFile = create_file (filename, INVF_IDX_SUFFIX, "wb",
	1066	MAGIC_INVI, MG_ABORT);
	1067
	1068	// process each meaningful byte in the file
	1069	unsigned long numEntries = idh.word_dict_size + idh.tag_dict_size;
	1070	unsigned long entryNum;
	1071	for (entryNum = 0; entryNum < numEntries; entryNum++) {
	1072	InvfStateRec &stateRec = invfState.GetRec (entryNum);
	1073
	1074	// assumes that inverted entries start at beginning of each byte
	1075	if (!WriteUL (invfIdxFile, (stateRec.start >> 3))) break;
	1076	}
	1077
	1078	WriteUL (invfIdxFile, invfNumBytes);
	1079
	1080	fclose (invfIdxFile);
	1081	}
	1082
	1083
	1084	int done_ivf_2 (const TagInfo &/tagInfo/, char *filename) {
	1085	// close most open files
	1086	if (chunkFile != NULL) {
	1087	chunkBuf.done();
	1088	fclose (chunkFile);
	1089	chunkFile = NULL;
	1090	}
	1091	occurConvert.Close();
	1092
	1093	// free allocated memory
	1094	bitPtrs.Clear();
	1095	if (ivfMemBuf != NULL) { delete [] ivfMemBuf; ivfMemBuf = NULL; }
	1096	free_perf_hash (wordHashDict);
	1097	wordHashDict = NULL;
	1098	tagMapDict.erase (tagMapDict.begin(), tagMapDict.end());
	1099
	1100	// condense the inverted file and truncate it
	1101	// this function also writes out the inverted header
	1102	unsigned long invfNumBytes = 0;
	1103	CondenseInvfFile (filename, invfNumBytes);
	1104
	1105	OutputInvfIdx (filename, invfNumBytes);
	1106
	1107	// close the rest of the open files
	1108	invfState.Close ();
	1109
	1110	return COMPALLOK;
	1111	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: