Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Normal
Revision Log

source: trunk/gsdl/src/mgpp/text/ivf.pass2.cpp@ 858

Last change on this file since 858 was 856, checked in by sjboddie, 24 years ago
Rodgers new C++ mg
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 31.2 KB

Rev	Line
[856]	1	/**************************************************************************
	2	*
	3	* ivf.pass2.cpp -- Memory efficient pass 2 inversion
	4	* Copyright (C) 1999 Rodger McNab
	5	*
	6	* This program is free software; you can redistribute it and/or modify
	7	* it under the terms of the GNU General Public License as published by
	8	* the Free Software Foundation; either version 2 of the License, or
	9	* (at your option) any later version.
	10	*
	11	* This program is distributed in the hope that it will be useful,
	12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	14	* GNU General Public License for more details.
	15	*
	16	* You should have received a copy of the GNU General Public License
	17	* along with this program; if not, write to the Free Software
	18	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	19	*
	20	* $Id: ivf.pass2.cpp 856 2000-01-14 02:26:25Z sjboddie $
	21	*
	22	**************************************************************************/
	23
	24	/*
	25	$Log$
	26	Revision 1.1 2000/01/14 02:26:07 sjboddie
	27	Rodgers new C++ mg
	28
	29	*/
	30
#include <stdio.h>
#include <unistd.h>

#include "non_ansi.h"

#include "sysfuncs.h"

#include "mg_files.h"
#include "invf.h"
#include "mg.h"
#include "build.h"
#include "locallib.h"
#include "UCArray.h"
#include "bitio_m_random.h"
#include "bitio_m_stdio.h"
#include "bitio_m_mems.h"
#include "bitio_gen.h"
#include <stdio.h>
#include "words.h"
#include "messages.h"
#include "netorder.h"
#include "FIvfLevelInfo.h"
#include "perf_hash.h"
#include "string.h"

#include "longlong.h"

#ifdef __WIN32__
#include <io.h>
#endif

#if defined(GSDL_USE_OBJECTSPACE)
# include <ospace\std\map>
#elif defined(GSDL_USE_STL_H)
# include <map.h>
#else
# include <map>
#endif
	69
	70
	71	#ifdef USE_LONG_LONG
	72	#define SEEK_X seek_LL
	73	#define TELL_X tell_LL
	74	#else
	75	#define SEEK_X seek
	76	#define TELL_X tell
	77	#endif
	78
	79	#ifndef RND_BUF_SIZE
	80	#define RND_BUF_SIZE 8*1024
	81	#endif
	82
	83
	84	static unsigned long numDocs = 0;
	85	static unsigned long numChunkDocs = 0;
	86	static unsigned long numDocsInChunk = 0;
	87
	88	static unsigned long numFrags = 0;
	89	static unsigned long numFragsInChunk = 0;
	90	static unsigned long chunkStartFragNum = 0;
	91
	92
	93
	94
	95	struct BitPtr {
	96	unsigned long start;
	97	unsigned long here;
	98	unsigned long lastFragNum;
	99	unsigned long lgB;
	100
	101	void Clear () { start = here = lastFragNum = lgB = 0; }
	102	BitPtr () { Clear(); }
	103	};
	104
	105	class WordBitPtrs {
	106	protected:
	107	unsigned long numWords;
	108	unsigned long numTags;
	109	unsigned long size;
	110	BitPtr *wordBitPtrs;
	111
	112	void CheckBufOverrun (unsigned long num) {
	113	if (wordBitPtrs[num].here > wordBitPtrs[num+1].start) {
	114	cerr << "numDocs: " << numDocs << "\n";
	115	cerr << "numChunkDocs: " << numChunkDocs << "\n";
	116	cerr << "numDocsInChunk: " << numDocsInChunk << "\n";
	117	cerr << "numFrags: " << numFrags << "\n";
	118	cerr << "numFragsInChunk: " << numFragsInChunk << "\n";
	119	cerr << "chunkStartFragNum: " << chunkStartFragNum << "\n";
	120	cerr << "num: " << num << "\n";
	121	cerr << "[num].start: " << wordBitPtrs[num].start << "\n";
	122	cerr << "[num].here: " << wordBitPtrs[num].here << "\n";
	123	cerr << "[num+1].start: " << wordBitPtrs[num+1].start << "\n";
	124	FatalError (1, "Bit buffer overrun");
	125	}
	126	}
	127
	128	public:
	129	void Clear ();
	130	WordBitPtrs () { wordBitPtrs = NULL; Clear(); }
	131	~WordBitPtrs ();
	132	void SetSize (unsigned long _numWords,
	133	unsigned long _numTags);
	134
	135	void ResetPtrs () {
	136	if (wordBitPtrs == NULL) return;
	137	unsigned long i;
	138	for (i=0; i<size; i++) wordBitPtrs[i].Clear();
	139	}
	140
	141	BitPtr &GetWordBitPtr (unsigned long wordNum)
	142	{ return wordBitPtrs[wordNum]; }
	143	unsigned long &GetWordStart (unsigned long wordNum)
	144	{ return wordBitPtrs[wordNum].start; }
	145	unsigned long &GetWordHere (unsigned long wordNum)
	146	{ return wordBitPtrs[wordNum].here; }
	147	void CheckWordBufOverrun (unsigned long wordNum)
	148	{ CheckBufOverrun (wordNum); }
	149
	150	BitPtr &GetTagBitPtr (unsigned long tagNum)
	151	{ return wordBitPtrs[tagNum + numWords]; }
	152	unsigned long &GetTagStart (unsigned long tagNum)
	153	{ return wordBitPtrs[tagNum + numWords].start; }
	154	unsigned long &GetTagHere (unsigned long tagNum)
	155	{ return wordBitPtrs[tagNum + numWords].here; }
	156	void CheckTagBufOverrun (unsigned long tagNum)
	157	{ CheckBufOverrun (tagNum + numWords); }
	158
	159	BitPtr &GetEndBitPtr ()
	160	{ return wordBitPtrs[size-1]; }
	161	unsigned long &GetEndStart ()
	162	{ return wordBitPtrs[size-1].start; }
	163	unsigned long &GetEndHere ()
	164	{ return wordBitPtrs[size-1].here; }
	165	};
	166
	167
	168	struct IP2TagInfo {
	169	bool inTag;
	170	unsigned long startFrag;
	171	unsigned long tagNum;
	172
	173	IP2TagInfo () {
	174	inTag = false;
	175	startFrag = 0;
	176	tagNum = 0;
	177	}
	178	};
	179
	180	// maps tags to tag information
	181	typedef map<UCArray, IP2TagInfo, DictLTUCArray> TagMapDict;
	182
	183
	184	// class to handle the translation of occurrence order
	185	// to dictionary order for words and tags
	186	class OccurToDictConverter {
	187	protected:
	188	unsigned long pos;
	189	unsigned long val;
	190	FILE *transFile;
	191	random_bitio_buffer rbs;
	192
	193	unsigned long wordDictSize;
	194	unsigned long tagDictSize;
	195
	196	void SeekStart ();
	197	unsigned long TranslateNum (unsigned long num);
	198
	199	public:
	200	OccurToDictConverter ();
	201	~OccurToDictConverter ();
	202
	203	void Open (char *filename, unsigned long _wordDictSize,
	204	unsigned long _tagDictSize);
	205
	206	// Close frees all allocated memory
	207	void Close ();
	208
	209	unsigned long TranslateWord (unsigned long occurNum)
	210	{ return TranslateNum (occurNum); }
	211	unsigned long TranslateTag (unsigned long occurNum)
	212	{ return TranslateNum (occurNum+wordDictSize); }
	213	};
	214
	215
	216	struct InvfStateRec {
	217	mg_ullong start;
	218	mg_ullong here;
	219	unsigned long lastFragNum;
	220	unsigned long B;
	221
	222	void Clear () {
	223	start = here = 0;
	224	lastFragNum = B = 0;
	225	}
	226	InvfStateRec () { Clear (); }
	227	};
	228
	229
	230	#define ISR_SIZE 1024
	231
	232	class InvfStateCache {
	233	protected:
	234	InvfStateRec recCache [ISR_SIZE];
	235	unsigned long startNum;
	236
	237	FILE *stateFile;
	238
	239	void ClearCache () {
	240	unsigned int i = 0;
	241	for (i=0; i<ISR_SIZE; i++) recCache[i].Clear();
	242	}
	243
	244	public:
	245	InvfStateCache ();
	246	~InvfStateCache ();
	247
	248	void Open (char *filename);
	249	void Close ();
	250
	251	// previous references to state records may be
	252	// invalidated calling GetRec
	253	InvfStateRec &GetRec (unsigned long num);
	254	};
	255
	256
	257	static invf_dict_header idh;
	258	static WordBitPtrs bitPtrs;
	259
	260	static FILE *chunkFile = NULL;
	261	static stdio_bitio_buffer chunkBuf;
	262
	263	static unsigned long ivfMemBufSize = 0;
	264	static char *ivfMemBuf = NULL;
	265
	266	// word and tag dictionaries. a map is used for the tag dictionary
	267	// as it should never be very big (and the perfect hash function
	268	// sometimes has trouble with small values).
	269	static perf_hash_data *wordHashDict = NULL;
	270	static TagMapDict tagMapDict;
	271
	272	// information about all the different levels
	273	static FIvfLevel ivfLevel;
	274
	275	static OccurToDictConverter occurConvert;
	276
	277	// information about the state of the inverted file
	278	static InvfStateCache invfState;
	279
	280	static char collectFilename[512];
	281
	282
	283	void WordBitPtrs::Clear () {
	284	numWords = 0;
	285	numTags = 0;
	286	size=0;
	287	if (wordBitPtrs != NULL) delete [] wordBitPtrs;
	288	wordBitPtrs = NULL;
	289	}
	290
	291	WordBitPtrs::~WordBitPtrs () {
	292	if (wordBitPtrs != NULL) delete [] wordBitPtrs;
	293	}
	294
	295	void WordBitPtrs::SetSize (unsigned long _numWords,
	296	unsigned long _numTags){
	297	Clear();
	298	numWords = _numWords;
	299	numTags = _numTags;
	300	size = numWords + numTags + 1;
	301	wordBitPtrs = new BitPtr [size];
	302	}
	303
	304
	305	void OccurToDictConverter::SeekStart () {
	306	if (transFile == NULL) return;
	307	rbs.SEEK_X (sizeof (unsigned long) * 8);
	308	pos = 0;
	309	}
	310
	311	unsigned long OccurToDictConverter::TranslateNum (unsigned long num) {
	312	if (num < pos) SeekStart ();
	313	while (pos <= num) {
	314	if (pos < wordDictSize)
	315	val = rbs.binary_decode (wordDictSize + 1, NULL) - 1;
	316	else
	317	val = rbs.binary_decode (tagDictSize + 1, NULL) - 1;
	318	pos++;
	319	}
	320	return val;
	321	}
	322
	323	OccurToDictConverter::OccurToDictConverter () {
	324	pos = 0;
	325	val = 0;
	326	transFile = NULL;
	327	wordDictSize = 0;
	328	tagDictSize = 0;
	329	}
	330
	331	OccurToDictConverter::~OccurToDictConverter () {
	332	if (transFile != NULL) Close ();
	333	}
	334
	335	void OccurToDictConverter::Open (char *filename, unsigned long _wordDictSize,
	336	unsigned long _tagDictSize) {
	337	if (transFile != NULL) Close ();
	338
	339	wordDictSize = _wordDictSize;
	340	tagDictSize = _tagDictSize;
	341
	342	transFile = open_file (filename, INVF_CHUNK_TRANS_SUFFIX, "rb",
	343	MAGIC_CHUNK_TRANS, MG_ABORT);
	344	rbs.attachFile (transFile, RND_BUF_SIZE);
	345	SeekStart ();
	346	val = 0;
	347	}
	348
	349	void OccurToDictConverter::Close () {
	350	if (transFile == NULL) return;
	351
	352	rbs.done ();
	353	fclose (transFile);
	354	transFile = NULL;
	355	pos = 0;
	356	val = 0;
	357
	358	wordDictSize = 0;
	359	tagDictSize = 0;
	360	}
	361
	362
	363
	364
	365	InvfStateCache::InvfStateCache () {
	366	startNum = 0;
	367	stateFile = NULL;
	368	}
	369
	370	InvfStateCache::~InvfStateCache () {
	371	if (stateFile != NULL) Close ();
	372	}
	373
	374	void InvfStateCache::Open (char *filename) {
	375	if (stateFile != NULL) Close();
	376
	377	// open the state file
	378	char path[512];
	379	sprintf (path, FILE_NAME_FORMAT ".%ld", get_basepath (), filename,
	380	".invf.state", (long) getpid ());
	381	if (!(stateFile = fopen (path, "wb+"))) {
	382	Message ("Unable to create \"%s\"", path);
	383	exit (1);
	384	}
	385	unlink (path); // file will be deleted after it is closed
	386
	387	// reset the buffer
	388	startNum = 0;
	389	ClearCache();
	390	}
	391
	392	void InvfStateCache::Close () {
	393	if (stateFile == NULL) return;
	394	fclose (stateFile);
	395	stateFile = NULL;
	396	startNum = 0;
	397	}
	398
	399	InvfStateRec &InvfStateCache::GetRec (unsigned long num) {
	400	// see if cached
	401	if ((num >= startNum) && (num < startNum + ISR_SIZE))
	402	return recCache[num-startNum];
	403
	404	// not cached, write out this lot of records and read in
	405	fseek (stateFile, startNum*sizeof (InvfStateRec), SEEK_SET);
	406	fwrite ((char *) recCache, sizeof (InvfStateRec), ISR_SIZE, stateFile);
	407
	408	// read in the new set of records
	409	ClearCache ();
	410	startNum = num - (num % ISR_SIZE);
	411	fseek (stateFile, startNum*sizeof (InvfStateRec), SEEK_SET);
	412	fread ((char *) recCache, sizeof (InvfStateRec), ISR_SIZE, stateFile);
	413
	414	return recCache[num-startNum];
	415	}
	416
	417
	418
	419	static void ClearCharBuf (char *buf, unsigned long size) {
	420	char *end = buf + size;
	421	while (buf != end) *buf++ = 0;
	422	}
	423
	424	static void ReadWordDict (char *filename) {
	425	// read in the perfect hash function for the word dictionary
	426	FILE *wordHashFile = open_file (filename, INVF_DICT_HASH_SUFFIX, "rb",
	427	MAGIC_HASH, MG_ABORT);
	428	if (!(wordHashDict = read_perf_hash_data (wordHashFile))) {
	429	FatalError (1, "Unable to read in hash data for word dictionary");
	430	}
	431	fclose (wordHashFile);
	432	}
	433
	434	static void ReadTagDict (char *filename, invf_dict_header &_idh) {
	435	// open the file
	436	FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
	437	MAGIC_STEM_BUILD, MG_ABORT);
	438
	439	// seek to the start of the tag dictionary
	440	fseek (dictFile, _idh.tag_dict_start, SEEK_SET);
	441
	442	unsigned long tagNum;
	443	dict_el thisEl;
	444	for (tagNum = 0; tagNum < _idh.tag_dict_size; tagNum++) {
	445	thisEl.Read (dictFile);
	446	tagMapDict[thisEl.el].tagNum = tagNum;
	447	}
	448
	449	fclose (dictFile);
	450	}
	451
	452	static void ReadLevelFile (char *filename) {
	453	FILE *f;
	454	f = open_file (filename, INVF_LEVEL_SUFFIX, "rb",
	455	MAGIC_INVF_LEVELS, MG_ABORT);
	456	ivfLevel.Read (f);
	457	fclose (f);
	458	}
	459
	460	void CheckIntOverflow (mg_ullong totalIBits, mg_ullong lastTotalIBits) {
	461	if (totalIBits < lastTotalIBits) {
	462	fprintf(stderr, "ERROR: The totalIBits counter (%d byte unsigned integer) has overflowed.\n", sizeof (mg_ullong));
	463	if (sizeof (mg_ullong) < 8) {
	464	fprintf(stderr, " Try compiling with GCC to enable use of 8 bytes for this counter.\n");
	465	}
	466	fprintf(stderr, " Build aborted.\n");
	467	exit(1);
	468	}
	469	}
	470
	471	// assumes the inverted file state file has been opened
	472	static void InitInvfState (char *filename,
	473	invf_dict_header &_idh,
	474	InvfStateCache &_invfState,
	475	mg_ullong &totalIBits,
	476	bool wordLevelIndex) {
	477	// read in the dictionary, setting inverted state information
	478	FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
	479	MAGIC_STEM_BUILD, MG_ABORT);
	480
	481	// seek to the start of the word dictionary
	482	fseek (dictFile, _idh.word_dict_start, SEEK_SET);
	483
	484	// add the word entries
	485	word_dict_el wordEl;
	486	wordEl.SetNumLevels (_idh.num_levels);
	487	unsigned long dictWordNum, p;
	488	mg_ullong lastTotalIBits;
	489	unsigned long N = _idh.num_frags;
	490	for (dictWordNum=0; dictWordNum<_idh.word_dict_size; dictWordNum++) {
	491	// lastTotalIBits is used to detect integer overflow
	492	lastTotalIBits = totalIBits;
	493
	494	// read the next word and associated information
	495	wordEl.Read (dictFile, _idh.num_levels);
	496
	497	// update the state record
	498	p = wordEl.frag_occur;
	499	InvfStateRec &wisr = _invfState.GetRec (dictWordNum);
	500	wisr.start = totalIBits;
	501	wisr.here = totalIBits;
	502	wisr.B = BIO_Bblock_Init (N, p);
	503
	504	// add the length of the fragment numbers
	505	totalIBits += BIO_Bblock_Bound_b (N, p, wisr.B);
	506
	507	// if needed, add the length of the fragment frequency information
	508	if (!wordLevelIndex)
	509	totalIBits += BIO_Gamma_Bound (wordEl.freq, wordEl.frag_occur);
	510
	511	// align next byte
	512	#ifdef USE_LONG_LONG
	513	totalIBits = (totalIBits + 7ull) & 0xfffffffffffffff8ull;
	514	#else
	515	totalIBits = (totalIBits + 7ul) & 0xfffffff8ul;
	516	#endif
	517
	518	CheckIntOverflow (totalIBits, lastTotalIBits);
	519	}
	520
	521
	522	// seek to the start of the tag dictionary
	523	fseek (dictFile, _idh.tag_dict_start, SEEK_SET);
	524
	525	// add the tag entries
	526	dict_el tagEl;
	527	unsigned long dictTagNum;
	528	N = _idh.num_frags;
	529	for (dictTagNum=0; dictTagNum<_idh.tag_dict_size; dictTagNum++) {
	530	// lastTotalIBits is used to detect integer overflow
	531	lastTotalIBits = totalIBits;
	532
	533	// read the next tag and associated information
	534	tagEl.Read (dictFile);
	535
	536	// update the state record
	537	p = tagEl.frag_occur*2;
	538	InvfStateRec &tisr = _invfState.GetRec (dictTagNum + _idh.word_dict_size);
	539	tisr.start = totalIBits;
	540	tisr.here = totalIBits;
	541	tisr.B = BIO_Bblock_Init (N+p, p);
	542
	543	// add the length of the fragment numbers (two numbers for each
	544	// tag, one for start and one for end)
	545	totalIBits += BIO_Bblock_Bound_b (N+p, p, tisr.B);
	546
	547	// align next byte
	548	#ifdef USE_LONG_LONG
	549	totalIBits = (totalIBits + 7ull) & 0xfffffffffffffff8ull;
	550	#else
	551	totalIBits = (totalIBits + 7ul) & 0xfffffff8ul;
	552	#endif
	553
	554	CheckIntOverflow (totalIBits, lastTotalIBits);
	555	}
	556
	557	fclose (dictFile);
	558	}
	559
	560	/*
	561	// assumes the chunk tag information has been placed in .first
	562	static void PrintChunkInfo (unsigned long chunkMem,
	563	unsigned long numChunkWords,
	564	unsigned long numChunkTags) {
	565	static unsigned long chunksRead = 0;
	566	chunksRead++;
	567	cout << "Chunk Number: " << chunksRead << "\n";
	568	cout << "numChunkDocs " << numDocsInChunk << "\n";
	569	cout << "numChunkFrags " << numFragsInChunk << "\n";
	570	cout << "mem " << chunkMem << "\n";
	571	cout << "numWords " << numChunkWords << "\n";
	572	cout << "numTags " << numChunkTags << "\n\n";
	573
	574	TagMapDict::iterator tagMapHere = tagMapDict.begin();
	575	TagMapDict::iterator tagMapEnd = tagMapDict.end();
	576	while (tagMapHere != tagMapEnd) {
	577	unsigned long tagMapNum = (*tagMapHere).second.tagNum;
	578	cout << (*tagMapHere).first << " " << tagMapNum << " "
	579	<< bitPtrs.GetTagBitPtr(tagMapNum).here << "\n";
	580	tagMapHere++;
	581	}
	582	}
	583	*/
	584
	585	void ReadChunk (invf_dict_header &_idh, bool wordLevelIndex) {
	586	// reset globals
	587	numChunkDocs = 0;
	588	chunkStartFragNum = numFrags;
	589
	590	// read in information about this chunk
	591	numDocsInChunk = chunkBuf.gamma_decode (NULL) - 1;
	592	if (numDocsInChunk == 0)
	593	FatalError (1, "The number of docs in the current chunk is 0");
	594
	595	numFragsInChunk = chunkBuf.gamma_decode (NULL) - 1;
	596	unsigned long chunkMem = chunkBuf.gamma_decode (NULL) - 1;
	597
	598	if (chunkMem > ivfMemBufSize)
	599	FatalError (1, "Chunk memory size is greater than maximum");
	600
	601	unsigned long numChunkWords = chunkBuf.gamma_decode (NULL) - 1;
	602	unsigned long numChunkTags = chunkBuf.gamma_decode (NULL) - 1;
	603
	604
	605	// reset stuff
	606	ClearCharBuf (ivfMemBuf, ivfMemBufSize);
	607	bitPtrs.ResetPtrs();
	608
	609	// read in the entries in occurrence order storing the
	610	// "chunkWordCount" in "start" and the "chunkFragCount"
	611	// in "here"
	612	unsigned long numOccur;
	613	unsigned long wordNum;
	614	for (numOccur=0; numOccur<numChunkWords; numOccur++) {
	615	wordNum = occurConvert.TranslateWord (numOccur);
	616	BitPtr &wordPtr = bitPtrs.GetWordBitPtr (wordNum);
	617	wordPtr.start = chunkBuf.gamma_decode (NULL) - 1;
	618	if (wordPtr.start >= 2)
	619	wordPtr.here = chunkBuf.gamma_decode (NULL);
	620	else wordPtr.here = wordPtr.start;
	621	}
	622	unsigned long tagNum;
	623	for (numOccur=0; numOccur<numChunkTags; numOccur++) {
	624	tagNum = occurConvert.TranslateTag (numOccur);
	625	BitPtr &tagPtr = bitPtrs.GetTagBitPtr (tagNum);
	626	// only chunkFragCount is encoded for tags
	627	tagPtr.start = chunkBuf.gamma_decode (NULL) - 1;
	628	tagPtr.here = tagPtr.start;
	629	}
	630
	631	/* PrintChunkInfo (chunkMem, numChunkWords, numChunkTags);*/
	632
	633	// create the bit ptrs in dictionary order
	634	unsigned long totalIBits = 0; // only dealing with memory
	635	unsigned long chunkWordCount, chunkFragCount;
	636	for (wordNum=0; wordNum<_idh.word_dict_size; wordNum++) {
	637	BitPtr &wordPtr = bitPtrs.GetWordBitPtr (wordNum);
	638	chunkWordCount = wordPtr.start;
	639	chunkFragCount = wordPtr.here;
	640	wordPtr.start = totalIBits;
	641	wordPtr.here = totalIBits;
	642	wordPtr.lastFragNum = chunkStartFragNum;
	643	wordPtr.lgB = 0;
	644	if (chunkWordCount > 0) {
	645	wordPtr.lgB = floorlog_2 (BIO_Bblock_Init_W (numFragsInChunk,
	646	chunkFragCount));
	647	totalIBits += BIO_Bblock_Bound (numFragsInChunk, chunkFragCount);
	648	// use unary encoding for memory buffer encoding of fragment freq
	649	if (!wordLevelIndex) {
	650	totalIBits += chunkWordCount;
	651	}
	652	}
	653	}
	654	for (tagNum=0; tagNum<_idh.tag_dict_size; tagNum++) {
	655	BitPtr &tagPtr = bitPtrs.GetTagBitPtr (tagNum);
	656	chunkFragCount = tagPtr.here;
	657	tagPtr.start = totalIBits;
	658	tagPtr.here = totalIBits;
	659	tagPtr.lastFragNum = chunkStartFragNum;
	660	tagPtr.lgB = 0;
	661	if (chunkFragCount > 0) {
	662	unsigned long pTag = chunkFragCount*2;
	663	tagPtr.lgB = floorlog_2 (BIO_Bblock_Init_W (numFragsInChunk+pTag,
	664	pTag));
	665	unsigned long bLen = BIO_Bblock_Bound (numFragsInChunk+pTag,
	666	pTag);
	667	// cout << tagNum + _idh.word_dict_size << " ";
	668	// cout << "numFrags: " << numFragsInChunk
	669	// << " chunkFragCount: " << chunkFragCount
	670	// << " B: " << 1 << tagPtr.lgB
	671	// << " blen: " << blen << "\n";
	672	totalIBits += bLen;
	673	}
	674	}
	675	bitPtrs.GetEndStart() = totalIBits;
	676	bitPtrs.GetEndHere() = totalIBits;
	677
	678	if ((totalIBits + 7ul) >> 3ul > chunkMem) {
	679	cerr << "totalIBits: " << totalIBits << "\n";
	680	cerr << "bytes: " << ((totalIBits + 7ul) >> 3ul) << "\n";
	681	cerr << "chunkMem: " << chunkMem << "\n";
	682	FatalError (1, "Pointers exceed buffer size");
	683	}
	684	}
	685
	686
	687
	688
	689	int init_ivf_2 (const TagInfo &/tagInfo/, char *filename) {
	690	// read in compressed dictionary header
	691	FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
	692	MAGIC_STEM_BUILD, MG_ABORT);
	693	idh.Read (dictFile);
	694	fclose (dictFile);
	695
	696	// set the size of the bit ptrs
	697	bitPtrs.SetSize (idh.word_dict_size, idh.tag_dict_size);
	698
	699	// open the chunk file and read in the maximum memory needed
	700	// for the inverted memory buffer
	701	chunkFile = open_file (filename, INVF_CHUNK_SUFFIX, "rb",
	702	MAGIC_CHUNK, MG_ABORT);
	703	ReadUL (chunkFile, ivfMemBufSize);
	704	chunkBuf.attachFile (chunkFile);
	705
	706	// allocate memory for the inverted buffer
	707	ivfMemBuf = new char [ivfMemBufSize];
	708	ClearCharBuf (ivfMemBuf, ivfMemBufSize);
	709
	710	// read in the word dictionary
	711	ReadWordDict (filename);
	712
	713	// read in the tag dictionary
	714	ReadTagDict (filename, idh);
	715
	716	// read in the level information
	717	ReadLevelFile (filename);
	718	bool wordLevelIndex = ivfLevel.indexLevel.empty();
	719
	720	// set up the translation table file
	721	occurConvert.Open (filename, idh.word_dict_size, idh.tag_dict_size);
	722
	723	// reset some globals
	724	numDocs = 0;
	725	numChunkDocs = 0;
	726	numDocsInChunk = 0;
	727	numFrags = 0;
	728	numFragsInChunk = 0;
	729	chunkStartFragNum = 0;
	730
	731	strcpy (collectFilename, filename);
	732
	733
	734	// create the inverted file
	735	mg_ullong totalIBits = 0;
	736	FILE *invfFile = create_file (filename, INVF_SUFFIX, "wb",
	737	MAGIC_INVF, MG_ABORT);
	738	totalIBits += sizeof (unsigned long) * 8; // magic number
	739	totalIBits += 8 * 200; // 200 byte gap -- why??????
	740	fclose (invfFile);
	741
	742	// init the inverted file state cache
	743	invfState.Open (filename);
	744	InitInvfState (filename, idh, invfState, totalIBits, wordLevelIndex);
	745
	746	return COMPALLOK;
	747	}
	748
	749	static void CloseTextTag (IP2TagInfo &tInfo, const UCArray &/tagName/) {
	750	if (!tInfo.inTag) return;
	751
	752	// add this tag to the inverted list
	753	BitPtr &tagBitPtr = bitPtrs.GetTagBitPtr (tInfo.tagNum);
	754	unsigned long endFrag = numFrags;
	755	int b = 1 << tagBitPtr.lgB;
	756
	757	/*
	758	cout << (tInfo.tagNum+idh.word_dict_size) << " \"<" << tagName << ">\" "
	759	<< tInfo.startFrag << " " << endFrag << "\n";
	760	*/
	761
	762	mems_bitio_buffer buffer ((u_char *) ivfMemBuf, tagBitPtr.here);
	763	buffer.bblock_encode (tInfo.startFrag - tagBitPtr.lastFragNum + 1,
	764	b, NULL);
	765	buffer.bblock_encode (endFrag - tInfo.startFrag + 1, b, NULL);
	766	tagBitPtr.lastFragNum = endFrag;
	767	tagBitPtr.here = buffer.position();
	768	buffer.encodeDone();
	769
	770	// check for buffer overrun
	771	bitPtrs.CheckTagBufOverrun (tInfo.tagNum);
	772
	773	// reset information about this tag
	774	tInfo.inTag = false;
	775	tInfo.startFrag = 0;
	776	}
	777
	778	static void ProcessOpenTag (const TextEl &el, bool &inFrag) {
	779	// close tag if open
	780	IP2TagInfo &tInfo = tagMapDict[el.tagName];
	781	if (tInfo.inTag) CloseTextTag (tInfo, el.tagName);
	782
	783	// open this tag
	784	tInfo.inTag = true;
	785	tInfo.startFrag = numFrags;
	786
	787	// check for start of next fragment
	788	bool wordLevelIndex = ivfLevel.indexLevel.empty();
	789	if (!wordLevelIndex && el.tagName == ivfLevel.indexLevel) {
	790	numFrags++;
	791	inFrag = true;
	792	}
	793	}
	794
	795	static void ProcessCloseTag (const TextEl &el, bool &inFrag) {
	796	// check for end of fragment
	797	bool wordLevelIndex = ivfLevel.indexLevel.empty();
	798	if (!wordLevelIndex && el.tagName == ivfLevel.indexLevel) {
	799	inFrag = false;
	800	}
	801
	802	IP2TagInfo &tInfo = tagMapDict[el.tagName];
	803	CloseTextTag (tInfo, el.tagName);
	804	}
	805
	806	static void ProcessText (const TextEl &el, bool &inFrag) {
	807	// make sure this text is to be indexed
	808	bool wordLevelIndex = ivfLevel.indexLevel.empty();
	809	if (!wordLevelIndex && !inFrag) return;
	810
	811	const unsigned char *textHere = el.text.begin();
	812	const unsigned char *textEnd = el.text.end() - 1;
	813	unsigned char mgWord[MAXSTEMLEN + 1];
	814
	815	if (!inaword (textHere, textEnd))
	816	ParseNonindexWord (textHere, textEnd);
	817
	818
	819	// Alternately parse off words and non-words from the input
	820
	821	while (textHere <= textEnd) {
	822	textHere = ParseIndexMGWord (textHere, textEnd, mgWord);
	823	textHere = ParseNonindexWord (textHere, textEnd);
	824
	825	if (mgWord[0] > 0) {
	826	if (wordLevelIndex) numFrags++;
	827
	828	unsigned long wordNum = perf_hash (wordHashDict, mgWord);
	829
	830	/*
	831	cout << wordNum << " \"";
	832	cout.write (mgWord+1, *mgWord);
	833	cout << "\" " << numFrags << "\n";
	834	*/
	835
	836	// add this word to the inverted list
	837	BitPtr &wordBitPtr = bitPtrs.GetWordBitPtr (wordNum);
	838	unsigned long fragNum = numFrags;
	839	int b = 1 << wordBitPtr.lgB;
	840
	841	mems_bitio_buffer buffer ((u_char *) ivfMemBuf, wordBitPtr.here);
	842
	843	// note: this assumes that fragments don't carry over between
	844	// chunks (which they don't because all tags are closed at the
	845	// end of each document and chunks are based on document
	846	// boundaries), i.e. the first fragment number must be greater
	847	// than the starting fragment number of the chunk.
	848	if (fragNum > wordBitPtr.lastFragNum) {
	849	buffer.bblock_encode ((fragNum - wordBitPtr.lastFragNum - 1) + 1,
	850	b, NULL);
	851	if (!wordLevelIndex) buffer.encodeBit (1); // freq = 1
	852
	853	} else if (!wordLevelIndex) {
	854	// add one to the frequency count for this word
	855	buffer.seek (buffer.position()-1);
	856	buffer.encodeBit (0); // unary encoding -- last = 1
	857	buffer.encodeBit (1);
	858	}
	859
	860	wordBitPtr.lastFragNum = fragNum;
	861	wordBitPtr.here = buffer.position();
	862	buffer.encodeDone();
	863
	864	// check for buffer overrun
	865	bitPtrs.CheckWordBufOverrun (wordNum);
	866	}
	867	}
	868	}
	869
	870	// combine the in memory inverted buffer with the disk
	871	// based inverted file
	872	static void DiskMerge (char *filename) {
	873	bool wordLevelIndex = ivfLevel.indexLevel.empty();
	874
	875	// make sure we have something to process
	876	if (numChunkDocs <= 0) return;
	877
	878	// open the inverted file
	879	FILE *invfFile = open_file (filename, INVF_SUFFIX, "rb+",
	880	MAGIC_INVF, MG_ABORT);
	881	random_bitio_buffer invfOutBuf (invfFile);
	882
	883	// set up to decode the entries in memory
	884	mems_bitio_buffer memInBuf ((u_char *) ivfMemBuf, 0);
	885
	886	// write out the word information
	887	unsigned long wordNum;
	888	int b;
	889	unsigned long currFragNum;
	890	unsigned long delta;
	891	unsigned long currFreq;
	892	for (wordNum=0; wordNum<idh.word_dict_size; wordNum++) {
	893	// go to the end of the last inverted file entry
	894	InvfStateRec &wordDiskState = invfState.GetRec (wordNum);
	895	invfOutBuf.SEEK_X (wordDiskState.here);
	896
	897	// go to the start of the inverted chunk info in memory
	898	BitPtr &wordBitPtr = bitPtrs.GetWordBitPtr (wordNum);
	899	memInBuf.seek(wordBitPtr.start);
	900
	901	// decode each entry and re-write to disk
	902	currFragNum = chunkStartFragNum;
	903	while (memInBuf.position() < wordBitPtr.here) {
	904	// decode word entry
	905	b = 1 << wordBitPtr.lgB;
	906	delta = memInBuf.bblock_decode (b, NULL);
	907	currFragNum += delta;
	908	if (!wordLevelIndex) currFreq = memInBuf.unary_decode (NULL);
	909	else currFreq = 1;
	910
	911	// recode on disk
	912	invfOutBuf.bblock_encode (currFragNum-wordDiskState.lastFragNum,
	913	wordDiskState.B, NULL);
	914	if (!wordLevelIndex) invfOutBuf.gamma_encode (currFreq, NULL);
	915	wordDiskState.lastFragNum = currFragNum;
	916	}
	917
	918	wordDiskState.here = invfOutBuf.TELL_X();
	919	}
	920
	921	// write out the tag information
	922	unsigned long tagNum;
	923	unsigned long currTagStart;
	924	unsigned long currTagEnd;
	925	for (tagNum=0; tagNum<idh.tag_dict_size; tagNum++) {
	926	// go to the end of the last inverted file entry
	927	InvfStateRec &tagDiskState = invfState.GetRec (tagNum+idh.word_dict_size);
	928	invfOutBuf.SEEK_X (tagDiskState.here);
	929
	930	// go to the start of the inverted chunk info in memory
	931	BitPtr &tagBitPtr = bitPtrs.GetTagBitPtr (tagNum);
	932	memInBuf.seek(tagBitPtr.start);
	933
	934	// decode each entry and re-write to disk
	935	currTagEnd = chunkStartFragNum;
	936	while (memInBuf.position() < tagBitPtr.here) {
	937	// decode tag entry
	938	b = 1 << tagBitPtr.lgB;
	939	delta = memInBuf.bblock_decode (b, NULL) - 1;
	940	currTagStart = currTagEnd + delta;
	941	delta = memInBuf.bblock_decode (b, NULL) - 1;
	942	currTagEnd = currTagStart + delta;
	943
	944	// recode on disk
	945	invfOutBuf.bblock_encode (currTagStart-tagDiskState.lastFragNum+1,
	946	tagDiskState.B, NULL);
	947	invfOutBuf.bblock_encode (currTagEnd-currTagStart+1,
	948	tagDiskState.B, NULL);
	949
	950	tagDiskState.lastFragNum = currTagEnd;
	951	}
	952
	953	tagDiskState.here = invfOutBuf.TELL_X();
	954	}
	955
	956	memInBuf.done();
	957
	958	invfOutBuf.encodeDone();
	959	fclose (invfFile);
	960	}
	961
	962
	963	int process_ivf_2 (const TagInfo &/tagInfo/, const TextElArray &doc) {
	964	bool wordLevelIndex = ivfLevel.indexLevel.empty();
	965	bool inFrag = false;
	966	if (wordLevelIndex) inFrag = true; // unconditional
	967
	968	// get next chunk information if need to. the chunk information
	969	// is needed before the first document is processed
	970	if (numChunkDocs >= numDocsInChunk) ReadChunk (idh, wordLevelIndex);
	971
	972	// process each text element
	973	TextElArray::const_iterator here = doc.begin();
	974	TextElArray::const_iterator end = doc.end();
	975	while (here != end) {
	976	// process this element
	977	if ((here).elType == OpenTagE) ProcessOpenTag (here, inFrag);
	978	else if ((here).elType == CloseTagE) ProcessCloseTag (here, inFrag);
	979	else ProcessText (*here, inFrag);
	980
	981	here++;
	982	}
	983
	984	// close off any unclosed tags
	985	TagMapDict::iterator tdHere = tagMapDict.begin();
	986	TagMapDict::iterator tdEnd = tagMapDict.end();
	987	while (tdHere != tdEnd) {
	988	CloseTextTag ((tdHere).second, (tdHere).first);
	989	tdHere++;
	990	}
	991
	992	// we've processed one more document
	993	numDocs++;
	994	numChunkDocs++;
	995
	996	// merge the memory based inverted file with the one on
	997	// disk if this is the end of this chunk
	998	if (numChunkDocs >= numDocsInChunk) DiskMerge (collectFilename);
	999
	1000	return COMPALLOK;
	1001	}
	1002
	1003
	1004	static void CondenseInvfFile (char *filename, unsigned long &bytesOutput) {
	1005	FILE *inInvfFile = open_file (filename, INVF_SUFFIX, "rb",
	1006	MAGIC_INVF, MG_ABORT);
	1007	FILE *outInvfFile = open_file (filename, INVF_SUFFIX, "rb+",
	1008	MAGIC_INVF, MG_ABORT);
	1009
	1010	// skip the magic number
	1011	fseek (outInvfFile, sizeof (unsigned long), SEEK_SET);
	1012
	1013	// write the inverted file header -- use defaults for most things
	1014	invf_file_header ifh;
	1015	ifh.no_of_words = idh.word_dict_size;
	1016	ifh.no_of_tags = idh.tag_dict_size;
	1017	ifh.word_level_index = (ivfLevel.indexLevel.empty()) ? 1 : 0;
	1018	ifh.Write (outInvfFile);
	1019
	1020	bytesOutput = ftell (outInvfFile);
	1021
	1022	// process each meaningful byte in the file
	1023	unsigned long numEntries = ifh.no_of_words + ifh.no_of_tags;
	1024	unsigned long entryNum;
	1025	mg_ullong lastStart = 0;
	1026	for (entryNum = 0; entryNum < numEntries; entryNum++) {
	1027	InvfStateRec &stateRec = invfState.GetRec (entryNum);
	1028
	1029	// overrun check
	1030	if (stateRec.start < lastStart)
	1031	FatalError (1, "Inverted file Buffer overrun");
	1032	lastStart = stateRec.start;
	1033
	1034	unsigned long oldEntryStart = stateRec.start >> 3;
	1035	unsigned long oldEntryStartOver = stateRec.start & 7; // should be 0
	1036	unsigned long oldEntryEnd = (stateRec.here + 7) >> 3; // byte after end
	1037	unsigned long oldEntryEndOver = stateRec.here & 7;
	1038
	1039	fseek (inInvfFile, oldEntryStart, SEEK_SET);
	1040
	1041	stateRec.here -= stateRec.start;
	1042	stateRec.start = bytesOutput * 8 + oldEntryStartOver;
	1043	stateRec.here += stateRec.start;
	1044	while (oldEntryStart < oldEntryEnd) {
	1045	unsigned char c = getc (inInvfFile);
	1046	if (oldEntryStart == oldEntryEnd - 1) {
	1047	u_char ands[8] =
	1048	{0xff, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe};
	1049	c &= ands[oldEntryEndOver];
	1050	}
	1051	putc (c, outInvfFile);
	1052	bytesOutput++;
	1053	oldEntryStart++;
	1054	}
	1055	}
	1056
	1057	fclose (inInvfFile);
	1058
	1059	#ifdef __WIN32__
	1060	if (!(_chsize (_fileno (outInvfFile), bytesOutput)))
	1061	Message ("Could not truncate invf.");
	1062	#else
	1063	ftruncate (fileno (outInvfFile), bytesOutput);
	1064	#endif
	1065
	1066	fclose (outInvfFile);
	1067	}
	1068
	1069	static void OutputInvfIdx (char *filename, unsigned long invfNumBytes) {
	1070	FILE *invfIdxFile = create_file (filename, INVF_IDX_SUFFIX, "wb",
	1071	MAGIC_INVI, MG_ABORT);
	1072
	1073	// process each meaningful byte in the file
	1074	unsigned long numEntries = idh.word_dict_size + idh.tag_dict_size;
	1075	unsigned long entryNum;
	1076	for (entryNum = 0; entryNum < numEntries; entryNum++) {
	1077	InvfStateRec &stateRec = invfState.GetRec (entryNum);
	1078
	1079	// assumes that inverted entries start at beginning of each byte
	1080	if (!WriteUL (invfIdxFile, (stateRec.start >> 3))) break;
	1081	}
	1082
	1083	WriteUL (invfIdxFile, invfNumBytes);
	1084
	1085	fclose (invfIdxFile);
	1086	}
	1087
	1088
	1089	int done_ivf_2 (const TagInfo &/tagInfo/, char *filename) {
	1090	// close most open files
	1091	if (chunkFile != NULL) {
	1092	chunkBuf.done();
	1093	fclose (chunkFile);
	1094	chunkFile = NULL;
	1095	}
	1096	occurConvert.Close();
	1097
	1098	// free allocated memory
	1099	bitPtrs.Clear();
	1100	if (ivfMemBuf != NULL) { delete [] ivfMemBuf; ivfMemBuf = NULL; }
	1101	free_perf_hash (wordHashDict);
	1102	wordHashDict = NULL;
	1103	tagMapDict.erase (tagMapDict.begin(), tagMapDict.end());
	1104
	1105	// condense the inverted file and truncate it
	1106	// this function also writes out the inverted header
	1107	unsigned long invfNumBytes = 0;
	1108	CondenseInvfFile (filename, invfNumBytes);
	1109
	1110	OutputInvfIdx (filename, invfNumBytes);
	1111
	1112	// close the rest of the open files
	1113	invfState.Close ();
	1114
	1115	return COMPALLOK;
	1116	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: