Context Navigation

source: trunk/gsdl/src/mgpp/text/ivf.pass2.cpp@ 2448

Last change on this file since 2448 was 2448, checked in by kjm18, 23 years ago
fixed unsigned long overflow error in Condense Invf function bytesOutput cast to mg_ullong.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 31.6 KB

Line
1	/**************************************************************************
2	*
3	* ivf.pass2.cpp -- Memory efficient pass 2 inversion
4	* Copyright (C) 1999 Rodger McNab
5	*
6	* This program is free software; you can redistribute it and/or modify
7	* it under the terms of the GNU General Public License as published by
8	* the Free Software Foundation; either version 2 of the License, or
9	* (at your option) any later version.
10	*
11	* This program is distributed in the hope that it will be useful,
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	* GNU General Public License for more details.
15	*
16	* You should have received a copy of the GNU General Public License
17	* along with this program; if not, write to the Free Software
18	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19	*
20	* $Id: ivf.pass2.cpp 2448 2001-05-18 03:20:10Z kjm18 $
21	*
22	**************************************************************************/
23
24	/*
25	$Log$
26	Revision 1.3 2001/05/18 03:20:10 kjm18
27	fixed unsigned long overflow error in Condense Invf function
28	bytesOutput cast to mg_ullong.
29
30	Revision 1.2 2001/05/17 04:52:38 jrm21
31	added some #defines for proper portability.
32	(eg ftruncate needs _XOPEN_SOURCE_EXTENDED)
33
34	Revision 1.1 2000/01/14 02:26:07 sjboddie
35	Rodgers new C++ mg
36
37	*/
38	#define _XOPEN_SOURCE 1
39	#define _XOPEN_SOURCE_EXTENDED 1
40	#include <stdio.h>
41	#include <unistd.h>
42
43	#include "non_ansi.h"
44
45	#include "sysfuncs.h"
46
47	#include "mg_files.h"
48	#include "invf.h"
49	#include "mg.h"
50	#include "build.h"
51	#include "locallib.h"
52	#include "UCArray.h"
53	#include "bitio_m_random.h"
54	#include "bitio_m_stdio.h"
55	#include "bitio_m_mems.h"
56	#include "bitio_gen.h"
57	#include <stdio.h>
58	#include "words.h"
59	#include "messages.h"
60	#include "netorder.h"
61	#include "FIvfLevelInfo.h"
62	#include "perf_hash.h"
63	#include "string.h"
64
65	#include "longlong.h"
66
67	#ifdef __WIN32_
68	#include <io.h>
69	#endif
70
71	#if defined(GSDL_USE_OBJECTSPACE)
72	# include <ospace\std\map>
73	#elif defined(GSDL_USE_STL_H)
74	# include <map.h>
75	#else
76	# include <map>
77	#endif
78
79
80	#ifdef USE_LONG_LONG
81	#define SEEK_X seek_LL
82	#define TELL_X tell_LL
83	#else
84	#define SEEK_X seek
85	#define TELL_X tell
86	#endif
87
88	#ifndef RND_BUF_SIZE
89	#define RND_BUF_SIZE 8*1024
90	#endif
91
92
93	static unsigned long numDocs = 0;
94	static unsigned long numChunkDocs = 0;
95	static unsigned long numDocsInChunk = 0;
96
97	static unsigned long numFrags = 0;
98	static unsigned long numFragsInChunk = 0;
99	static unsigned long chunkStartFragNum = 0;
100
101
102
103
104	struct BitPtr {
105	unsigned long start;
106	unsigned long here;
107	unsigned long lastFragNum;
108	unsigned long lgB;
109
110	void Clear () { start = here = lastFragNum = lgB = 0; }
111	BitPtr () { Clear(); }
112	};
113
114	class WordBitPtrs {
115	protected:
116	unsigned long numWords;
117	unsigned long numTags;
118	unsigned long size;
119	BitPtr *wordBitPtrs;
120
121	void CheckBufOverrun (unsigned long num) {
122	if (wordBitPtrs[num].here > wordBitPtrs[num+1].start) {
123	cerr << "numDocs: " << numDocs << "\n";
124	cerr << "numChunkDocs: " << numChunkDocs << "\n";
125	cerr << "numDocsInChunk: " << numDocsInChunk << "\n";
126	cerr << "numFrags: " << numFrags << "\n";
127	cerr << "numFragsInChunk: " << numFragsInChunk << "\n";
128	cerr << "chunkStartFragNum: " << chunkStartFragNum << "\n";
129	cerr << "num: " << num << "\n";
130	cerr << "[num].start: " << wordBitPtrs[num].start << "\n";
131	cerr << "[num].here: " << wordBitPtrs[num].here << "\n";
132	cerr << "[num+1].start: " << wordBitPtrs[num+1].start << "\n";
133	FatalError (1, "Bit buffer overrun");
134	}
135	}
136
137	public:
138	void Clear ();
139	WordBitPtrs () { wordBitPtrs = NULL; Clear(); }
140	~WordBitPtrs ();
141	void SetSize (unsigned long _numWords,
142	unsigned long _numTags);
143
144	void ResetPtrs () {
145	if (wordBitPtrs == NULL) return;
146	unsigned long i;
147	for (i=0; i<size; i++) wordBitPtrs[i].Clear();
148	}
149
150	BitPtr &GetWordBitPtr (unsigned long wordNum)
151	{ return wordBitPtrs[wordNum]; }
152	unsigned long &GetWordStart (unsigned long wordNum)
153	{ return wordBitPtrs[wordNum].start; }
154	unsigned long &GetWordHere (unsigned long wordNum)
155	{ return wordBitPtrs[wordNum].here; }
156	void CheckWordBufOverrun (unsigned long wordNum)
157	{ CheckBufOverrun (wordNum); }
158
159	BitPtr &GetTagBitPtr (unsigned long tagNum)
160	{ return wordBitPtrs[tagNum + numWords]; }
161	unsigned long &GetTagStart (unsigned long tagNum)
162	{ return wordBitPtrs[tagNum + numWords].start; }
163	unsigned long &GetTagHere (unsigned long tagNum)
164	{ return wordBitPtrs[tagNum + numWords].here; }
165	void CheckTagBufOverrun (unsigned long tagNum)
166	{ CheckBufOverrun (tagNum + numWords); }
167
168	BitPtr &GetEndBitPtr ()
169	{ return wordBitPtrs[size-1]; }
170	unsigned long &GetEndStart ()
171	{ return wordBitPtrs[size-1].start; }
172	unsigned long &GetEndHere ()
173	{ return wordBitPtrs[size-1].here; }
174	};
175
176
177	struct IP2TagInfo {
178	bool inTag;
179	unsigned long startFrag;
180	unsigned long tagNum;
181
182	IP2TagInfo () {
183	inTag = false;
184	startFrag = 0;
185	tagNum = 0;
186	}
187	};
188
189	// maps tags to tag information
190	typedef map<UCArray, IP2TagInfo, DictLTUCArray> TagMapDict;
191
192
193	// class to handle the translation of occurrence order
194	// to dictionary order for words and tags
195	class OccurToDictConverter {
196	protected:
197	unsigned long pos;
198	unsigned long val;
199	FILE *transFile;
200	random_bitio_buffer rbs;
201
202	unsigned long wordDictSize;
203	unsigned long tagDictSize;
204
205	void SeekStart ();
206	unsigned long TranslateNum (unsigned long num);
207
208	public:
209	OccurToDictConverter ();
210	~OccurToDictConverter ();
211
212	void Open (char *filename, unsigned long _wordDictSize,
213	unsigned long _tagDictSize);
214
215	// Close frees all allocated memory
216	void Close ();
217
218	unsigned long TranslateWord (unsigned long occurNum)
219	{ return TranslateNum (occurNum); }
220	unsigned long TranslateTag (unsigned long occurNum)
221	{ return TranslateNum (occurNum+wordDictSize); }
222	};
223
224
225	struct InvfStateRec {
226	mg_ullong start;
227	mg_ullong here;
228	unsigned long lastFragNum;
229	unsigned long B;
230
231	void Clear () {
232	start = here = 0;
233	lastFragNum = B = 0;
234	}
235	InvfStateRec () { Clear (); }
236	};
237
238
239	#define ISR_SIZE 1024
240
241	class InvfStateCache {
242	protected:
243	InvfStateRec recCache [ISR_SIZE];
244	unsigned long startNum;
245
246	FILE *stateFile;
247
248	void ClearCache () {
249	unsigned int i = 0;
250	for (i=0; i<ISR_SIZE; i++) recCache[i].Clear();
251	}
252
253	public:
254	InvfStateCache ();
255	~InvfStateCache ();
256
257	void Open (char *filename);
258	void Close ();
259
260	// previous references to state records may be
261	// invalidated calling GetRec
262	InvfStateRec &GetRec (unsigned long num);
263	};
264
265
266	static invf_dict_header idh;
267	static WordBitPtrs bitPtrs;
268
269	static FILE *chunkFile = NULL;
270	static stdio_bitio_buffer chunkBuf;
271
272	static unsigned long ivfMemBufSize = 0;
273	static char *ivfMemBuf = NULL;
274
275	// word and tag dictionaries. a map is used for the tag dictionary
276	// as it should never be very big (and the perfect hash function
277	// sometimes has trouble with small values).
278	static perf_hash_data *wordHashDict = NULL;
279	static TagMapDict tagMapDict;
280
281	// information about all the different levels
282	static FIvfLevel ivfLevel;
283
284	static OccurToDictConverter occurConvert;
285
286	// information about the state of the inverted file
287	static InvfStateCache invfState;
288
289	static char collectFilename[512];
290
291
292	void WordBitPtrs::Clear () {
293	numWords = 0;
294	numTags = 0;
295	size=0;
296	if (wordBitPtrs != NULL) delete [] wordBitPtrs;
297	wordBitPtrs = NULL;
298	}
299
300	WordBitPtrs::~WordBitPtrs () {
301	if (wordBitPtrs != NULL) delete [] wordBitPtrs;
302	}
303
304	void WordBitPtrs::SetSize (unsigned long _numWords,
305	unsigned long _numTags){
306	Clear();
307	numWords = _numWords;
308	numTags = _numTags;
309	size = numWords + numTags + 1;
310	wordBitPtrs = new BitPtr [size];
311	}
312
313
314	void OccurToDictConverter::SeekStart () {
315	if (transFile == NULL) return;
316	rbs.SEEK_X (sizeof (unsigned long) * 8);
317	pos = 0;
318	}
319
320	unsigned long OccurToDictConverter::TranslateNum (unsigned long num) {
321	if (num < pos) SeekStart ();
322	while (pos <= num) {
323	if (pos < wordDictSize)
324	val = rbs.binary_decode (wordDictSize + 1, NULL) - 1;
325	else
326	val = rbs.binary_decode (tagDictSize + 1, NULL) - 1;
327	pos++;
328	}
329	return val;
330	}
331
332	OccurToDictConverter::OccurToDictConverter () {
333	pos = 0;
334	val = 0;
335	transFile = NULL;
336	wordDictSize = 0;
337	tagDictSize = 0;
338	}
339
340	OccurToDictConverter::~OccurToDictConverter () {
341	if (transFile != NULL) Close ();
342	}
343
344	void OccurToDictConverter::Open (char *filename, unsigned long _wordDictSize,
345	unsigned long _tagDictSize) {
346	if (transFile != NULL) Close ();
347
348	wordDictSize = _wordDictSize;
349	tagDictSize = _tagDictSize;
350
351	transFile = open_file (filename, INVF_CHUNK_TRANS_SUFFIX, "rb",
352	MAGIC_CHUNK_TRANS, MG_ABORT);
353	rbs.attachFile (transFile, RND_BUF_SIZE);
354	SeekStart ();
355	val = 0;
356	}
357
358	void OccurToDictConverter::Close () {
359	if (transFile == NULL) return;
360
361	rbs.done ();
362	fclose (transFile);
363	transFile = NULL;
364	pos = 0;
365	val = 0;
366
367	wordDictSize = 0;
368	tagDictSize = 0;
369	}
370
371
372
373
374	InvfStateCache::InvfStateCache () {
375	startNum = 0;
376	stateFile = NULL;
377	}
378
379	InvfStateCache::~InvfStateCache () {
380	if (stateFile != NULL) Close ();
381	}
382
383	void InvfStateCache::Open (char *filename) {
384	if (stateFile != NULL) Close();
385
386	// open the state file
387	char path[512];
388	sprintf (path, FILE_NAME_FORMAT ".%ld", get_basepath (), filename,
389	".invf.state", (long) getpid ());
390	if (!(stateFile = fopen (path, "wb+"))) {
391	Message ("Unable to create \"%s\"", path);
392	exit (1);
393	}
394	unlink (path); // file will be deleted after it is closed
395
396	// reset the buffer
397	startNum = 0;
398	ClearCache();
399	}
400
401	void InvfStateCache::Close () {
402	if (stateFile == NULL) return;
403	fclose (stateFile);
404	stateFile = NULL;
405	startNum = 0;
406	}
407
408	InvfStateRec &InvfStateCache::GetRec (unsigned long num) {
409	// see if cached
410	if ((num >= startNum) && (num < startNum + ISR_SIZE))
411	return recCache[num-startNum];
412
413	// not cached, write out this lot of records and read in
414	fseek (stateFile, startNum*sizeof (InvfStateRec), SEEK_SET);
415	fwrite ((char *) recCache, sizeof (InvfStateRec), ISR_SIZE, stateFile);
416
417	// read in the new set of records
418	ClearCache ();
419	startNum = num - (num % ISR_SIZE);
420	fseek (stateFile, startNum*sizeof (InvfStateRec), SEEK_SET);
421	fread ((char *) recCache, sizeof (InvfStateRec), ISR_SIZE, stateFile);
422
423	return recCache[num-startNum];
424	}
425
426
427
428	static void ClearCharBuf (char *buf, unsigned long size) {
429	char *end = buf + size;
430	while (buf != end) *buf++ = 0;
431	}
432
433	static void ReadWordDict (char *filename) {
434	// read in the perfect hash function for the word dictionary
435	FILE *wordHashFile = open_file (filename, INVF_DICT_HASH_SUFFIX, "rb",
436	MAGIC_HASH, MG_ABORT);
437	if (!(wordHashDict = read_perf_hash_data (wordHashFile))) {
438	FatalError (1, "Unable to read in hash data for word dictionary");
439	}
440	fclose (wordHashFile);
441	}
442
443	static void ReadTagDict (char *filename, invf_dict_header &_idh) {
444	// open the file
445	FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
446	MAGIC_STEM_BUILD, MG_ABORT);
447
448	// seek to the start of the tag dictionary
449	fseek (dictFile, _idh.tag_dict_start, SEEK_SET);
450
451	unsigned long tagNum;
452	dict_el thisEl;
453	for (tagNum = 0; tagNum < _idh.tag_dict_size; tagNum++) {
454	thisEl.Read (dictFile);
455	tagMapDict[thisEl.el].tagNum = tagNum;
456	}
457
458	fclose (dictFile);
459	}
460
461	static void ReadLevelFile (char *filename) {
462	FILE *f;
463	f = open_file (filename, INVF_LEVEL_SUFFIX, "rb",
464	MAGIC_INVF_LEVELS, MG_ABORT);
465	ivfLevel.Read (f);
466	fclose (f);
467	}
468
469	void CheckIntOverflow (mg_ullong totalIBits, mg_ullong lastTotalIBits) {
470	if (totalIBits < lastTotalIBits) {
471	fprintf(stderr, "ERROR: The totalIBits counter (%d byte unsigned integer) has overflowed.\n", sizeof (mg_ullong));
472	if (sizeof (mg_ullong) < 8) {
473	fprintf(stderr, " Try compiling with GCC to enable use of 8 bytes for this counter.\n");
474	}
475	fprintf(stderr, " Build aborted.\n");
476	exit(1);
477	}
478	}
479
480	// assumes the inverted file state file has been opened
481	static void InitInvfState (char *filename,
482	invf_dict_header &_idh,
483	InvfStateCache &_invfState,
484	mg_ullong &totalIBits,
485	bool wordLevelIndex) {
486	// read in the dictionary, setting inverted state information
487	FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
488	MAGIC_STEM_BUILD, MG_ABORT);
489
490	// seek to the start of the word dictionary
491	fseek (dictFile, _idh.word_dict_start, SEEK_SET);
492
493	// add the word entries
494	word_dict_el wordEl;
495	wordEl.SetNumLevels (_idh.num_levels);
496	unsigned long dictWordNum, p;
497	mg_ullong lastTotalIBits;
498	unsigned long N = _idh.num_frags;
499	for (dictWordNum=0; dictWordNum<_idh.word_dict_size; dictWordNum++) {
500	// lastTotalIBits is used to detect integer overflow
501	lastTotalIBits = totalIBits;
502
503	// read the next word and associated information
504	wordEl.Read (dictFile, _idh.num_levels);
505
506	// update the state record
507	p = wordEl.frag_occur;
508	InvfStateRec &wisr = _invfState.GetRec (dictWordNum);
509	wisr.start = totalIBits;
510	wisr.here = totalIBits;
511	wisr.B = BIO_Bblock_Init (N, p);
512
513	// add the length of the fragment numbers
514	totalIBits += BIO_Bblock_Bound_b (N, p, wisr.B);
515
516	// if needed, add the length of the fragment frequency information
517	if (!wordLevelIndex)
518	totalIBits += BIO_Gamma_Bound (wordEl.freq, wordEl.frag_occur);
519
520	// align next byte
521	#ifdef USE_LONG_LONG
522	totalIBits = (totalIBits + 7ull) & 0xfffffffffffffff8ull;
523	#else
524	totalIBits = (totalIBits + 7ul) & 0xfffffff8ul;
525	#endif
526
527	CheckIntOverflow (totalIBits, lastTotalIBits);
528	}
529
530
531	// seek to the start of the tag dictionary
532	fseek (dictFile, _idh.tag_dict_start, SEEK_SET);
533
534	// add the tag entries
535	dict_el tagEl;
536	unsigned long dictTagNum;
537	N = _idh.num_frags;
538	for (dictTagNum=0; dictTagNum<_idh.tag_dict_size; dictTagNum++) {
539	// lastTotalIBits is used to detect integer overflow
540	lastTotalIBits = totalIBits;
541
542	// read the next tag and associated information
543	tagEl.Read (dictFile);
544
545	// update the state record
546	p = tagEl.frag_occur*2;
547	InvfStateRec &tisr = _invfState.GetRec (dictTagNum + _idh.word_dict_size);
548	tisr.start = totalIBits;
549	tisr.here = totalIBits;
550	tisr.B = BIO_Bblock_Init (N+p, p);
551
552	// add the length of the fragment numbers (two numbers for each
553	// tag, one for start and one for end)
554	totalIBits += BIO_Bblock_Bound_b (N+p, p, tisr.B);
555
556	// align next byte
557	#ifdef USE_LONG_LONG
558	totalIBits = (totalIBits + 7ull) & 0xfffffffffffffff8ull;
559	#else
560	totalIBits = (totalIBits + 7ul) & 0xfffffff8ul;
561	#endif
562
563	CheckIntOverflow (totalIBits, lastTotalIBits);
564	}
565
566	fclose (dictFile);
567	}
568
569	/*
570	// assumes the chunk tag information has been placed in .first
571	static void PrintChunkInfo (unsigned long chunkMem,
572	unsigned long numChunkWords,
573	unsigned long numChunkTags) {
574	static unsigned long chunksRead = 0;
575	chunksRead++;
576	cout << "Chunk Number: " << chunksRead << "\n";
577	cout << "numChunkDocs " << numDocsInChunk << "\n";
578	cout << "numChunkFrags " << numFragsInChunk << "\n";
579	cout << "mem " << chunkMem << "\n";
580	cout << "numWords " << numChunkWords << "\n";
581	cout << "numTags " << numChunkTags << "\n\n";
582
583	TagMapDict::iterator tagMapHere = tagMapDict.begin();
584	TagMapDict::iterator tagMapEnd = tagMapDict.end();
585	while (tagMapHere != tagMapEnd) {
586	unsigned long tagMapNum = (*tagMapHere).second.tagNum;
587	cout << (*tagMapHere).first << " " << tagMapNum << " "
588	<< bitPtrs.GetTagBitPtr(tagMapNum).here << "\n";
589	tagMapHere++;
590	}
591	}
592	*/
593
594	void ReadChunk (invf_dict_header &_idh, bool wordLevelIndex) {
595	// reset globals
596	numChunkDocs = 0;
597	chunkStartFragNum = numFrags;
598
599	// read in information about this chunk
600	numDocsInChunk = chunkBuf.gamma_decode (NULL) - 1;
601	if (numDocsInChunk == 0)
602	FatalError (1, "The number of docs in the current chunk is 0");
603
604	numFragsInChunk = chunkBuf.gamma_decode (NULL) - 1;
605	unsigned long chunkMem = chunkBuf.gamma_decode (NULL) - 1;
606
607	if (chunkMem > ivfMemBufSize)
608	FatalError (1, "Chunk memory size is greater than maximum");
609
610	unsigned long numChunkWords = chunkBuf.gamma_decode (NULL) - 1;
611	unsigned long numChunkTags = chunkBuf.gamma_decode (NULL) - 1;
612
613
614	// reset stuff
615	ClearCharBuf (ivfMemBuf, ivfMemBufSize);
616	bitPtrs.ResetPtrs();
617
618	// read in the entries in occurrence order storing the
619	// "chunkWordCount" in "start" and the "chunkFragCount"
620	// in "here"
621	unsigned long numOccur;
622	unsigned long wordNum;
623	for (numOccur=0; numOccur<numChunkWords; numOccur++) {
624	wordNum = occurConvert.TranslateWord (numOccur);
625	BitPtr &wordPtr = bitPtrs.GetWordBitPtr (wordNum);
626	wordPtr.start = chunkBuf.gamma_decode (NULL) - 1;
627	if (wordPtr.start >= 2)
628	wordPtr.here = chunkBuf.gamma_decode (NULL);
629	else wordPtr.here = wordPtr.start;
630	}
631	unsigned long tagNum;
632	for (numOccur=0; numOccur<numChunkTags; numOccur++) {
633	tagNum = occurConvert.TranslateTag (numOccur);
634	BitPtr &tagPtr = bitPtrs.GetTagBitPtr (tagNum);
635	// only chunkFragCount is encoded for tags
636	tagPtr.start = chunkBuf.gamma_decode (NULL) - 1;
637	tagPtr.here = tagPtr.start;
638	}
639
640	/* PrintChunkInfo (chunkMem, numChunkWords, numChunkTags);*/
641
642	// create the bit ptrs in dictionary order
643	unsigned long totalIBits = 0; // only dealing with memory
644	unsigned long chunkWordCount, chunkFragCount;
645	for (wordNum=0; wordNum<_idh.word_dict_size; wordNum++) {
646	BitPtr &wordPtr = bitPtrs.GetWordBitPtr (wordNum);
647	chunkWordCount = wordPtr.start;
648	chunkFragCount = wordPtr.here;
649	wordPtr.start = totalIBits;
650	wordPtr.here = totalIBits;
651	wordPtr.lastFragNum = chunkStartFragNum;
652	wordPtr.lgB = 0;
653	if (chunkWordCount > 0) {
654	wordPtr.lgB = floorlog_2 (BIO_Bblock_Init_W (numFragsInChunk,
655	chunkFragCount));
656	totalIBits += BIO_Bblock_Bound (numFragsInChunk, chunkFragCount);
657	// use unary encoding for memory buffer encoding of fragment freq
658	if (!wordLevelIndex) {
659	totalIBits += chunkWordCount;
660	}
661	}
662	}
663	for (tagNum=0; tagNum<_idh.tag_dict_size; tagNum++) {
664	BitPtr &tagPtr = bitPtrs.GetTagBitPtr (tagNum);
665	chunkFragCount = tagPtr.here;
666	tagPtr.start = totalIBits;
667	tagPtr.here = totalIBits;
668	tagPtr.lastFragNum = chunkStartFragNum;
669	tagPtr.lgB = 0;
670	if (chunkFragCount > 0) {
671	unsigned long pTag = chunkFragCount*2;
672	tagPtr.lgB = floorlog_2 (BIO_Bblock_Init_W (numFragsInChunk+pTag,
673	pTag));
674	unsigned long bLen = BIO_Bblock_Bound (numFragsInChunk+pTag,
675	pTag);
676	// cout << tagNum + _idh.word_dict_size << " ";
677	// cout << "numFrags: " << numFragsInChunk
678	// << " chunkFragCount: " << chunkFragCount
679	// << " B: " << 1 << tagPtr.lgB
680	// << " blen: " << blen << "\n";
681	totalIBits += bLen;
682	}
683	}
684	bitPtrs.GetEndStart() = totalIBits;
685	bitPtrs.GetEndHere() = totalIBits;
686
687	if ((totalIBits + 7ul) >> 3ul > chunkMem) {
688	cerr << "totalIBits: " << totalIBits << "\n";
689	cerr << "bytes: " << ((totalIBits + 7ul) >> 3ul) << "\n";
690	cerr << "chunkMem: " << chunkMem << "\n";
691	FatalError (1, "Pointers exceed buffer size");
692	}
693	}
694
695
696
697
698	int init_ivf_2 (const TagInfo &/tagInfo/, char *filename) {
699	// read in compressed dictionary header
700	FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
701	MAGIC_STEM_BUILD, MG_ABORT);
702	idh.Read (dictFile);
703	fclose (dictFile);
704
705	// set the size of the bit ptrs
706	bitPtrs.SetSize (idh.word_dict_size, idh.tag_dict_size);
707
708	// open the chunk file and read in the maximum memory needed
709	// for the inverted memory buffer
710	chunkFile = open_file (filename, INVF_CHUNK_SUFFIX, "rb",
711	MAGIC_CHUNK, MG_ABORT);
712	ReadUL (chunkFile, ivfMemBufSize);
713	chunkBuf.attachFile (chunkFile);
714
715	// allocate memory for the inverted buffer
716	ivfMemBuf = new char [ivfMemBufSize];
717	ClearCharBuf (ivfMemBuf, ivfMemBufSize);
718
719	// read in the word dictionary
720	ReadWordDict (filename);
721
722	// read in the tag dictionary
723	ReadTagDict (filename, idh);
724
725	// read in the level information
726	ReadLevelFile (filename);
727	bool wordLevelIndex = ivfLevel.indexLevel.empty();
728
729	// set up the translation table file
730	occurConvert.Open (filename, idh.word_dict_size, idh.tag_dict_size);
731
732	// reset some globals
733	numDocs = 0;
734	numChunkDocs = 0;
735	numDocsInChunk = 0;
736	numFrags = 0;
737	numFragsInChunk = 0;
738	chunkStartFragNum = 0;
739
740	strcpy (collectFilename, filename);
741
742
743	// create the inverted file
744	mg_ullong totalIBits = 0;
745	FILE *invfFile = create_file (filename, INVF_SUFFIX, "wb",
746	MAGIC_INVF, MG_ABORT);
747	totalIBits += sizeof (unsigned long) * 8; // magic number
748	totalIBits += 8 * 200; // 200 byte gap -- why??????
749	fclose (invfFile);
750
751	// init the inverted file state cache
752	invfState.Open (filename);
753	InitInvfState (filename, idh, invfState, totalIBits, wordLevelIndex);
754
755	return COMPALLOK;
756	}
757
758	static void CloseTextTag (IP2TagInfo &tInfo, const UCArray &/tagName/) {
759	if (!tInfo.inTag) return;
760
761	// add this tag to the inverted list
762	BitPtr &tagBitPtr = bitPtrs.GetTagBitPtr (tInfo.tagNum);
763	unsigned long endFrag = numFrags;
764	int b = 1 << tagBitPtr.lgB;
765
766	/*
767	cout << (tInfo.tagNum+idh.word_dict_size) << " \"<" << tagName << ">\" "
768	<< tInfo.startFrag << " " << endFrag << "\n";
769	*/
770
771	mems_bitio_buffer buffer ((u_char *) ivfMemBuf, tagBitPtr.here);
772	buffer.bblock_encode (tInfo.startFrag - tagBitPtr.lastFragNum + 1,
773	b, NULL);
774	buffer.bblock_encode (endFrag - tInfo.startFrag + 1, b, NULL);
775	tagBitPtr.lastFragNum = endFrag;
776	tagBitPtr.here = buffer.position();
777	buffer.encodeDone();
778
779	// check for buffer overrun
780	bitPtrs.CheckTagBufOverrun (tInfo.tagNum);
781
782	// reset information about this tag
783	tInfo.inTag = false;
784	tInfo.startFrag = 0;
785	}
786
787	static void ProcessOpenTag (const TextEl &el, bool &inFrag) {
788	// close tag if open
789	IP2TagInfo &tInfo = tagMapDict[el.tagName];
790	if (tInfo.inTag) CloseTextTag (tInfo, el.tagName);
791
792	// open this tag
793	tInfo.inTag = true;
794	tInfo.startFrag = numFrags;
795
796	// check for start of next fragment
797	bool wordLevelIndex = ivfLevel.indexLevel.empty();
798	if (!wordLevelIndex && el.tagName == ivfLevel.indexLevel) {
799	numFrags++;
800	inFrag = true;
801	}
802	}
803
804	static void ProcessCloseTag (const TextEl &el, bool &inFrag) {
805	// check for end of fragment
806	bool wordLevelIndex = ivfLevel.indexLevel.empty();
807	if (!wordLevelIndex && el.tagName == ivfLevel.indexLevel) {
808	inFrag = false;
809	}
810
811	IP2TagInfo &tInfo = tagMapDict[el.tagName];
812	CloseTextTag (tInfo, el.tagName);
813	}
814
815	static void ProcessText (const TextEl &el, bool &inFrag) {
816	// make sure this text is to be indexed
817	bool wordLevelIndex = ivfLevel.indexLevel.empty();
818	if (!wordLevelIndex && !inFrag) return;
819
820	const unsigned char *textHere = el.text.begin();
821	const unsigned char *textEnd = el.text.end() - 1;
822	unsigned char mgWord[MAXSTEMLEN + 1];
823
824	if (!inaword (textHere, textEnd))
825	ParseNonindexWord (textHere, textEnd);
826
827
828	// Alternately parse off words and non-words from the input
829
830	while (textHere <= textEnd) {
831	textHere = ParseIndexMGWord (textHere, textEnd, mgWord);
832	textHere = ParseNonindexWord (textHere, textEnd);
833
834	if (mgWord[0] > 0) {
835	if (wordLevelIndex) numFrags++;
836
837	unsigned long wordNum = perf_hash (wordHashDict, mgWord);
838
839	/*
840	cout << wordNum << " \"";
841	cout.write (mgWord+1, *mgWord);
842	cout << "\" " << numFrags << "\n";
843	*/
844
845	// add this word to the inverted list
846	BitPtr &wordBitPtr = bitPtrs.GetWordBitPtr (wordNum);
847	unsigned long fragNum = numFrags;
848	int b = 1 << wordBitPtr.lgB;
849
850	mems_bitio_buffer buffer ((u_char *) ivfMemBuf, wordBitPtr.here);
851
852	// note: this assumes that fragments don't carry over between
853	// chunks (which they don't because all tags are closed at the
854	// end of each document and chunks are based on document
855	// boundaries), i.e. the first fragment number must be greater
856	// than the starting fragment number of the chunk.
857	if (fragNum > wordBitPtr.lastFragNum) {
858	buffer.bblock_encode ((fragNum - wordBitPtr.lastFragNum - 1) + 1,
859	b, NULL);
860	if (!wordLevelIndex) buffer.encodeBit (1); // freq = 1
861
862	} else if (!wordLevelIndex) {
863	// add one to the frequency count for this word
864	buffer.seek (buffer.position()-1);
865	buffer.encodeBit (0); // unary encoding -- last = 1
866	buffer.encodeBit (1);
867	}
868
869	wordBitPtr.lastFragNum = fragNum;
870	wordBitPtr.here = buffer.position();
871	buffer.encodeDone();
872
873	// check for buffer overrun
874	bitPtrs.CheckWordBufOverrun (wordNum);
875	}
876	}
877	}
878
879	// combine the in memory inverted buffer with the disk
880	// based inverted file
881	static void DiskMerge (char *filename) {
882	bool wordLevelIndex = ivfLevel.indexLevel.empty();
883
884	// make sure we have something to process
885	if (numChunkDocs <= 0) return;
886
887	// open the inverted file
888	FILE *invfFile = open_file (filename, INVF_SUFFIX, "rb+",
889	MAGIC_INVF, MG_ABORT);
890	random_bitio_buffer invfOutBuf (invfFile);
891
892	// set up to decode the entries in memory
893	mems_bitio_buffer memInBuf ((u_char *) ivfMemBuf, 0);
894
895	// write out the word information
896	unsigned long wordNum;
897	int b;
898	unsigned long currFragNum;
899	unsigned long delta;
900	unsigned long currFreq;
901	for (wordNum=0; wordNum<idh.word_dict_size; wordNum++) {
902	// go to the end of the last inverted file entry
903	InvfStateRec &wordDiskState = invfState.GetRec (wordNum);
904	invfOutBuf.SEEK_X (wordDiskState.here);
905
906	// go to the start of the inverted chunk info in memory
907	BitPtr &wordBitPtr = bitPtrs.GetWordBitPtr (wordNum);
908	memInBuf.seek(wordBitPtr.start);
909
910	// decode each entry and re-write to disk
911	currFragNum = chunkStartFragNum;
912	while (memInBuf.position() < wordBitPtr.here) {
913	// decode word entry
914	b = 1 << wordBitPtr.lgB;
915	delta = memInBuf.bblock_decode (b, NULL);
916	currFragNum += delta;
917	if (!wordLevelIndex) currFreq = memInBuf.unary_decode (NULL);
918	else currFreq = 1;
919
920	// recode on disk
921	invfOutBuf.bblock_encode (currFragNum-wordDiskState.lastFragNum,
922	wordDiskState.B, NULL);
923	if (!wordLevelIndex) invfOutBuf.gamma_encode (currFreq, NULL);
924	wordDiskState.lastFragNum = currFragNum;
925	}
926
927	wordDiskState.here = invfOutBuf.TELL_X();
928	}
929
930	// write out the tag information
931	unsigned long tagNum;
932	unsigned long currTagStart;
933	unsigned long currTagEnd;
934	for (tagNum=0; tagNum<idh.tag_dict_size; tagNum++) {
935	// go to the end of the last inverted file entry
936	InvfStateRec &tagDiskState = invfState.GetRec (tagNum+idh.word_dict_size);
937	invfOutBuf.SEEK_X (tagDiskState.here);
938
939	// go to the start of the inverted chunk info in memory
940	BitPtr &tagBitPtr = bitPtrs.GetTagBitPtr (tagNum);
941	memInBuf.seek(tagBitPtr.start);
942
943	// decode each entry and re-write to disk
944	currTagEnd = chunkStartFragNum;
945	while (memInBuf.position() < tagBitPtr.here) {
946	// decode tag entry
947	b = 1 << tagBitPtr.lgB;
948	delta = memInBuf.bblock_decode (b, NULL) - 1;
949	currTagStart = currTagEnd + delta;
950	delta = memInBuf.bblock_decode (b, NULL) - 1;
951	currTagEnd = currTagStart + delta;
952
953	// recode on disk
954	invfOutBuf.bblock_encode (currTagStart-tagDiskState.lastFragNum+1,
955	tagDiskState.B, NULL);
956	invfOutBuf.bblock_encode (currTagEnd-currTagStart+1,
957	tagDiskState.B, NULL);
958
959	tagDiskState.lastFragNum = currTagEnd;
960	}
961
962	tagDiskState.here = invfOutBuf.TELL_X();
963	}
964
965	memInBuf.done();
966
967	invfOutBuf.encodeDone();
968	fclose (invfFile);
969	}
970
971
972	int process_ivf_2 (const TagInfo &/tagInfo/, const TextElArray &doc) {
973	bool wordLevelIndex = ivfLevel.indexLevel.empty();
974	bool inFrag = false;
975	if (wordLevelIndex) inFrag = true; // unconditional
976
977	// get next chunk information if need to. the chunk information
978	// is needed before the first document is processed
979	if (numChunkDocs >= numDocsInChunk) ReadChunk (idh, wordLevelIndex);
980
981	// process each text element
982	TextElArray::const_iterator here = doc.begin();
983	TextElArray::const_iterator end = doc.end();
984	while (here != end) {
985	// process this element
986	if ((here).elType == OpenTagE) ProcessOpenTag (here, inFrag);
987	else if ((here).elType == CloseTagE) ProcessCloseTag (here, inFrag);
988	else ProcessText (*here, inFrag);
989
990	here++;
991	}
992
993	// close off any unclosed tags
994	TagMapDict::iterator tdHere = tagMapDict.begin();
995	TagMapDict::iterator tdEnd = tagMapDict.end();
996	while (tdHere != tdEnd) {
997	CloseTextTag ((tdHere).second, (tdHere).first);
998	tdHere++;
999	}
1000
1001	// we've processed one more document
1002	numDocs++;
1003	numChunkDocs++;
1004
1005	// merge the memory based inverted file with the one on
1006	// disk if this is the end of this chunk
1007	if (numChunkDocs >= numDocsInChunk) DiskMerge (collectFilename);
1008
1009	return COMPALLOK;
1010	}
1011
1012
1013	static void CondenseInvfFile (char *filename, unsigned long &bytesOutput) {
1014	FILE *inInvfFile = open_file (filename, INVF_SUFFIX, "rb",
1015	MAGIC_INVF, MG_ABORT);
1016	FILE *outInvfFile = open_file (filename, INVF_SUFFIX, "rb+",
1017	MAGIC_INVF, MG_ABORT);
1018
1019	// skip the magic number
1020	fseek (outInvfFile, sizeof (unsigned long), SEEK_SET);
1021
1022	// write the inverted file header -- use defaults for most things
1023	invf_file_header ifh;
1024	ifh.no_of_words = idh.word_dict_size;
1025	ifh.no_of_tags = idh.tag_dict_size;
1026	ifh.word_level_index = (ivfLevel.indexLevel.empty()) ? 1 : 0;
1027	ifh.Write (outInvfFile);
1028
1029	bytesOutput = ftell (outInvfFile);
1030
1031	// process each meaningful byte in the file
1032	unsigned long numEntries = ifh.no_of_words + ifh.no_of_tags;
1033	unsigned long entryNum;
1034	mg_ullong lastStart = 0;
1035	for (entryNum = 0; entryNum < numEntries; entryNum++) {
1036	InvfStateRec &stateRec = invfState.GetRec (entryNum);
1037
1038	// overrun check
1039	if (stateRec.start < lastStart)
1040	FatalError (1, "Inverted file Buffer overrun");
1041	lastStart = stateRec.start;
1042
1043	unsigned long oldEntryStart = stateRec.start >> 3;
1044	unsigned long oldEntryStartOver = stateRec.start & 7; // should be 0
1045	unsigned long oldEntryEnd = (stateRec.here + 7) >> 3; // byte after end
1046	unsigned long oldEntryEndOver = stateRec.here & 7;
1047
1048	fseek (inInvfFile, oldEntryStart, SEEK_SET);
1049
1050	stateRec.here -= stateRec.start;
1051	stateRec.start = (mg_ullong)bytesOutput * 8 + oldEntryStartOver;
1052	stateRec.here += stateRec.start;
1053	while (oldEntryStart < oldEntryEnd) {
1054	unsigned char c = getc (inInvfFile);
1055	if (oldEntryStart == oldEntryEnd - 1) {
1056	u_char ands[8] =
1057	{0xff, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe};
1058	c &= ands[oldEntryEndOver];
1059	}
1060	putc (c, outInvfFile);
1061	bytesOutput++;
1062	oldEntryStart++;
1063	}
1064	}
1065
1066	fclose (inInvfFile);
1067
1068	#ifdef __WIN32__
1069	if (!(_chsize (_fileno (outInvfFile), bytesOutput)))
1070	Message ("Could not truncate invf.");
1071	#else
1072	ftruncate (fileno (outInvfFile), bytesOutput);
1073	#endif
1074
1075	fclose (outInvfFile);
1076	}
1077
1078	static void OutputInvfIdx (char *filename, unsigned long invfNumBytes) {
1079	FILE *invfIdxFile = create_file (filename, INVF_IDX_SUFFIX, "wb",
1080	MAGIC_INVI, MG_ABORT);
1081
1082	// process each meaningful byte in the file
1083	unsigned long numEntries = idh.word_dict_size + idh.tag_dict_size;
1084	unsigned long entryNum;
1085	for (entryNum = 0; entryNum < numEntries; entryNum++) {
1086	InvfStateRec &stateRec = invfState.GetRec (entryNum);
1087
1088	// assumes that inverted entries start at beginning of each byte
1089	if (!WriteUL (invfIdxFile, (stateRec.start >> 3))) break;
1090	}
1091
1092	WriteUL (invfIdxFile, invfNumBytes);
1093
1094	fclose (invfIdxFile);
1095	}
1096
1097
1098	int done_ivf_2 (const TagInfo &/tagInfo/, char *filename) {
1099	// close most open files
1100	if (chunkFile != NULL) {
1101	chunkBuf.done();
1102	fclose (chunkFile);
1103	chunkFile = NULL;
1104	}
1105	occurConvert.Close();
1106
1107	// free allocated memory
1108	bitPtrs.Clear();
1109	if (ivfMemBuf != NULL) { delete [] ivfMemBuf; ivfMemBuf = NULL; }
1110	free_perf_hash (wordHashDict);
1111	wordHashDict = NULL;
1112	tagMapDict.erase (tagMapDict.begin(), tagMapDict.end());
1113
1114	// condense the inverted file and truncate it
1115	// this function also writes out the inverted header
1116	unsigned long invfNumBytes = 0;
1117	CondenseInvfFile (filename, invfNumBytes);
1118
1119	OutputInvfIdx (filename, invfNumBytes);
1120
1121	// close the rest of the open files
1122	invfState.Close ();
1123
1124	return COMPALLOK;
1125	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: