Context Navigation

source: trunk/mgpp/text/ivf.pass2.cpp@ 12581

Last change on this file since 12581 was 9613, checked in by kjdon, 19 years ago
added in x++ -> ++x changes submitted by Emanuel Dejanu
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 31.3 KB

Line
1	/**************************************************************************
2	*
3	* ivf.pass2.cpp -- Memory efficient pass 2 inversion
4	* Copyright (C) 1999 Rodger McNab
5	*
6	* This program is free software; you can redistribute it and/or modify
7	* it under the terms of the GNU General Public License as published by
8	* the Free Software Foundation; either version 2 of the License, or
9	* (at your option) any later version.
10	*
11	* This program is distributed in the hope that it will be useful,
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	* GNU General Public License for more details.
15	*
16	* You should have received a copy of the GNU General Public License
17	* along with this program; if not, write to the Free Software
18	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19	*
20	**************************************************************************/
21
22	#define _XOPEN_SOURCE 1
23	#define _XOPEN_SOURCE_EXTENDED 1
24
25	#include <stdio.h>
26
27	#if defined __WIN32__
28	# include <process.h>
29	# include <io.h>
30	# define getpid _getpid
31	# define unlink _unlink
32	#else
33	# include <unistd.h>
34	# include "non_ansi.h"
35	#endif
36
37	#include "UCArray.h"
38	#include "sysfuncs.h"
39	#include "mg_files.h"
40	#include "invf.h"
41	#include "mg.h"
42	#include "build.h"
43	#include "locallib.h"
44	#include "bitio_m_random.h"
45	#include "bitio_m_stdio.h"
46	#include "bitio_m_mems.h"
47	#include "bitio_gen.h"
48	#include <stdio.h>
49	#include "words.h"
50	#include "messages.h"
51	#include "netorder.h"
52	#include "FIvfLevelInfo.h"
53	#include "perf_hash.h"
54	#include "string.h"
55
56	#include "longlong.h"
57
58	#if defined(GSDL_USE_OBJECTSPACE)
59	# include <ospace\std\map>
60	#elif defined(GSDL_USE_STL_H)
61	# include <map.h>
62	#else
63	# include <map>
64	#endif
65
66
67	#ifdef USE_LONG_LONG
68	#define SEEK_X seek_LL
69	#define TELL_X tell_LL
70	#else
71	#define SEEK_X seek
72	#define TELL_X tell
73	#endif
74
75	#ifndef RND_BUF_SIZE
76	#define RND_BUF_SIZE 8*1024
77	#endif
78
79
80	static unsigned long numDocs = 0;
81	static unsigned long numChunkDocs = 0;
82	static unsigned long numDocsInChunk = 0;
83
84	static unsigned long numFrags = 0;
85	static unsigned long numFragsInChunk = 0;
86	static unsigned long chunkStartFragNum = 0;
87
88
89
90
91	struct BitPtr {
92	unsigned long start;
93	unsigned long here;
94	unsigned long lastFragNum;
95	unsigned long lgB;
96
97	void Clear () { start = here = lastFragNum = lgB = 0; }
98	BitPtr () { Clear(); }
99	};
100
101	class WordBitPtrs {
102	protected:
103	unsigned long numWords;
104	unsigned long numTags;
105	unsigned long size;
106	BitPtr *wordBitPtrs;
107
108	void CheckBufOverrun (unsigned long num) {
109	if (wordBitPtrs[num].here > wordBitPtrs[num+1].start) {
110	cerr << "numDocs: " << numDocs << "\n";
111	cerr << "numChunkDocs: " << numChunkDocs << "\n";
112	cerr << "numDocsInChunk: " << numDocsInChunk << "\n";
113	cerr << "numFrags: " << numFrags << "\n";
114	cerr << "numFragsInChunk: " << numFragsInChunk << "\n";
115	cerr << "chunkStartFragNum: " << chunkStartFragNum << "\n";
116	cerr << "num: " << num << "\n";
117	cerr << "[num].start: " << wordBitPtrs[num].start << "\n";
118	cerr << "[num].here: " << wordBitPtrs[num].here << "\n";
119	cerr << "[num+1].start: " << wordBitPtrs[num+1].start << "\n";
120	FatalError (1, "Bit buffer overrun");
121	}
122	}
123
124	public:
125	void Clear ();
126	WordBitPtrs () { wordBitPtrs = NULL; Clear(); }
127	~WordBitPtrs ();
128	void SetSize (unsigned long _numWords,
129	unsigned long _numTags);
130
131	void ResetPtrs () {
132	if (wordBitPtrs == NULL) return;
133	unsigned long i;
134	for (i=0; i<size; ++i) wordBitPtrs[i].Clear();
135	}
136
137	BitPtr &GetWordBitPtr (unsigned long wordNum)
138	{ return wordBitPtrs[wordNum]; }
139	unsigned long &GetWordStart (unsigned long wordNum)
140	{ return wordBitPtrs[wordNum].start; }
141	unsigned long &GetWordHere (unsigned long wordNum)
142	{ return wordBitPtrs[wordNum].here; }
143	void CheckWordBufOverrun (unsigned long wordNum)
144	{ CheckBufOverrun (wordNum); }
145
146	BitPtr &GetTagBitPtr (unsigned long tagNum)
147	{ return wordBitPtrs[tagNum + numWords]; }
148	unsigned long &GetTagStart (unsigned long tagNum)
149	{ return wordBitPtrs[tagNum + numWords].start; }
150	unsigned long &GetTagHere (unsigned long tagNum)
151	{ return wordBitPtrs[tagNum + numWords].here; }
152	void CheckTagBufOverrun (unsigned long tagNum)
153	{ CheckBufOverrun (tagNum + numWords); }
154
155	BitPtr &GetEndBitPtr ()
156	{ return wordBitPtrs[size-1]; }
157	unsigned long &GetEndStart ()
158	{ return wordBitPtrs[size-1].start; }
159	unsigned long &GetEndHere ()
160	{ return wordBitPtrs[size-1].here; }
161	};
162
163
164	struct IP2TagInfo {
165	bool inTag;
166	unsigned long startFrag;
167	unsigned long tagNum;
168
169	IP2TagInfo () {
170	inTag = false;
171	startFrag = 0;
172	tagNum = 0;
173	}
174	};
175
176	// maps tags to tag information
177	typedef map<UCArray, IP2TagInfo, DictLTUCArray> TagMapDict;
178
179
180	// class to handle the translation of occurrence order
181	// to dictionary order for words and tags
182	class OccurToDictConverter {
183	protected:
184	unsigned long pos;
185	unsigned long val;
186	FILE *transFile;
187	random_bitio_buffer rbs;
188
189	unsigned long wordDictSize;
190	unsigned long tagDictSize;
191
192	void SeekStart ();
193	unsigned long TranslateNum (unsigned long num);
194
195	public:
196	OccurToDictConverter ();
197	~OccurToDictConverter ();
198
199	void Open (char *filename, unsigned long _wordDictSize,
200	unsigned long _tagDictSize);
201
202	// Close frees all allocated memory
203	void Close ();
204
205	unsigned long TranslateWord (unsigned long occurNum)
206	{ return TranslateNum (occurNum); }
207	unsigned long TranslateTag (unsigned long occurNum)
208	{ return TranslateNum (occurNum+wordDictSize); }
209	};
210
211
212	struct InvfStateRec {
213	mg_ullong start;
214	mg_ullong here;
215	unsigned long lastFragNum;
216	unsigned long B;
217
218	void Clear () {
219	start = here = 0;
220	lastFragNum = B = 0;
221	}
222	InvfStateRec () { Clear (); }
223	};
224
225
226	#define ISR_SIZE 1024
227
228	class InvfStateCache {
229	protected:
230	InvfStateRec recCache [ISR_SIZE];
231	unsigned long startNum;
232
233	FILE *stateFile;
234
235	void ClearCache () {
236	unsigned int i = 0;
237	for (i=0; i<ISR_SIZE; ++i) recCache[i].Clear();
238	}
239
240	public:
241	InvfStateCache ();
242	~InvfStateCache ();
243
244	void Open (char *filename);
245	void Close ();
246
247	// previous references to state records may be
248	// invalidated calling GetRec
249	InvfStateRec &GetRec (unsigned long num);
250	};
251
252
253	static invf_dict_header idh;
254	static WordBitPtrs bitPtrs;
255
256	static FILE *chunkFile = NULL;
257	static stdio_bitio_buffer chunkBuf;
258
259	static unsigned long ivfMemBufSize = 0;
260	static char *ivfMemBuf = NULL;
261
262	// word and tag dictionaries. a map is used for the tag dictionary
263	// as it should never be very big (and the perfect hash function
264	// sometimes has trouble with small values).
265	static perf_hash_data *wordHashDict = NULL;
266	static TagMapDict tagMapDict;
267
268	// information about all the different levels
269	static FIvfLevel ivfLevel;
270
271	static OccurToDictConverter occurConvert;
272
273	// information about the state of the inverted file
274	static InvfStateCache invfState;
275
276	static char collectFilename[512];
277
278
279	void WordBitPtrs::Clear () {
280	numWords = 0;
281	numTags = 0;
282	size=0;
283	if (wordBitPtrs != NULL) delete [] wordBitPtrs;
284	wordBitPtrs = NULL;
285	}
286
287	WordBitPtrs::~WordBitPtrs () {
288	if (wordBitPtrs != NULL) delete [] wordBitPtrs;
289	}
290
291	void WordBitPtrs::SetSize (unsigned long _numWords,
292	unsigned long _numTags){
293	Clear();
294	numWords = _numWords;
295	numTags = _numTags;
296	size = numWords + numTags + 1;
297	wordBitPtrs = new BitPtr [size];
298	}
299
300
301	void OccurToDictConverter::SeekStart () {
302	if (transFile == NULL) return;
303	rbs.SEEK_X (sizeof (unsigned long) * 8);
304	pos = 0;
305	}
306
307	unsigned long OccurToDictConverter::TranslateNum (unsigned long num) {
308	if (num < pos) SeekStart ();
309	while (pos <= num) {
310	if (pos < wordDictSize)
311	val = rbs.binary_decode (wordDictSize + 1, NULL) - 1;
312	else
313	val = rbs.binary_decode (tagDictSize + 1, NULL) - 1;
314	++pos;
315	}
316	return val;
317	}
318
319	OccurToDictConverter::OccurToDictConverter () {
320	pos = 0;
321	val = 0;
322	transFile = NULL;
323	wordDictSize = 0;
324	tagDictSize = 0;
325	}
326
327	OccurToDictConverter::~OccurToDictConverter () {
328	if (transFile != NULL) Close ();
329	}
330
331	void OccurToDictConverter::Open (char *filename, unsigned long _wordDictSize,
332	unsigned long _tagDictSize) {
333	if (transFile != NULL) Close ();
334
335	wordDictSize = _wordDictSize;
336	tagDictSize = _tagDictSize;
337
338	transFile = open_file (filename, INVF_CHUNK_TRANS_SUFFIX, "rb",
339	MAGIC_CHUNK_TRANS, MG_ABORT);
340	rbs.attachFile (transFile, RND_BUF_SIZE);
341	SeekStart ();
342	val = 0;
343	}
344
345	void OccurToDictConverter::Close () {
346	if (transFile == NULL) return;
347
348	rbs.done ();
349	fclose (transFile);
350	transFile = NULL;
351	pos = 0;
352	val = 0;
353
354	wordDictSize = 0;
355	tagDictSize = 0;
356	}
357
358
359
360
361	InvfStateCache::InvfStateCache () {
362	startNum = 0;
363	stateFile = NULL;
364	}
365
366	InvfStateCache::~InvfStateCache () {
367	if (stateFile != NULL) Close ();
368	}
369
370	void InvfStateCache::Open (char *filename) {
371	if (stateFile != NULL) Close();
372
373	// open the state file
374	char path[512];
375	sprintf (path, FILE_NAME_FORMAT ".%ld", get_basepath (), filename,
376	".invf.state", (long) getpid ());
377	if (!(stateFile = fopen (path, "wb+"))) {
378	Message ("Unable to create \"%s\"", path);
379	exit (1);
380	}
381	unlink (path); // file will be deleted after it is closed
382	// reset the buffer
383	startNum = 0;
384	ClearCache();
385	}
386
387	void InvfStateCache::Close () {
388	if (stateFile == NULL) return;
389	fclose (stateFile);
390	stateFile = NULL;
391	startNum = 0;
392	}
393
394	InvfStateRec &InvfStateCache::GetRec (unsigned long num) {
395	// see if cached
396	if ((num >= startNum) && (num < startNum + ISR_SIZE))
397	return recCache[num-startNum];
398
399	// not cached, write out this lot of records and read in
400	fseek (stateFile, startNum*sizeof (InvfStateRec), SEEK_SET);
401	fwrite ((char *) recCache, sizeof (InvfStateRec), ISR_SIZE, stateFile);
402
403	// read in the new set of records
404	ClearCache ();
405	startNum = num - (num % ISR_SIZE);
406	fseek (stateFile, startNum*sizeof (InvfStateRec), SEEK_SET);
407	fread ((char *) recCache, sizeof (InvfStateRec), ISR_SIZE, stateFile);
408
409	return recCache[num-startNum];
410	}
411
412
413
414	static void ClearCharBuf (char *buf, unsigned long size) {
415	char *end = buf + size;
416	while (buf != end) *buf++ = 0;
417	}
418
419	static void ReadWordDict (char *filename) {
420	// read in the perfect hash function for the word dictionary
421	FILE *wordHashFile = open_file (filename, INVF_DICT_HASH_SUFFIX, "rb",
422	MAGIC_HASH, MG_ABORT);
423	if (!(wordHashDict = read_perf_hash_data (wordHashFile))) {
424	FatalError (1, "Unable to read in hash data for word dictionary");
425	}
426	fclose (wordHashFile);
427	}
428
429	static void ReadTagDict (char *filename, invf_dict_header &_idh) {
430	// open the file
431	FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
432	MAGIC_STEM_BUILD, MG_ABORT);
433
434	// seek to the start of the tag dictionary
435	fseek (dictFile, _idh.tag_dict_start, SEEK_SET);
436
437	unsigned long tagNum;
438	dict_el thisEl;
439	for (tagNum = 0; tagNum < _idh.tag_dict_size; ++tagNum) {
440	thisEl.Read (dictFile);
441	tagMapDict[thisEl.el].tagNum = tagNum;
442	}
443
444	fclose (dictFile);
445	}
446
447	static void ReadLevelFile (char *filename) {
448	FILE *f;
449	f = open_file (filename, INVF_LEVEL_SUFFIX, "rb",
450	MAGIC_INVF_LEVELS, MG_ABORT);
451	ivfLevel.Read (f);
452	fclose (f);
453	}
454
455	void CheckIntOverflow (mg_ullong totalIBits, mg_ullong lastTotalIBits) {
456	if (totalIBits < lastTotalIBits) {
457	fprintf(stderr, "ERROR: The totalIBits counter (%d byte unsigned integer) has overflowed.\n", sizeof (mg_ullong));
458	if (sizeof (mg_ullong) < 8) {
459	fprintf(stderr, " Try compiling with GCC to enable use of 8 bytes for this counter.\n");
460	}
461	fprintf(stderr, " Build aborted.\n");
462	exit(1);
463	}
464	}
465
466	// assumes the inverted file state file has been opened
467	static void InitInvfState (char *filename,
468	invf_dict_header &_idh,
469	InvfStateCache &_invfState,
470	mg_ullong &totalIBits,
471	bool wordLevelIndex) {
472	// read in the dictionary, setting inverted state information
473	FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
474	MAGIC_STEM_BUILD, MG_ABORT);
475
476	// seek to the start of the word dictionary
477	fseek (dictFile, _idh.word_dict_start, SEEK_SET);
478
479	// add the word entries
480	word_dict_el wordEl;
481	wordEl.SetNumLevels (_idh.num_levels);
482	unsigned long dictWordNum, p;
483	mg_ullong lastTotalIBits;
484	unsigned long N = _idh.num_frags;
485	for (dictWordNum=0; dictWordNum<_idh.word_dict_size; ++dictWordNum) {
486	// lastTotalIBits is used to detect integer overflow
487	lastTotalIBits = totalIBits;
488
489	// read the next word and associated information
490	wordEl.Read (dictFile, _idh.num_levels);
491
492	// update the state record
493	p = wordEl.frag_occur;
494	InvfStateRec &wisr = _invfState.GetRec (dictWordNum);
495	wisr.start = totalIBits;
496	wisr.here = totalIBits;
497	wisr.B = BIO_Bblock_Init (N, p);
498
499	// add the length of the fragment numbers
500	totalIBits += BIO_Bblock_Bound_b (N, p, wisr.B);
501
502	// if needed, add the length of the fragment frequency information
503	if (!wordLevelIndex)
504	totalIBits += BIO_Gamma_Bound (wordEl.freq, wordEl.frag_occur);
505
506	// align next byte
507	#ifdef USE_LONG_LONG
508	totalIBits = (totalIBits + 7ull) & 0xfffffffffffffff8ull;
509	#else
510	totalIBits = (totalIBits + 7ul) & 0xfffffff8ul;
511	#endif
512
513	CheckIntOverflow (totalIBits, lastTotalIBits);
514	}
515
516
517	// seek to the start of the tag dictionary
518	fseek (dictFile, _idh.tag_dict_start, SEEK_SET);
519
520	// add the tag entries
521	dict_el tagEl;
522	unsigned long dictTagNum;
523	N = _idh.num_frags;
524	for (dictTagNum=0; dictTagNum<_idh.tag_dict_size; ++dictTagNum) {
525	// lastTotalIBits is used to detect integer overflow
526	lastTotalIBits = totalIBits;
527
528	// read the next tag and associated information
529	tagEl.Read (dictFile);
530
531	// update the state record
532	p = tagEl.frag_occur*2;
533	InvfStateRec &tisr = _invfState.GetRec (dictTagNum + _idh.word_dict_size);
534	tisr.start = totalIBits;
535	tisr.here = totalIBits;
536	tisr.B = BIO_Bblock_Init (N+p, p);
537
538	// add the length of the fragment numbers (two numbers for each
539	// tag, one for start and one for end)
540	totalIBits += BIO_Bblock_Bound_b (N+p, p, tisr.B);
541
542	// align next byte
543	#ifdef USE_LONG_LONG
544	totalIBits = (totalIBits + 7ull) & 0xfffffffffffffff8ull;
545	#else
546	totalIBits = (totalIBits + 7ul) & 0xfffffff8ul;
547	#endif
548
549	CheckIntOverflow (totalIBits, lastTotalIBits);
550	}
551
552	fclose (dictFile);
553	}
554
555	/*
556	// assumes the chunk tag information has been placed in .first
557	static void PrintChunkInfo (unsigned long chunkMem,
558	unsigned long numChunkWords,
559	unsigned long numChunkTags) {
560	static unsigned long chunksRead = 0;
561	++chunksRead;
562	cout << "Chunk Number: " << chunksRead << "\n";
563	cout << "numChunkDocs " << numDocsInChunk << "\n";
564	cout << "numChunkFrags " << numFragsInChunk << "\n";
565	cout << "mem " << chunkMem << "\n";
566	cout << "numWords " << numChunkWords << "\n";
567	cout << "numTags " << numChunkTags << "\n\n";
568
569	TagMapDict::iterator tagMapHere = tagMapDict.begin();
570	TagMapDict::iterator tagMapEnd = tagMapDict.end();
571	while (tagMapHere != tagMapEnd) {
572	unsigned long tagMapNum = (*tagMapHere).second.tagNum;
573	cout << (*tagMapHere).first << " " << tagMapNum << " "
574	<< bitPtrs.GetTagBitPtr(tagMapNum).here << "\n";
575	++tagMapHere;
576	}
577	}
578	*/
579
580	void ReadChunk (invf_dict_header &_idh, bool wordLevelIndex) {
581	// reset globals
582	numChunkDocs = 0;
583	chunkStartFragNum = numFrags;
584
585	// read in information about this chunk
586	numDocsInChunk = chunkBuf.gamma_decode (NULL) - 1;
587	if (numDocsInChunk == 0)
588	FatalError (1, "The number of docs in the current chunk is 0");
589
590	numFragsInChunk = chunkBuf.gamma_decode (NULL) - 1;
591	unsigned long chunkMem = chunkBuf.gamma_decode (NULL) - 1;
592
593	if (chunkMem > ivfMemBufSize)
594	FatalError (1, "Chunk memory size is greater than maximum");
595
596	unsigned long numChunkWords = chunkBuf.gamma_decode (NULL) - 1;
597	unsigned long numChunkTags = chunkBuf.gamma_decode (NULL) - 1;
598
599
600	// reset stuff
601	ClearCharBuf (ivfMemBuf, ivfMemBufSize);
602	bitPtrs.ResetPtrs();
603
604	// read in the entries in occurrence order storing the
605	// "chunkWordCount" in "start" and the "chunkFragCount"
606	// in "here"
607	unsigned long numOccur;
608	unsigned long wordNum;
609	for (numOccur=0; numOccur<numChunkWords; ++numOccur) {
610	wordNum = occurConvert.TranslateWord (numOccur);
611	BitPtr &wordPtr = bitPtrs.GetWordBitPtr (wordNum);
612	wordPtr.start = chunkBuf.gamma_decode (NULL) - 1;
613	if (wordPtr.start >= 2)
614	wordPtr.here = chunkBuf.gamma_decode (NULL);
615	else wordPtr.here = wordPtr.start;
616	}
617	unsigned long tagNum;
618	for (numOccur=0; numOccur<numChunkTags; ++numOccur) {
619	tagNum = occurConvert.TranslateTag (numOccur);
620	BitPtr &tagPtr = bitPtrs.GetTagBitPtr (tagNum);
621	// only chunkFragCount is encoded for tags
622	tagPtr.start = chunkBuf.gamma_decode (NULL) - 1;
623	tagPtr.here = tagPtr.start;
624	}
625
626	/* PrintChunkInfo (chunkMem, numChunkWords, numChunkTags);*/
627
628	// create the bit ptrs in dictionary order
629	unsigned long totalIBits = 0; // only dealing with memory
630	unsigned long chunkWordCount, chunkFragCount;
631	for (wordNum=0; wordNum<_idh.word_dict_size; ++wordNum) {
632	BitPtr &wordPtr = bitPtrs.GetWordBitPtr (wordNum);
633	chunkWordCount = wordPtr.start;
634	chunkFragCount = wordPtr.here;
635	wordPtr.start = totalIBits;
636	wordPtr.here = totalIBits;
637	wordPtr.lastFragNum = chunkStartFragNum;
638	wordPtr.lgB = 0;
639	if (chunkWordCount > 0) {
640	wordPtr.lgB = floorlog_2 (BIO_Bblock_Init_W (numFragsInChunk,
641	chunkFragCount));
642	totalIBits += BIO_Bblock_Bound (numFragsInChunk, chunkFragCount);
643	// use unary encoding for memory buffer encoding of fragment freq
644	if (!wordLevelIndex) {
645	totalIBits += chunkWordCount;
646	}
647	}
648	}
649	for (tagNum=0; tagNum<_idh.tag_dict_size; ++tagNum) {
650	BitPtr &tagPtr = bitPtrs.GetTagBitPtr (tagNum);
651	chunkFragCount = tagPtr.here;
652	tagPtr.start = totalIBits;
653	tagPtr.here = totalIBits;
654	tagPtr.lastFragNum = chunkStartFragNum;
655	tagPtr.lgB = 0;
656	if (chunkFragCount > 0) {
657	unsigned long pTag = chunkFragCount*2;
658	tagPtr.lgB = floorlog_2 (BIO_Bblock_Init_W (numFragsInChunk+pTag,
659	pTag));
660	unsigned long bLen = BIO_Bblock_Bound (numFragsInChunk+pTag,
661	pTag);
662	// cout << tagNum + _idh.word_dict_size << " ";
663	// cout << "numFrags: " << numFragsInChunk
664	// << " chunkFragCount: " << chunkFragCount
665	// << " B: " << 1 << tagPtr.lgB
666	// << " blen: " << blen << "\n";
667	totalIBits += bLen;
668	}
669	}
670	bitPtrs.GetEndStart() = totalIBits;
671	bitPtrs.GetEndHere() = totalIBits;
672
673	if ((totalIBits + 7ul) >> 3ul > chunkMem) {
674	cerr << "totalIBits: " << totalIBits << "\n";
675	cerr << "bytes: " << ((totalIBits + 7ul) >> 3ul) << "\n";
676	cerr << "chunkMem: " << chunkMem << "\n";
677	FatalError (1, "Pointers exceed buffer size");
678	}
679	}
680
681
682
683
684	int init_ivf_2 (const TagInfo &/tagInfo/, char *filename) {
685	// read in compressed dictionary header
686	FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
687	MAGIC_STEM_BUILD, MG_ABORT);
688	idh.Read (dictFile);
689	fclose (dictFile);
690
691	// set the size of the bit ptrs
692	bitPtrs.SetSize (idh.word_dict_size, idh.tag_dict_size);
693
694	// open the chunk file and read in the maximum memory needed
695	// for the inverted memory buffer
696	chunkFile = open_file (filename, INVF_CHUNK_SUFFIX, "rb",
697	MAGIC_CHUNK, MG_ABORT);
698	ReadUL (chunkFile, ivfMemBufSize);
699	chunkBuf.attachFile (chunkFile);
700
701	// allocate memory for the inverted buffer
702	ivfMemBuf = new char [ivfMemBufSize];
703	ClearCharBuf (ivfMemBuf, ivfMemBufSize);
704
705	// read in the word dictionary
706	ReadWordDict (filename);
707
708	// read in the tag dictionary
709	ReadTagDict (filename, idh);
710
711	// read in the level information
712	ReadLevelFile (filename);
713	bool wordLevelIndex = ivfLevel.indexLevel.empty();
714
715	// set up the translation table file
716	occurConvert.Open (filename, idh.word_dict_size, idh.tag_dict_size);
717
718	// reset some globals
719	numDocs = 0;
720	numChunkDocs = 0;
721	numDocsInChunk = 0;
722	numFrags = 0;
723	numFragsInChunk = 0;
724	chunkStartFragNum = 0;
725
726	strcpy (collectFilename, filename);
727
728
729	// create the inverted file
730	mg_ullong totalIBits = 0;
731	FILE *invfFile = create_file (filename, INVF_SUFFIX, "wb",
732	MAGIC_INVF, MG_ABORT);
733	totalIBits += sizeof (unsigned long) * 8; // magic number
734	totalIBits += 8 * 200; // 200 byte gap -- why??????
735	fclose (invfFile);
736
737	// init the inverted file state cache
738	invfState.Open (filename);
739	InitInvfState (filename, idh, invfState, totalIBits, wordLevelIndex);
740
741	return COMPALLOK;
742	}
743
744	static void CloseTextTag (IP2TagInfo &tInfo, const UCArray &/tagName/) {
745	if (!tInfo.inTag) return;
746
747	// add this tag to the inverted list
748	BitPtr &tagBitPtr = bitPtrs.GetTagBitPtr (tInfo.tagNum);
749	unsigned long endFrag = numFrags;
750	int b = 1 << tagBitPtr.lgB;
751
752	/*
753	cout << (tInfo.tagNum+idh.word_dict_size) << " \"<" << tagName << ">\" "
754	<< tInfo.startFrag << " " << endFrag << "\n";
755	*/
756
757	mems_bitio_buffer buffer ((u_char *) ivfMemBuf, tagBitPtr.here);
758	buffer.bblock_encode (tInfo.startFrag - tagBitPtr.lastFragNum + 1,
759	b, NULL);
760	buffer.bblock_encode (endFrag - tInfo.startFrag + 1, b, NULL);
761	tagBitPtr.lastFragNum = endFrag;
762	tagBitPtr.here = buffer.position();
763	buffer.encodeDone();
764
765	// check for buffer overrun
766	bitPtrs.CheckTagBufOverrun (tInfo.tagNum);
767
768	// reset information about this tag
769	tInfo.inTag = false;
770	tInfo.startFrag = 0;
771	}
772
773	static void ProcessOpenTag (const TextEl &el, bool &inFrag) {
774	// close tag if open
775	IP2TagInfo &tInfo = tagMapDict[el.tagName];
776	if (tInfo.inTag) CloseTextTag (tInfo, el.tagName);
777
778	// open this tag
779	tInfo.inTag = true;
780	tInfo.startFrag = numFrags;
781
782	// check for start of next fragment
783	bool wordLevelIndex = ivfLevel.indexLevel.empty();
784	if (!wordLevelIndex && el.tagName == ivfLevel.indexLevel) {
785	++numFrags;
786	inFrag = true;
787	}
788	}
789
790	static void ProcessCloseTag (const TextEl &el, bool &inFrag) {
791	// check for end of fragment
792	bool wordLevelIndex = ivfLevel.indexLevel.empty();
793	if (!wordLevelIndex && el.tagName == ivfLevel.indexLevel) {
794	inFrag = false;
795	}
796
797	IP2TagInfo &tInfo = tagMapDict[el.tagName];
798	CloseTextTag (tInfo, el.tagName);
799	}
800
801	static void ProcessText (const TextEl &el, bool &inFrag) {
802	// make sure this text is to be indexed
803	bool wordLevelIndex = ivfLevel.indexLevel.empty();
804	if (!wordLevelIndex && !inFrag) return;
805
806	const unsigned char *textHere = &(el.text[0]);
807	const unsigned char *textEnd = &(el.text[el.text.size() - 1]);
808	unsigned char mgWord[MAXSTEMLEN + 1];
809
810	if (!inaword (textHere, textEnd))
811	ParseNonindexWord (textHere, textEnd);
812
813
814	// Alternately parse off words and non-words from the input
815
816	while (textHere <= textEnd) {
817	textHere = ParseIndexMGWord (textHere, textEnd, mgWord);
818	textHere = ParseNonindexWord (textHere, textEnd);
819
820	if (mgWord[0] > 0) {
821	if (wordLevelIndex) ++numFrags;
822
823	unsigned long wordNum = perf_hash (wordHashDict, mgWord);
824
825	/*
826	cout << wordNum << " \"";
827	cout.write (mgWord+1, *mgWord);
828	cout << "\" " << numFrags << "\n";
829	*/
830
831	// add this word to the inverted list
832	BitPtr &wordBitPtr = bitPtrs.GetWordBitPtr (wordNum);
833	unsigned long fragNum = numFrags;
834	int b = 1 << wordBitPtr.lgB;
835
836	mems_bitio_buffer buffer ((u_char *) ivfMemBuf, wordBitPtr.here);
837
838	// note: this assumes that fragments don't carry over between
839	// chunks (which they don't because all tags are closed at the
840	// end of each document and chunks are based on document
841	// boundaries), i.e. the first fragment number must be greater
842	// than the starting fragment number of the chunk.
843	if (fragNum > wordBitPtr.lastFragNum) {
844	buffer.bblock_encode ((fragNum - wordBitPtr.lastFragNum - 1) + 1,
845	b, NULL);
846	if (!wordLevelIndex) buffer.encodeBit (1); // freq = 1
847
848	} else if (!wordLevelIndex) {
849	// add one to the frequency count for this word
850	buffer.seek (buffer.position()-1);
851	buffer.encodeBit (0); // unary encoding -- last = 1
852	buffer.encodeBit (1);
853	}
854
855	wordBitPtr.lastFragNum = fragNum;
856	wordBitPtr.here = buffer.position();
857	buffer.encodeDone();
858
859	// check for buffer overrun
860	bitPtrs.CheckWordBufOverrun (wordNum);
861	}
862	}
863	}
864
865	// combine the in memory inverted buffer with the disk
866	// based inverted file
867	static void DiskMerge (char *filename) {
868	bool wordLevelIndex = ivfLevel.indexLevel.empty();
869
870	// make sure we have something to process
871	if (numChunkDocs <= 0) return;
872
873	// open the inverted file
874	FILE *invfFile = open_file (filename, INVF_SUFFIX, "rb+",
875	MAGIC_INVF, MG_ABORT);
876	random_bitio_buffer invfOutBuf (invfFile);
877
878	// set up to decode the entries in memory
879	mems_bitio_buffer memInBuf ((u_char *) ivfMemBuf, 0);
880
881	// write out the word information
882	unsigned long wordNum;
883	int b;
884	unsigned long currFragNum;
885	unsigned long delta;
886	unsigned long currFreq;
887	for (wordNum=0; wordNum<idh.word_dict_size; ++wordNum) {
888	// go to the end of the last inverted file entry
889	InvfStateRec &wordDiskState = invfState.GetRec (wordNum);
890	invfOutBuf.SEEK_X (wordDiskState.here);
891
892	// go to the start of the inverted chunk info in memory
893	BitPtr &wordBitPtr = bitPtrs.GetWordBitPtr (wordNum);
894	memInBuf.seek(wordBitPtr.start);
895
896	// decode each entry and re-write to disk
897	currFragNum = chunkStartFragNum;
898	while (memInBuf.position() < wordBitPtr.here) {
899	// decode word entry
900	b = 1 << wordBitPtr.lgB;
901	delta = memInBuf.bblock_decode (b, NULL);
902	currFragNum += delta;
903	if (!wordLevelIndex) currFreq = memInBuf.unary_decode (NULL);
904	else currFreq = 1;
905
906	// recode on disk
907	invfOutBuf.bblock_encode (currFragNum-wordDiskState.lastFragNum,
908	wordDiskState.B, NULL);
909	if (!wordLevelIndex) invfOutBuf.gamma_encode (currFreq, NULL);
910	wordDiskState.lastFragNum = currFragNum;
911	}
912
913	wordDiskState.here = invfOutBuf.TELL_X();
914	}
915
916	// write out the tag information
917	unsigned long tagNum;
918	unsigned long currTagStart;
919	unsigned long currTagEnd;
920	for (tagNum=0; tagNum<idh.tag_dict_size; ++tagNum) {
921	// go to the end of the last inverted file entry
922	InvfStateRec &tagDiskState = invfState.GetRec (tagNum+idh.word_dict_size);
923	invfOutBuf.SEEK_X (tagDiskState.here);
924
925	// go to the start of the inverted chunk info in memory
926	BitPtr &tagBitPtr = bitPtrs.GetTagBitPtr (tagNum);
927	memInBuf.seek(tagBitPtr.start);
928
929	// decode each entry and re-write to disk
930	currTagEnd = chunkStartFragNum;
931	while (memInBuf.position() < tagBitPtr.here) {
932	// decode tag entry
933	b = 1 << tagBitPtr.lgB;
934	delta = memInBuf.bblock_decode (b, NULL) - 1;
935	currTagStart = currTagEnd + delta;
936	delta = memInBuf.bblock_decode (b, NULL) - 1;
937	currTagEnd = currTagStart + delta;
938
939	// recode on disk
940	invfOutBuf.bblock_encode (currTagStart-tagDiskState.lastFragNum+1,
941	tagDiskState.B, NULL);
942	invfOutBuf.bblock_encode (currTagEnd-currTagStart+1,
943	tagDiskState.B, NULL);
944
945	tagDiskState.lastFragNum = currTagEnd;
946	}
947
948	tagDiskState.here = invfOutBuf.TELL_X();
949	}
950
951	memInBuf.done();
952
953	invfOutBuf.encodeDone();
954	fclose (invfFile);
955	}
956
957
958	int process_ivf_2 (const TagInfo &/tagInfo/, const TextElArray &doc) {
959	bool wordLevelIndex = ivfLevel.indexLevel.empty();
960	bool inFrag = false;
961	if (wordLevelIndex) inFrag = true; // unconditional
962
963	// get next chunk information if need to. the chunk information
964	// is needed before the first document is processed
965	if (numChunkDocs >= numDocsInChunk) ReadChunk (idh, wordLevelIndex);
966
967	// process each text element
968	TextElArray::const_iterator here = doc.begin();
969	TextElArray::const_iterator end = doc.end();
970	while (here != end) {
971	// process this element
972	if ((here).elType == OpenTagE) ProcessOpenTag (here, inFrag);
973	else if ((here).elType == CloseTagE) ProcessCloseTag (here, inFrag);
974	else ProcessText (*here, inFrag);
975
976	++here;
977	}
978
979	// close off any unclosed tags
980	TagMapDict::iterator tdHere = tagMapDict.begin();
981	TagMapDict::iterator tdEnd = tagMapDict.end();
982	while (tdHere != tdEnd) {
983	CloseTextTag ((tdHere).second, (tdHere).first);
984	++tdHere;
985	}
986
987	// we've processed one more document
988	++numDocs;
989	++numChunkDocs;
990
991	// merge the memory based inverted file with the one on
992	// disk if this is the end of this chunk
993	if (numChunkDocs >= numDocsInChunk) DiskMerge (collectFilename);
994
995	return COMPALLOK;
996	}
997
998
999	static void CondenseInvfFile (char *filename, unsigned long &bytesOutput) {
1000	FILE *inInvfFile = open_file (filename, INVF_SUFFIX, "rb",
1001	MAGIC_INVF, MG_ABORT);
1002	FILE *outInvfFile = open_file (filename, INVF_SUFFIX, "rb+",
1003	MAGIC_INVF, MG_ABORT);
1004
1005	// skip the magic number
1006	fseek (outInvfFile, sizeof (unsigned long), SEEK_SET);
1007
1008	// write the inverted file header -- use defaults for most things
1009	invf_file_header ifh;
1010	ifh.no_of_words = idh.word_dict_size;
1011	ifh.no_of_tags = idh.tag_dict_size;
1012	ifh.word_level_index = (ivfLevel.indexLevel.empty()) ? 1 : 0;
1013	ifh.Write (outInvfFile);
1014
1015	bytesOutput = ftell (outInvfFile);
1016
1017	// process each meaningful byte in the file
1018	unsigned long numEntries = ifh.no_of_words + ifh.no_of_tags;
1019	unsigned long entryNum;
1020	mg_ullong lastStart = 0;
1021	for (entryNum = 0; entryNum < numEntries; ++entryNum) {
1022	InvfStateRec &stateRec = invfState.GetRec (entryNum);
1023
1024	// overrun check
1025	if (stateRec.start < lastStart)
1026	FatalError (1, "Inverted file Buffer overrun");
1027	lastStart = stateRec.start;
1028
1029	unsigned long oldEntryStart = stateRec.start >> 3;
1030	unsigned long oldEntryStartOver = stateRec.start & 7; // should be 0
1031	unsigned long oldEntryEnd = (stateRec.here + 7) >> 3; // byte after end
1032	unsigned long oldEntryEndOver = stateRec.here & 7;
1033
1034	fseek (inInvfFile, oldEntryStart, SEEK_SET);
1035
1036	stateRec.here -= stateRec.start;
1037	stateRec.start = (mg_ullong)bytesOutput * 8 + oldEntryStartOver;
1038	stateRec.here += stateRec.start;
1039	while (oldEntryStart < oldEntryEnd) {
1040	unsigned char c = getc (inInvfFile);
1041	if (oldEntryStart == oldEntryEnd - 1) {
1042	u_char ands[8] =
1043	{0xff, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe};
1044	c &= ands[oldEntryEndOver];
1045	}
1046	putc (c, outInvfFile);
1047	++bytesOutput;
1048	++oldEntryStart;
1049	}
1050	}
1051
1052	fclose (inInvfFile);
1053
1054	#ifdef __WIN32__
1055	if (_chsize (_fileno (outInvfFile), bytesOutput) != 0)
1056	Message ("Could not truncate invf.");
1057	#else
1058	ftruncate (fileno (outInvfFile), bytesOutput);
1059	#endif
1060
1061	fclose (outInvfFile);
1062	}
1063
1064	static void OutputInvfIdx (char *filename, unsigned long invfNumBytes) {
1065	FILE *invfIdxFile = create_file (filename, INVF_IDX_SUFFIX, "wb",
1066	MAGIC_INVI, MG_ABORT);
1067
1068	// process each meaningful byte in the file
1069	unsigned long numEntries = idh.word_dict_size + idh.tag_dict_size;
1070	unsigned long entryNum;
1071	for (entryNum = 0; entryNum < numEntries; ++entryNum) {
1072	InvfStateRec &stateRec = invfState.GetRec (entryNum);
1073
1074	// assumes that inverted entries start at beginning of each byte
1075	if (!WriteUL (invfIdxFile, (stateRec.start >> 3))) break;
1076	}
1077
1078	WriteUL (invfIdxFile, invfNumBytes);
1079
1080	fclose (invfIdxFile);
1081	}
1082
1083
1084	int done_ivf_2 (const TagInfo &/tagInfo/, char *filename) {
1085	// close most open files
1086	if (chunkFile != NULL) {
1087	chunkBuf.done();
1088	fclose (chunkFile);
1089	chunkFile = NULL;
1090	}
1091	occurConvert.Close();
1092
1093	// free allocated memory
1094	bitPtrs.Clear();
1095	if (ivfMemBuf != NULL) { delete [] ivfMemBuf; ivfMemBuf = NULL; }
1096	free_perf_hash (wordHashDict);
1097	wordHashDict = NULL;
1098	tagMapDict.erase (tagMapDict.begin(), tagMapDict.end());
1099
1100	// condense the inverted file and truncate it
1101	// this function also writes out the inverted header
1102	unsigned long invfNumBytes = 0;
1103	CondenseInvfFile (filename, invfNumBytes);
1104
1105	OutputInvfIdx (filename, invfNumBytes);
1106
1107	// close the rest of the open files
1108	invfState.Close ();
1109
1110	return COMPALLOK;
1111	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: