Context Navigation

source: trunk/indexers/mgpp/text/ivf.pass2.cpp@ 13715

Last change on this file since 13715 was 13715, checked in by kjdon, 17 years ago
renamed inaword, isaspace and skipspace so they don't conflict with mg versions
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 31.3 KB

Line
1	/**************************************************************************
2	*
3	* ivf.pass2.cpp -- Memory efficient pass 2 inversion
4	* Copyright (C) 1999 Rodger McNab
5	*
6	* This program is free software; you can redistribute it and/or modify
7	* it under the terms of the GNU General Public License as published by
8	* the Free Software Foundation; either version 2 of the License, or
9	* (at your option) any later version.
10	*
11	* This program is distributed in the hope that it will be useful,
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	* GNU General Public License for more details.
15	*
16	* You should have received a copy of the GNU General Public License
17	* along with this program; if not, write to the Free Software
18	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19	*
20	**************************************************************************/
21
22	#define _XOPEN_SOURCE 1
23	#define _XOPEN_SOURCE_EXTENDED 1
24
25	#include <stdio.h>
26
27	#if defined __WIN32__
28	# include <process.h>
29	# include <io.h>
30	# define getpid _getpid
31	# define unlink _unlink
32	#else
33	# include <unistd.h>
34	# include "non_ansi.h"
35	#endif
36
37	#include "UCArray.h"
38	#include "sysfuncs.h"
39	#include "mg_files.h"
40	#include "invf.h"
41	#include "mg.h"
42	#include "build.h"
43	#include "locallib.h"
44	#include "bitio_m_random.h"
45	#include "bitio_m_stdio.h"
46	#include "bitio_m_mems.h"
47	#include "bitio_gen.h"
48	#include <stdio.h>
49	#include "words.h"
50	#include "messages.h"
51	#include "netorder.h"
52	#include "FIvfLevelInfo.h"
53	#include "perf_hash.h"
54	#include "string.h"
55
56	#include "longlong.h"
57
58	#if defined(GSDL_USE_OBJECTSPACE)
59	# include <ospace\std\map>
60	#elif defined(GSDL_USE_STL_H)
61	# include <map.h>
62	#else
63	# include <map>
64	#endif
65
66
67	#ifdef USE_LONG_LONG
68	#define SEEK_X seek_LL
69	#define TELL_X tell_LL
70	#else
71	#define SEEK_X seek
72	#define TELL_X tell
73	#endif
74
// buffer size passed to the random access bit reader when the
// translation file is attached; parenthesised so that the macro stays
// a single value when it is used inside a larger expression
#ifndef RND_BUF_SIZE
#define RND_BUF_SIZE (8*1024)
#endif
78
79
// document counters:
//   numDocs        - documents processed so far (all chunks)
//   numChunkDocs   - documents processed so far in the current chunk
//   numDocsInChunk - number of documents the current chunk contains,
//                    as decoded from the chunk file
static unsigned long numDocs = 0;
static unsigned long numChunkDocs = 0;
static unsigned long numDocsInChunk = 0;

// fragment counters:
//   numFrags          - running fragment number (all chunks)
//   numFragsInChunk   - number of fragments in the current chunk,
//                       as decoded from the chunk file
//   chunkStartFragNum - value of numFrags when the current chunk
//                       was read in
static unsigned long numFrags = 0;
static unsigned long numFragsInChunk = 0;
static unsigned long chunkStartFragNum = 0;
87
88
89
90
// Position information for one word or tag within the in-memory
// inverted buffer.
struct BitPtr {
  unsigned long start;        // bit offset of the first bit of the entry
  unsigned long here;         // bit offset of the next bit to write
  unsigned long lastFragNum;  // last fragment number encoded
  unsigned long lgB;          // log2 of the B-block parameter (1 << lgB)

  // a new pointer starts out zeroed
  BitPtr () : start (0), here (0), lastFragNum (0), lgB (0) {}

  // zero every field
  void Clear () {
    start = 0;
    here = 0;
    lastFragNum = 0;
    lgB = 0;
  }
};
100
101	class WordBitPtrs {
102	protected:
103	unsigned long numWords;
104	unsigned long numTags;
105	unsigned long size;
106	BitPtr *wordBitPtrs;
107
108	void CheckBufOverrun (unsigned long num) {
109	if (wordBitPtrs[num].here > wordBitPtrs[num+1].start) {
110	cerr << "numDocs: " << numDocs << "\n";
111	cerr << "numChunkDocs: " << numChunkDocs << "\n";
112	cerr << "numDocsInChunk: " << numDocsInChunk << "\n";
113	cerr << "numFrags: " << numFrags << "\n";
114	cerr << "numFragsInChunk: " << numFragsInChunk << "\n";
115	cerr << "chunkStartFragNum: " << chunkStartFragNum << "\n";
116	cerr << "num: " << num << "\n";
117	cerr << "[num].start: " << wordBitPtrs[num].start << "\n";
118	cerr << "[num].here: " << wordBitPtrs[num].here << "\n";
119	cerr << "[num+1].start: " << wordBitPtrs[num+1].start << "\n";
120	FatalError (1, "Bit buffer overrun");
121	}
122	}
123
124	public:
125	void Clear ();
126	WordBitPtrs () { wordBitPtrs = NULL; Clear(); }
127	~WordBitPtrs ();
128	void SetSize (unsigned long _numWords,
129	unsigned long _numTags);
130
131	void ResetPtrs () {
132	if (wordBitPtrs == NULL) return;
133	unsigned long i;
134	for (i=0; i<size; ++i) wordBitPtrs[i].Clear();
135	}
136
137	BitPtr &GetWordBitPtr (unsigned long wordNum)
138	{ return wordBitPtrs[wordNum]; }
139	unsigned long &GetWordStart (unsigned long wordNum)
140	{ return wordBitPtrs[wordNum].start; }
141	unsigned long &GetWordHere (unsigned long wordNum)
142	{ return wordBitPtrs[wordNum].here; }
143	void CheckWordBufOverrun (unsigned long wordNum)
144	{ CheckBufOverrun (wordNum); }
145
146	BitPtr &GetTagBitPtr (unsigned long tagNum)
147	{ return wordBitPtrs[tagNum + numWords]; }
148	unsigned long &GetTagStart (unsigned long tagNum)
149	{ return wordBitPtrs[tagNum + numWords].start; }
150	unsigned long &GetTagHere (unsigned long tagNum)
151	{ return wordBitPtrs[tagNum + numWords].here; }
152	void CheckTagBufOverrun (unsigned long tagNum)
153	{ CheckBufOverrun (tagNum + numWords); }
154
155	BitPtr &GetEndBitPtr ()
156	{ return wordBitPtrs[size-1]; }
157	unsigned long &GetEndStart ()
158	{ return wordBitPtrs[size-1].start; }
159	unsigned long &GetEndHere ()
160	{ return wordBitPtrs[size-1].here; }
161	};
162
163
// Pass 2 state kept for each tag in the tag dictionary.
struct IP2TagInfo {
  bool inTag;               // true while the tag is open
  unsigned long startFrag;  // fragment number at which it was opened
  unsigned long tagNum;     // number of the tag in the tag dictionary

  // a tag starts out closed
  IP2TagInfo () : inTag (false), startFrag (0), tagNum (0) {}
};
175
176	// maps tags to tag information
177	typedef map<UCArray, IP2TagInfo, DictLTUCArray> TagMapDict;
178
179
180	// class to handle the translation of occurrence order
181	// to dictionary order for words and tags
182	class OccurToDictConverter {
183	protected:
184	unsigned long pos;
185	unsigned long val;
186	FILE *transFile;
187	random_bitio_buffer rbs;
188
189	unsigned long wordDictSize;
190	unsigned long tagDictSize;
191
192	void SeekStart ();
193	unsigned long TranslateNum (unsigned long num);
194
195	public:
196	OccurToDictConverter ();
197	~OccurToDictConverter ();
198
199	void Open (char *filename, unsigned long _wordDictSize,
200	unsigned long _tagDictSize);
201
202	// Close frees all allocated memory
203	void Close ();
204
205	unsigned long TranslateWord (unsigned long occurNum)
206	{ return TranslateNum (occurNum); }
207	unsigned long TranslateTag (unsigned long occurNum)
208	{ return TranslateNum (occurNum+wordDictSize); }
209	};
210
211
212	struct InvfStateRec {
213	mg_ullong start;
214	mg_ullong here;
215	unsigned long lastFragNum;
216	unsigned long B;
217
218	void Clear () {
219	start = here = 0;
220	lastFragNum = B = 0;
221	}
222	InvfStateRec () { Clear (); }
223	};
224
225
226	#define ISR_SIZE 1024
227
228	class InvfStateCache {
229	protected:
230	InvfStateRec recCache [ISR_SIZE];
231	unsigned long startNum;
232
233	FILE *stateFile;
234
235	void ClearCache () {
236	unsigned int i = 0;
237	for (i=0; i<ISR_SIZE; ++i) recCache[i].Clear();
238	}
239
240	public:
241	InvfStateCache ();
242	~InvfStateCache ();
243
244	void Open (char *filename);
245	void Close ();
246
247	// previous references to state records may be
248	// invalidated calling GetRec
249	InvfStateRec &GetRec (unsigned long num);
250	};
251
252
253	static invf_dict_header idh;
254	static WordBitPtrs bitPtrs;
255
256	static FILE *chunkFile = NULL;
257	static stdio_bitio_buffer chunkBuf;
258
259	static unsigned long ivfMemBufSize = 0;
260	static char *ivfMemBuf = NULL;
261
262	// word and tag dictionaries. a map is used for the tag dictionary
263	// as it should never be very big (and the perfect hash function
264	// sometimes has trouble with small values).
265	static perf_hash_data *wordHashDict = NULL;
266	static TagMapDict tagMapDict;
267
268	// information about all the different levels
269	static FIvfLevel ivfLevel;
270
271	static OccurToDictConverter occurConvert;
272
273	// information about the state of the inverted file
274	static InvfStateCache invfState;
275
276	static char collectFilename[512];
277
278
279	void WordBitPtrs::Clear () {
280	numWords = 0;
281	numTags = 0;
282	size=0;
283	if (wordBitPtrs != NULL) delete [] wordBitPtrs;
284	wordBitPtrs = NULL;
285	}
286
287	WordBitPtrs::~WordBitPtrs () {
288	if (wordBitPtrs != NULL) delete [] wordBitPtrs;
289	}
290
291	void WordBitPtrs::SetSize (unsigned long _numWords,
292	unsigned long _numTags){
293	Clear();
294	numWords = _numWords;
295	numTags = _numTags;
296	size = numWords + numTags + 1;
297	wordBitPtrs = new BitPtr [size];
298	}
299
300
301	void OccurToDictConverter::SeekStart () {
302	if (transFile == NULL) return;
303	rbs.SEEK_X (sizeof (unsigned long) * 8);
304	pos = 0;
305	}
306
307	unsigned long OccurToDictConverter::TranslateNum (unsigned long num) {
308	if (num < pos) SeekStart ();
309	while (pos <= num) {
310	if (pos < wordDictSize)
311	val = rbs.binary_decode (wordDictSize + 1, NULL) - 1;
312	else
313	val = rbs.binary_decode (tagDictSize + 1, NULL) - 1;
314	++pos;
315	}
316	return val;
317	}
318
319	OccurToDictConverter::OccurToDictConverter () {
320	pos = 0;
321	val = 0;
322	transFile = NULL;
323	wordDictSize = 0;
324	tagDictSize = 0;
325	}
326
327	OccurToDictConverter::~OccurToDictConverter () {
328	if (transFile != NULL) Close ();
329	}
330
331	void OccurToDictConverter::Open (char *filename, unsigned long _wordDictSize,
332	unsigned long _tagDictSize) {
333	if (transFile != NULL) Close ();
334
335	wordDictSize = _wordDictSize;
336	tagDictSize = _tagDictSize;
337
338	transFile = open_file (filename, INVF_CHUNK_TRANS_SUFFIX, "rb",
339	MAGIC_CHUNK_TRANS, MG_ABORT);
340	rbs.attachFile (transFile, RND_BUF_SIZE);
341	SeekStart ();
342	val = 0;
343	}
344
345	void OccurToDictConverter::Close () {
346	if (transFile == NULL) return;
347
348	rbs.done ();
349	fclose (transFile);
350	transFile = NULL;
351	pos = 0;
352	val = 0;
353
354	wordDictSize = 0;
355	tagDictSize = 0;
356	}
357
358
359
360
361	InvfStateCache::InvfStateCache () {
362	startNum = 0;
363	stateFile = NULL;
364	}
365
366	InvfStateCache::~InvfStateCache () {
367	if (stateFile != NULL) Close ();
368	}
369
370	void InvfStateCache::Open (char *filename) {
371	if (stateFile != NULL) Close();
372
373	// open the state file
374	char path[512];
375	sprintf (path, FILE_NAME_FORMAT ".%ld", get_basepath (), filename,
376	".invf.state", (long) getpid ());
377	if (!(stateFile = fopen (path, "wb+"))) {
378	Message ("Unable to create \"%s\"", path);
379	exit (1);
380	}
381	unlink (path); // file will be deleted after it is closed
382	// reset the buffer
383	startNum = 0;
384	ClearCache();
385	}
386
387	void InvfStateCache::Close () {
388	if (stateFile == NULL) return;
389	fclose (stateFile);
390	stateFile = NULL;
391	startNum = 0;
392	}
393
394	InvfStateRec &InvfStateCache::GetRec (unsigned long num) {
395	// see if cached
396	if ((num >= startNum) && (num < startNum + ISR_SIZE))
397	return recCache[num-startNum];
398
399	// not cached, write out this lot of records and read in
400	fseek (stateFile, startNum*sizeof (InvfStateRec), SEEK_SET);
401	fwrite ((char *) recCache, sizeof (InvfStateRec), ISR_SIZE, stateFile);
402
403	// read in the new set of records
404	ClearCache ();
405	startNum = num - (num % ISR_SIZE);
406	fseek (stateFile, startNum*sizeof (InvfStateRec), SEEK_SET);
407	fread ((char *) recCache, sizeof (InvfStateRec), ISR_SIZE, stateFile);
408
409	return recCache[num-startNum];
410	}
411
412
413
// Sets the first "size" bytes of "buf" to zero. A NULL buffer or a
// size of zero is a no-op.
static void ClearCharBuf (char *buf, unsigned long size) {
  if (buf != NULL && size > 0) memset (buf, 0, size);
}
418
419	static void ReadWordDict (char *filename) {
420	// read in the perfect hash function for the word dictionary
421	FILE *wordHashFile = open_file (filename, INVF_DICT_HASH_SUFFIX, "rb",
422	MAGIC_HASH, MG_ABORT);
423	if (!(wordHashDict = read_perf_hash_data (wordHashFile))) {
424	FatalError (1, "Unable to read in hash data for word dictionary");
425	}
426	fclose (wordHashFile);
427	}
428
429	static void ReadTagDict (char *filename, invf_dict_header &_idh) {
430	// open the file
431	FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
432	MAGIC_STEM_BUILD, MG_ABORT);
433
434	// seek to the start of the tag dictionary
435	fseek (dictFile, _idh.tag_dict_start, SEEK_SET);
436
437	unsigned long tagNum;
438	dict_el thisEl;
439	for (tagNum = 0; tagNum < _idh.tag_dict_size; ++tagNum) {
440	thisEl.Read (dictFile);
441	tagMapDict[thisEl.el].tagNum = tagNum;
442	}
443
444	fclose (dictFile);
445	}
446
447	static void ReadLevelFile (char *filename) {
448	FILE *f;
449	f = open_file (filename, INVF_LEVEL_SUFFIX, "rb",
450	MAGIC_INVF_LEVELS, MG_ABORT);
451	ivfLevel.Read (f);
452	fclose (f);
453	}
454
455	void CheckIntOverflow (mg_ullong totalIBits, mg_ullong lastTotalIBits) {
456	if (totalIBits < lastTotalIBits) {
457	fprintf(stderr, "ERROR: The totalIBits counter (%d byte unsigned integer) has overflowed.\n", sizeof (mg_ullong));
458	if (sizeof (mg_ullong) < 8) {
459	fprintf(stderr, " Try compiling with GCC to enable use of 8 bytes for this counter.\n");
460	}
461	fprintf(stderr, " Build aborted.\n");
462	exit(1);
463	}
464	}
465
466	// assumes the inverted file state file has been opened
467	static void InitInvfState (char *filename,
468	invf_dict_header &_idh,
469	InvfStateCache &_invfState,
470	mg_ullong &totalIBits,
471	bool wordLevelIndex) {
472	// read in the dictionary, setting inverted state information
473	FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
474	MAGIC_STEM_BUILD, MG_ABORT);
475
476	// seek to the start of the word dictionary
477	fseek (dictFile, _idh.word_dict_start, SEEK_SET);
478
479	// add the word entries
480	word_dict_el wordEl;
481	wordEl.SetNumLevels (_idh.num_levels);
482	unsigned long dictWordNum, p;
483	mg_ullong lastTotalIBits;
484	unsigned long N = _idh.num_frags;
485	for (dictWordNum=0; dictWordNum<_idh.word_dict_size; ++dictWordNum) {
486	// lastTotalIBits is used to detect integer overflow
487	lastTotalIBits = totalIBits;
488
489	// read the next word and associated information
490	wordEl.Read (dictFile, _idh.num_levels);
491
492	// update the state record
493	p = wordEl.frag_occur;
494	InvfStateRec &wisr = _invfState.GetRec (dictWordNum);
495	wisr.start = totalIBits;
496	wisr.here = totalIBits;
497	wisr.B = BIO_Bblock_Init (N, p);
498
499	// add the length of the fragment numbers
500	totalIBits += BIO_Bblock_Bound_b (N, p, wisr.B);
501
502	// if needed, add the length of the fragment frequency information
503	if (!wordLevelIndex)
504	totalIBits += BIO_Gamma_Bound (wordEl.freq, wordEl.frag_occur);
505
506	// align next byte
507	#ifdef USE_LONG_LONG
508	totalIBits = (totalIBits + 7ull) & 0xfffffffffffffff8ull;
509	#else
510	totalIBits = (totalIBits + 7ul) & 0xfffffff8ul;
511	#endif
512
513	CheckIntOverflow (totalIBits, lastTotalIBits);
514	}
515
516
517	// seek to the start of the tag dictionary
518	fseek (dictFile, _idh.tag_dict_start, SEEK_SET);
519
520	// add the tag entries
521	dict_el tagEl;
522	unsigned long dictTagNum;
523	N = _idh.num_frags;
524	for (dictTagNum=0; dictTagNum<_idh.tag_dict_size; ++dictTagNum) {
525	// lastTotalIBits is used to detect integer overflow
526	lastTotalIBits = totalIBits;
527
528	// read the next tag and associated information
529	tagEl.Read (dictFile);
530
531	// update the state record
532	p = tagEl.frag_occur*2;
533	InvfStateRec &tisr = _invfState.GetRec (dictTagNum + _idh.word_dict_size);
534	tisr.start = totalIBits;
535	tisr.here = totalIBits;
536	tisr.B = BIO_Bblock_Init (N+p, p);
537
538	// add the length of the fragment numbers (two numbers for each
539	// tag, one for start and one for end)
540	totalIBits += BIO_Bblock_Bound_b (N+p, p, tisr.B);
541
542	// align next byte
543	#ifdef USE_LONG_LONG
544	totalIBits = (totalIBits + 7ull) & 0xfffffffffffffff8ull;
545	#else
546	totalIBits = (totalIBits + 7ul) & 0xfffffff8ul;
547	#endif
548
549	CheckIntOverflow (totalIBits, lastTotalIBits);
550	}
551
552	fclose (dictFile);
553	}
554
555	/*
556	// assumes the chunk tag information has been placed in .first
557	static void PrintChunkInfo (unsigned long chunkMem,
558	unsigned long numChunkWords,
559	unsigned long numChunkTags) {
560	static unsigned long chunksRead = 0;
561	++chunksRead;
562	cout << "Chunk Number: " << chunksRead << "\n";
563	cout << "numChunkDocs " << numDocsInChunk << "\n";
564	cout << "numChunkFrags " << numFragsInChunk << "\n";
565	cout << "mem " << chunkMem << "\n";
566	cout << "numWords " << numChunkWords << "\n";
567	cout << "numTags " << numChunkTags << "\n\n";
568
569	TagMapDict::iterator tagMapHere = tagMapDict.begin();
570	TagMapDict::iterator tagMapEnd = tagMapDict.end();
571	while (tagMapHere != tagMapEnd) {
572	unsigned long tagMapNum = (*tagMapHere).second.tagNum;
573	cout << (*tagMapHere).first << " " << tagMapNum << " "
574	<< bitPtrs.GetTagBitPtr(tagMapNum).here << "\n";
575	++tagMapHere;
576	}
577	}
578	*/
579
580	void ReadChunk (invf_dict_header &_idh, bool wordLevelIndex) {
581	// reset globals
582	numChunkDocs = 0;
583	chunkStartFragNum = numFrags;
584
585	// read in information about this chunk
586	numDocsInChunk = chunkBuf.gamma_decode (NULL) - 1;
587	if (numDocsInChunk == 0)
588	FatalError (1, "The number of docs in the current chunk is 0");
589
590	numFragsInChunk = chunkBuf.gamma_decode (NULL) - 1;
591	unsigned long chunkMem = chunkBuf.gamma_decode (NULL) - 1;
592
593	if (chunkMem > ivfMemBufSize)
594	FatalError (1, "Chunk memory size is greater than maximum");
595
596	unsigned long numChunkWords = chunkBuf.gamma_decode (NULL) - 1;
597	unsigned long numChunkTags = chunkBuf.gamma_decode (NULL) - 1;
598
599
600	// reset stuff
601	ClearCharBuf (ivfMemBuf, ivfMemBufSize);
602	bitPtrs.ResetPtrs();
603
604	// read in the entries in occurrence order storing the
605	// "chunkWordCount" in "start" and the "chunkFragCount"
606	// in "here"
607	unsigned long numOccur;
608	unsigned long wordNum;
609	for (numOccur=0; numOccur<numChunkWords; ++numOccur) {
610	wordNum = occurConvert.TranslateWord (numOccur);
611	BitPtr &wordPtr = bitPtrs.GetWordBitPtr (wordNum);
612	wordPtr.start = chunkBuf.gamma_decode (NULL) - 1;
613	if (wordPtr.start >= 2)
614	wordPtr.here = chunkBuf.gamma_decode (NULL);
615	else wordPtr.here = wordPtr.start;
616	}
617	unsigned long tagNum;
618	for (numOccur=0; numOccur<numChunkTags; ++numOccur) {
619	tagNum = occurConvert.TranslateTag (numOccur);
620	BitPtr &tagPtr = bitPtrs.GetTagBitPtr (tagNum);
621	// only chunkFragCount is encoded for tags
622	tagPtr.start = chunkBuf.gamma_decode (NULL) - 1;
623	tagPtr.here = tagPtr.start;
624	}
625
626	/* PrintChunkInfo (chunkMem, numChunkWords, numChunkTags);*/
627
628	// create the bit ptrs in dictionary order
629	unsigned long totalIBits = 0; // only dealing with memory
630	unsigned long chunkWordCount, chunkFragCount;
631	for (wordNum=0; wordNum<_idh.word_dict_size; ++wordNum) {
632	BitPtr &wordPtr = bitPtrs.GetWordBitPtr (wordNum);
633	chunkWordCount = wordPtr.start;
634	chunkFragCount = wordPtr.here;
635	wordPtr.start = totalIBits;
636	wordPtr.here = totalIBits;
637	wordPtr.lastFragNum = chunkStartFragNum;
638	wordPtr.lgB = 0;
639	if (chunkWordCount > 0) {
640	wordPtr.lgB = floorlog_2 (BIO_Bblock_Init_W (numFragsInChunk,
641	chunkFragCount));
642	totalIBits += BIO_Bblock_Bound (numFragsInChunk, chunkFragCount);
643	// use unary encoding for memory buffer encoding of fragment freq
644	if (!wordLevelIndex) {
645	totalIBits += chunkWordCount;
646	}
647	}
648	}
649	for (tagNum=0; tagNum<_idh.tag_dict_size; ++tagNum) {
650	BitPtr &tagPtr = bitPtrs.GetTagBitPtr (tagNum);
651	chunkFragCount = tagPtr.here;
652	tagPtr.start = totalIBits;
653	tagPtr.here = totalIBits;
654	tagPtr.lastFragNum = chunkStartFragNum;
655	tagPtr.lgB = 0;
656	if (chunkFragCount > 0) {
657	unsigned long pTag = chunkFragCount*2;
658	tagPtr.lgB = floorlog_2 (BIO_Bblock_Init_W (numFragsInChunk+pTag,
659	pTag));
660	unsigned long bLen = BIO_Bblock_Bound (numFragsInChunk+pTag,
661	pTag);
662	// cout << tagNum + _idh.word_dict_size << " ";
663	// cout << "numFrags: " << numFragsInChunk
664	// << " chunkFragCount: " << chunkFragCount
665	// << " B: " << 1 << tagPtr.lgB
666	// << " blen: " << blen << "\n";
667	totalIBits += bLen;
668	}
669	}
670	bitPtrs.GetEndStart() = totalIBits;
671	bitPtrs.GetEndHere() = totalIBits;
672
673	if ((totalIBits + 7ul) >> 3ul > chunkMem) {
674	cerr << "totalIBits: " << totalIBits << "\n";
675	cerr << "bytes: " << ((totalIBits + 7ul) >> 3ul) << "\n";
676	cerr << "chunkMem: " << chunkMem << "\n";
677	FatalError (1, "Pointers exceed buffer size");
678	}
679	}
680
681
682
683
684	int init_ivf_2 (const TagInfo &/tagInfo/, char *filename) {
685	// read in compressed dictionary header
686	FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
687	MAGIC_STEM_BUILD, MG_ABORT);
688	idh.Read (dictFile);
689	fclose (dictFile);
690
691	// set the size of the bit ptrs
692	bitPtrs.SetSize (idh.word_dict_size, idh.tag_dict_size);
693
694	// open the chunk file and read in the maximum memory needed
695	// for the inverted memory buffer
696	chunkFile = open_file (filename, INVF_CHUNK_SUFFIX, "rb",
697	MAGIC_CHUNK, MG_ABORT);
698	ReadUL (chunkFile, ivfMemBufSize);
699	chunkBuf.attachFile (chunkFile);
700
701	// allocate memory for the inverted buffer
702	ivfMemBuf = new char [ivfMemBufSize];
703	ClearCharBuf (ivfMemBuf, ivfMemBufSize);
704
705	// read in the word dictionary
706	ReadWordDict (filename);
707
708	// read in the tag dictionary
709	ReadTagDict (filename, idh);
710
711	// read in the level information
712	ReadLevelFile (filename);
713	bool wordLevelIndex = ivfLevel.indexLevel.empty();
714
715	// set up the translation table file
716	occurConvert.Open (filename, idh.word_dict_size, idh.tag_dict_size);
717
718	// reset some globals
719	numDocs = 0;
720	numChunkDocs = 0;
721	numDocsInChunk = 0;
722	numFrags = 0;
723	numFragsInChunk = 0;
724	chunkStartFragNum = 0;
725
726	strcpy (collectFilename, filename);
727
728
729	// create the inverted file
730	mg_ullong totalIBits = 0;
731	FILE *invfFile = create_file (filename, INVF_SUFFIX, "wb",
732	MAGIC_INVF, MG_ABORT);
733	totalIBits += sizeof (unsigned long) * 8; // magic number
734	totalIBits += 8 * 200; // 200 byte gap -- why??????
735	fclose (invfFile);
736
737	// init the inverted file state cache
738	invfState.Open (filename);
739	InitInvfState (filename, idh, invfState, totalIBits, wordLevelIndex);
740
741	return COMPALLOK;
742	}
743
744	static void CloseTextTag (IP2TagInfo &tInfo, const UCArray &/tagName/) {
745	if (!tInfo.inTag) return;
746
747	// add this tag to the inverted list
748	BitPtr &tagBitPtr = bitPtrs.GetTagBitPtr (tInfo.tagNum);
749	unsigned long endFrag = numFrags;
750	int b = 1 << tagBitPtr.lgB;
751
752	/*
753	cout << (tInfo.tagNum+idh.word_dict_size) << " \"<" << tagName << ">\" "
754	<< tInfo.startFrag << " " << endFrag << "\n";
755	*/
756
757	mems_bitio_buffer buffer ((u_char *) ivfMemBuf, tagBitPtr.here);
758	buffer.bblock_encode (tInfo.startFrag - tagBitPtr.lastFragNum + 1,
759	b, NULL);
760	buffer.bblock_encode (endFrag - tInfo.startFrag + 1, b, NULL);
761	tagBitPtr.lastFragNum = endFrag;
762	tagBitPtr.here = buffer.position();
763	buffer.encodeDone();
764
765	// check for buffer overrun
766	bitPtrs.CheckTagBufOverrun (tInfo.tagNum);
767
768	// reset information about this tag
769	tInfo.inTag = false;
770	tInfo.startFrag = 0;
771	}
772
773	static void ProcessOpenTag (const TextEl &el, bool &inFrag) {
774	// close tag if open
775	IP2TagInfo &tInfo = tagMapDict[el.tagName];
776	if (tInfo.inTag) CloseTextTag (tInfo, el.tagName);
777
778	// open this tag
779	tInfo.inTag = true;
780	tInfo.startFrag = numFrags;
781
782	// check for start of next fragment
783	bool wordLevelIndex = ivfLevel.indexLevel.empty();
784	if (!wordLevelIndex && el.tagName == ivfLevel.indexLevel) {
785	++numFrags;
786	inFrag = true;
787	}
788	}
789
790	static void ProcessCloseTag (const TextEl &el, bool &inFrag) {
791	// check for end of fragment
792	bool wordLevelIndex = ivfLevel.indexLevel.empty();
793	if (!wordLevelIndex && el.tagName == ivfLevel.indexLevel) {
794	inFrag = false;
795	}
796
797	IP2TagInfo &tInfo = tagMapDict[el.tagName];
798	CloseTextTag (tInfo, el.tagName);
799	}
800
801	static void ProcessText (const TextEl &el, bool &inFrag) {
802	// make sure this text is to be indexed
803	bool wordLevelIndex = ivfLevel.indexLevel.empty();
804	if (!wordLevelIndex && !inFrag) return;
805
806	const unsigned char *textHere = &(el.text[0]);
807	const unsigned char *textEnd = &(el.text[el.text.size() - 1]);
808	unsigned char mgWord[MAXSTEMLEN + 1];
809
810	if (!inaword_mgpp (textHere, textEnd))
811	ParseNonindexWord (textHere, textEnd);
812
813
814	// Alternately parse off words and non-words from the input
815
816	while (textHere <= textEnd) {
817	textHere = ParseIndexMGWord (textHere, textEnd, mgWord);
818	textHere = ParseNonindexWord (textHere, textEnd);
819
820	if (mgWord[0] > 0) {
821	if (wordLevelIndex) ++numFrags;
822
823	unsigned long wordNum = perf_hash (wordHashDict, mgWord);
824
825	/*
826	cout << wordNum << " \"";
827	cout.write (mgWord+1, *mgWord);
828	cout << "\" " << numFrags << "\n";
829	*/
830
831	// add this word to the inverted list
832	BitPtr &wordBitPtr = bitPtrs.GetWordBitPtr (wordNum);
833	unsigned long fragNum = numFrags;
834	int b = 1 << wordBitPtr.lgB;
835
836	mems_bitio_buffer buffer ((u_char *) ivfMemBuf, wordBitPtr.here);
837
838	// note: this assumes that fragments don't carry over between
839	// chunks (which they don't because all tags are closed at the
840	// end of each document and chunks are based on document
841	// boundaries), i.e. the first fragment number must be greater
842	// than the starting fragment number of the chunk.
843	if (fragNum > wordBitPtr.lastFragNum) {
844	buffer.bblock_encode ((fragNum - wordBitPtr.lastFragNum - 1) + 1,
845	b, NULL);
846	if (!wordLevelIndex) buffer.encodeBit (1); // freq = 1
847
848	} else if (!wordLevelIndex) {
849	// add one to the frequency count for this word
850	buffer.seek (buffer.position()-1);
851	buffer.encodeBit (0); // unary encoding -- last = 1
852	buffer.encodeBit (1);
853	}
854
855	wordBitPtr.lastFragNum = fragNum;
856	wordBitPtr.here = buffer.position();
857	buffer.encodeDone();
858
859	// check for buffer overrun
860	bitPtrs.CheckWordBufOverrun (wordNum);
861	}
862	}
863	}
864
865	// combine the in memory inverted buffer with the disk
866	// based inverted file
867	static void DiskMerge (char *filename) {
868	bool wordLevelIndex = ivfLevel.indexLevel.empty();
869
870	// make sure we have something to process
871	if (numChunkDocs <= 0) return;
872
873	// open the inverted file
874	FILE *invfFile = open_file (filename, INVF_SUFFIX, "rb+",
875	MAGIC_INVF, MG_ABORT);
876	random_bitio_buffer invfOutBuf (invfFile);
877
878	// set up to decode the entries in memory
879	mems_bitio_buffer memInBuf ((u_char *) ivfMemBuf, 0);
880
881	// write out the word information
882	unsigned long wordNum;
883	int b;
884	unsigned long currFragNum;
885	unsigned long delta;
886	unsigned long currFreq;
887	for (wordNum=0; wordNum<idh.word_dict_size; ++wordNum) {
888	// go to the end of the last inverted file entry
889	InvfStateRec &wordDiskState = invfState.GetRec (wordNum);
890	invfOutBuf.SEEK_X (wordDiskState.here);
891
892	// go to the start of the inverted chunk info in memory
893	BitPtr &wordBitPtr = bitPtrs.GetWordBitPtr (wordNum);
894	memInBuf.seek(wordBitPtr.start);
895
896	// decode each entry and re-write to disk
897	currFragNum = chunkStartFragNum;
898	while (memInBuf.position() < wordBitPtr.here) {
899	// decode word entry
900	b = 1 << wordBitPtr.lgB;
901	delta = memInBuf.bblock_decode (b, NULL);
902	currFragNum += delta;
903	if (!wordLevelIndex) currFreq = memInBuf.unary_decode (NULL);
904	else currFreq = 1;
905
906	// recode on disk
907	invfOutBuf.bblock_encode (currFragNum-wordDiskState.lastFragNum,
908	wordDiskState.B, NULL);
909	if (!wordLevelIndex) invfOutBuf.gamma_encode (currFreq, NULL);
910	wordDiskState.lastFragNum = currFragNum;
911	}
912
913	wordDiskState.here = invfOutBuf.TELL_X();
914	}
915
916	// write out the tag information
917	unsigned long tagNum;
918	unsigned long currTagStart;
919	unsigned long currTagEnd;
920	for (tagNum=0; tagNum<idh.tag_dict_size; ++tagNum) {
921	// go to the end of the last inverted file entry
922	InvfStateRec &tagDiskState = invfState.GetRec (tagNum+idh.word_dict_size);
923	invfOutBuf.SEEK_X (tagDiskState.here);
924
925	// go to the start of the inverted chunk info in memory
926	BitPtr &tagBitPtr = bitPtrs.GetTagBitPtr (tagNum);
927	memInBuf.seek(tagBitPtr.start);
928
929	// decode each entry and re-write to disk
930	currTagEnd = chunkStartFragNum;
931	while (memInBuf.position() < tagBitPtr.here) {
932	// decode tag entry
933	b = 1 << tagBitPtr.lgB;
934	delta = memInBuf.bblock_decode (b, NULL) - 1;
935	currTagStart = currTagEnd + delta;
936	delta = memInBuf.bblock_decode (b, NULL) - 1;
937	currTagEnd = currTagStart + delta;
938
939	// recode on disk
940	invfOutBuf.bblock_encode (currTagStart-tagDiskState.lastFragNum+1,
941	tagDiskState.B, NULL);
942	invfOutBuf.bblock_encode (currTagEnd-currTagStart+1,
943	tagDiskState.B, NULL);
944
945	tagDiskState.lastFragNum = currTagEnd;
946	}
947
948	tagDiskState.here = invfOutBuf.TELL_X();
949	}
950
951	memInBuf.done();
952
953	invfOutBuf.encodeDone();
954	fclose (invfFile);
955	}
956
957
958	int process_ivf_2 (const TagInfo &/tagInfo/, const TextElArray &doc) {
959	bool wordLevelIndex = ivfLevel.indexLevel.empty();
960	bool inFrag = false;
961	if (wordLevelIndex) inFrag = true; // unconditional
962
963	// get next chunk information if need to. the chunk information
964	// is needed before the first document is processed
965	if (numChunkDocs >= numDocsInChunk) ReadChunk (idh, wordLevelIndex);
966
967	// process each text element
968	TextElArray::const_iterator here = doc.begin();
969	TextElArray::const_iterator end = doc.end();
970	while (here != end) {
971	// process this element
972	if ((here).elType == OpenTagE) ProcessOpenTag (here, inFrag);
973	else if ((here).elType == CloseTagE) ProcessCloseTag (here, inFrag);
974	else ProcessText (*here, inFrag);
975
976	++here;
977	}
978
979	// close off any unclosed tags
980	TagMapDict::iterator tdHere = tagMapDict.begin();
981	TagMapDict::iterator tdEnd = tagMapDict.end();
982	while (tdHere != tdEnd) {
983	CloseTextTag ((tdHere).second, (tdHere).first);
984	++tdHere;
985	}
986
987	// we've processed one more document
988	++numDocs;
989	++numChunkDocs;
990
991	// merge the memory based inverted file with the one on
992	// disk if this is the end of this chunk
993	if (numChunkDocs >= numDocsInChunk) DiskMerge (collectFilename);
994
995	return COMPALLOK;
996	}
997
998
999	static void CondenseInvfFile (char *filename, unsigned long &bytesOutput) {
1000	FILE *inInvfFile = open_file (filename, INVF_SUFFIX, "rb",
1001	MAGIC_INVF, MG_ABORT);
1002	FILE *outInvfFile = open_file (filename, INVF_SUFFIX, "rb+",
1003	MAGIC_INVF, MG_ABORT);
1004
1005	// skip the magic number
1006	fseek (outInvfFile, sizeof (unsigned long), SEEK_SET);
1007
1008	// write the inverted file header -- use defaults for most things
1009	invf_file_header ifh;
1010	ifh.no_of_words = idh.word_dict_size;
1011	ifh.no_of_tags = idh.tag_dict_size;
1012	ifh.word_level_index = (ivfLevel.indexLevel.empty()) ? 1 : 0;
1013	ifh.Write (outInvfFile);
1014
1015	bytesOutput = ftell (outInvfFile);
1016
1017	// process each meaningful byte in the file
1018	unsigned long numEntries = ifh.no_of_words + ifh.no_of_tags;
1019	unsigned long entryNum;
1020	mg_ullong lastStart = 0;
1021	for (entryNum = 0; entryNum < numEntries; ++entryNum) {
1022	InvfStateRec &stateRec = invfState.GetRec (entryNum);
1023
1024	// overrun check
1025	if (stateRec.start < lastStart)
1026	FatalError (1, "Inverted file Buffer overrun");
1027	lastStart = stateRec.start;
1028
1029	unsigned long oldEntryStart = stateRec.start >> 3;
1030	unsigned long oldEntryStartOver = stateRec.start & 7; // should be 0
1031	unsigned long oldEntryEnd = (stateRec.here + 7) >> 3; // byte after end
1032	unsigned long oldEntryEndOver = stateRec.here & 7;
1033
1034	fseek (inInvfFile, oldEntryStart, SEEK_SET);
1035
1036	stateRec.here -= stateRec.start;
1037	stateRec.start = (mg_ullong)bytesOutput * 8 + oldEntryStartOver;
1038	stateRec.here += stateRec.start;
1039	while (oldEntryStart < oldEntryEnd) {
1040	unsigned char c = getc (inInvfFile);
1041	if (oldEntryStart == oldEntryEnd - 1) {
1042	u_char ands[8] =
1043	{0xff, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe};
1044	c &= ands[oldEntryEndOver];
1045	}
1046	putc (c, outInvfFile);
1047	++bytesOutput;
1048	++oldEntryStart;
1049	}
1050	}
1051
1052	fclose (inInvfFile);
1053
1054	#ifdef __WIN32__
1055	if (_chsize (_fileno (outInvfFile), bytesOutput) != 0)
1056	Message ("Could not truncate invf.");
1057	#else
1058	ftruncate (fileno (outInvfFile), bytesOutput);
1059	#endif
1060
1061	fclose (outInvfFile);
1062	}
1063
1064	static void OutputInvfIdx (char *filename, unsigned long invfNumBytes) {
1065	FILE *invfIdxFile = create_file (filename, INVF_IDX_SUFFIX, "wb",
1066	MAGIC_INVI, MG_ABORT);
1067
1068	// process each meaningful byte in the file
1069	unsigned long numEntries = idh.word_dict_size + idh.tag_dict_size;
1070	unsigned long entryNum;
1071	for (entryNum = 0; entryNum < numEntries; ++entryNum) {
1072	InvfStateRec &stateRec = invfState.GetRec (entryNum);
1073
1074	// assumes that inverted entries start at beginning of each byte
1075	if (!WriteUL (invfIdxFile, (stateRec.start >> 3))) break;
1076	}
1077
1078	WriteUL (invfIdxFile, invfNumBytes);
1079
1080	fclose (invfIdxFile);
1081	}
1082
1083
1084	int done_ivf_2 (const TagInfo &/tagInfo/, char *filename) {
1085	// close most open files
1086	if (chunkFile != NULL) {
1087	chunkBuf.done();
1088	fclose (chunkFile);
1089	chunkFile = NULL;
1090	}
1091	occurConvert.Close();
1092
1093	// free allocated memory
1094	bitPtrs.Clear();
1095	if (ivfMemBuf != NULL) { delete [] ivfMemBuf; ivfMemBuf = NULL; }
1096	free_perf_hash (wordHashDict);
1097	wordHashDict = NULL;
1098	tagMapDict.erase (tagMapDict.begin(), tagMapDict.end());
1099
1100	// condense the inverted file and truncate it
1101	// this function also writes out the inverted header
1102	unsigned long invfNumBytes = 0;
1103	CondenseInvfFile (filename, invfNumBytes);
1104
1105	OutputInvfIdx (filename, invfNumBytes);
1106
1107	// close the rest of the open files
1108	invfState.Close ();
1109
1110	return COMPALLOK;
1111	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: