Context Navigation

ivf.pass2.cpp@ 25147

Last change on this file since 25147 was 25147, checked in by kjdon, 12 years ago
merged 64_bit_Greenstone branch into trunk, rev 25139
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 31.3 KB

Line
1	/**************************************************************************
2	*
3	* ivf.pass2.cpp -- Memory efficient pass 2 inversion
4	* Copyright (C) 1999 Rodger McNab
5	*
6	* This program is free software; you can redistribute it and/or modify
7	* it under the terms of the GNU General Public License as published by
8	* the Free Software Foundation; either version 2 of the License, or
9	* (at your option) any later version.
10	*
11	* This program is distributed in the hope that it will be useful,
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	* GNU General Public License for more details.
15	*
16	* You should have received a copy of the GNU General Public License
17	* along with this program; if not, write to the Free Software
18	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19	*
20	**************************************************************************/
21
22	#define _XOPEN_SOURCE 1
23	// This was added for Solaris, but it makes things worse on Solaris for me...
24	// #define _XOPEN_SOURCE_EXTENDED 1
25
26	#include <stdio.h>
27
28	#if defined __WIN32__
29	# include <process.h>
30	# include <io.h>
31	# define getpid _getpid
32	# define unlink _unlink
33	#else
34	# include <unistd.h>
35	# include "non_ansi.h"
36	#endif
37
38	#include "UCArray.h"
39	#include "sysfuncs.h"
40	#include "mg_files.h"
41	#include "invf.h"
42	#include "mg.h"
43	#include "build.h"
44	#include "locallib.h"
45	#include "bitio_m_random.h"
46	#include "bitio_m_stdio.h"
47	#include "bitio_m_mems.h"
48	#include "bitio_gen.h"
49	#include <stdio.h>
50	#include "words.h"
51	#include "messages.h"
52	#include "netorder.h"
53	#include "FIvfLevelInfo.h"
54	#include "perf_hash.h"
55	#include "string.h"
56
57	#include "longlong.h"
58
59	#if defined(GSDL_USE_OBJECTSPACE)
60	# include <ospace\std\map>
61	#elif defined(GSDL_USE_STL_H)
62	# include <map.h>
63	#else
64	# include <map>
65	#endif
66
67
68	#ifdef USE_LONG_LONG
69	#define SEEK_X seek_LL
70	#define TELL_X tell_LL
71	#else
72	#define SEEK_X seek
73	#define TELL_X tell
74	#endif
75
76	#ifndef RND_BUF_SIZE
77	#define RND_BUF_SIZE 8*1024
78	#endif
79
80
81	static mg_u_long numDocs = 0;
82	static mg_u_long numChunkDocs = 0;
83	static mg_u_long numDocsInChunk = 0;
84
85	static mg_u_long numFrags = 0;
86	static mg_u_long numFragsInChunk = 0;
87	static mg_u_long chunkStartFragNum = 0;
88
89
90
91
92	struct BitPtr {
93	mg_u_long start;
94	mg_u_long here;
95	mg_u_long lastFragNum;
96	mg_u_long lgB;
97
98	void Clear () { start = here = lastFragNum = lgB = 0; }
99	BitPtr () { Clear(); }
100	};
101
102	class WordBitPtrs {
103	protected:
104	mg_u_long numWords;
105	mg_u_long numTags;
106	mg_u_long size;
107	BitPtr *wordBitPtrs;
108
109	void CheckBufOverrun (mg_u_long num) {
110	if (wordBitPtrs[num].here > wordBitPtrs[num+1].start) {
111	cerr << "numDocs: " << numDocs << "\n";
112	cerr << "numChunkDocs: " << numChunkDocs << "\n";
113	cerr << "numDocsInChunk: " << numDocsInChunk << "\n";
114	cerr << "numFrags: " << numFrags << "\n";
115	cerr << "numFragsInChunk: " << numFragsInChunk << "\n";
116	cerr << "chunkStartFragNum: " << chunkStartFragNum << "\n";
117	cerr << "num: " << num << "\n";
118	cerr << "[num].start: " << wordBitPtrs[num].start << "\n";
119	cerr << "[num].here: " << wordBitPtrs[num].here << "\n";
120	cerr << "[num+1].start: " << wordBitPtrs[num+1].start << "\n";
121	FatalError (1, "Bit buffer overrun");
122	}
123	}
124
125	public:
126	void Clear ();
127	WordBitPtrs () { wordBitPtrs = NULL; Clear(); }
128	~WordBitPtrs ();
129	void SetSize (mg_u_long _numWords,
130	mg_u_long _numTags);
131
132	void ResetPtrs () {
133	if (wordBitPtrs == NULL) return;
134	mg_u_long i;
135	for (i=0; i<size; ++i) wordBitPtrs[i].Clear();
136	}
137
138	BitPtr &GetWordBitPtr (mg_u_long wordNum)
139	{ return wordBitPtrs[wordNum]; }
140	mg_u_long &GetWordStart (mg_u_long wordNum)
141	{ return wordBitPtrs[wordNum].start; }
142	mg_u_long &GetWordHere (mg_u_long wordNum)
143	{ return wordBitPtrs[wordNum].here; }
144	void CheckWordBufOverrun (mg_u_long wordNum)
145	{ CheckBufOverrun (wordNum); }
146
147	BitPtr &GetTagBitPtr (mg_u_long tagNum)
148	{ return wordBitPtrs[tagNum + numWords]; }
149	mg_u_long &GetTagStart (mg_u_long tagNum)
150	{ return wordBitPtrs[tagNum + numWords].start; }
151	mg_u_long &GetTagHere (mg_u_long tagNum)
152	{ return wordBitPtrs[tagNum + numWords].here; }
153	void CheckTagBufOverrun (mg_u_long tagNum)
154	{ CheckBufOverrun (tagNum + numWords); }
155
156	BitPtr &GetEndBitPtr ()
157	{ return wordBitPtrs[size-1]; }
158	mg_u_long &GetEndStart ()
159	{ return wordBitPtrs[size-1].start; }
160	mg_u_long &GetEndHere ()
161	{ return wordBitPtrs[size-1].here; }
162	};
163
164
165	struct IP2TagInfo {
166	bool inTag;
167	mg_u_long startFrag;
168	mg_u_long tagNum;
169
170	IP2TagInfo () {
171	inTag = false;
172	startFrag = 0;
173	tagNum = 0;
174	}
175	};
176
177	// maps tags to tag information
178	typedef map<UCArray, IP2TagInfo, DictLTUCArray> TagMapDict;
179
180
181	// class to handle the translation of occurrence order
182	// to dictionary order for words and tags
183	class OccurToDictConverter {
184	protected:
185	mg_u_long pos;
186	mg_u_long val;
187	FILE *transFile;
188	random_bitio_buffer rbs;
189
190	mg_u_long wordDictSize;
191	mg_u_long tagDictSize;
192
193	void SeekStart ();
194	mg_u_long TranslateNum (mg_u_long num);
195
196	public:
197	OccurToDictConverter ();
198	~OccurToDictConverter ();
199
200	void Open (char *filename, mg_u_long _wordDictSize,
201	mg_u_long _tagDictSize);
202
203	// Close frees all allocated memory
204	void Close ();
205
206	mg_u_long TranslateWord (mg_u_long occurNum)
207	{ return TranslateNum (occurNum); }
208	mg_u_long TranslateTag (mg_u_long occurNum)
209	{ return TranslateNum (occurNum+wordDictSize); }
210	};
211
212
213	struct InvfStateRec {
214	mg_ullong start;
215	mg_ullong here;
216	mg_u_long lastFragNum;
217	mg_u_long B;
218
219	void Clear () {
220	start = here = 0;
221	lastFragNum = B = 0;
222	}
223	InvfStateRec () { Clear (); }
224	};
225
226
227	#define ISR_SIZE 1024
228
229	class InvfStateCache {
230	protected:
231	InvfStateRec recCache [ISR_SIZE];
232	mg_u_long startNum;
233
234	FILE *stateFile;
235
236	void ClearCache () {
237	unsigned int i = 0;
238	for (i=0; i<ISR_SIZE; ++i) recCache[i].Clear();
239	}
240
241	public:
242	InvfStateCache ();
243	~InvfStateCache ();
244
245	void Open (char *filename);
246	void Close ();
247
248	// previous references to state records may be
249	// invalidated calling GetRec
250	InvfStateRec &GetRec (mg_u_long num);
251	};
252
253
254	static invf_dict_header idh;
255	static WordBitPtrs bitPtrs;
256
257	static FILE *chunkFile = NULL;
258	static stdio_bitio_buffer chunkBuf;
259
260	static mg_u_long ivfMemBufSize = 0;
261	static char *ivfMemBuf = NULL;
262
263	// word and tag dictionaries. a map is used for the tag dictionary
264	// as it should never be very big (and the perfect hash function
265	// sometimes has trouble with small values).
266	static perf_hash_data *wordHashDict = NULL;
267	static TagMapDict tagMapDict;
268
269	// information about all the different levels
270	static FIvfLevel ivfLevel;
271
272	static OccurToDictConverter occurConvert;
273
274	// information about the state of the inverted file
275	static InvfStateCache invfState;
276
277	static char collectFilename[512];
278
279
280	void WordBitPtrs::Clear () {
281	numWords = 0;
282	numTags = 0;
283	size=0;
284	if (wordBitPtrs != NULL) delete [] wordBitPtrs;
285	wordBitPtrs = NULL;
286	}
287
288	WordBitPtrs::~WordBitPtrs () {
289	if (wordBitPtrs != NULL) delete [] wordBitPtrs;
290	}
291
292	void WordBitPtrs::SetSize (mg_u_long _numWords,
293	mg_u_long _numTags){
294	Clear();
295	numWords = _numWords;
296	numTags = _numTags;
297	size = numWords + numTags + 1;
298	wordBitPtrs = new BitPtr [size];
299	}
300
301
302	void OccurToDictConverter::SeekStart () {
303	if (transFile == NULL) return;
304	rbs.SEEK_X (sizeof (mg_u_long) * 8);
305	pos = 0;
306	}
307
308	mg_u_long OccurToDictConverter::TranslateNum (mg_u_long num) {
309	if (num < pos) SeekStart ();
310	while (pos <= num) {
311	if (pos < wordDictSize)
312	val = rbs.binary_decode (wordDictSize + 1, NULL) - 1;
313	else
314	val = rbs.binary_decode (tagDictSize + 1, NULL) - 1;
315	++pos;
316	}
317	return val;
318	}
319
320	OccurToDictConverter::OccurToDictConverter () {
321	pos = 0;
322	val = 0;
323	transFile = NULL;
324	wordDictSize = 0;
325	tagDictSize = 0;
326	}
327
328	OccurToDictConverter::~OccurToDictConverter () {
329	if (transFile != NULL) Close ();
330	}
331
332	void OccurToDictConverter::Open (char *filename, mg_u_long _wordDictSize,
333	mg_u_long _tagDictSize) {
334	if (transFile != NULL) Close ();
335
336	wordDictSize = _wordDictSize;
337	tagDictSize = _tagDictSize;
338
339	transFile = open_file (filename, INVF_CHUNK_TRANS_SUFFIX, "rb",
340	MAGIC_CHUNK_TRANS, MG_ABORT);
341	rbs.attachFile (transFile, RND_BUF_SIZE);
342	SeekStart ();
343	val = 0;
344	}
345
346	void OccurToDictConverter::Close () {
347	if (transFile == NULL) return;
348
349	rbs.done ();
350	fclose (transFile);
351	transFile = NULL;
352	pos = 0;
353	val = 0;
354
355	wordDictSize = 0;
356	tagDictSize = 0;
357	}
358
359
360
361
362	InvfStateCache::InvfStateCache () {
363	startNum = 0;
364	stateFile = NULL;
365	}
366
367	InvfStateCache::~InvfStateCache () {
368	if (stateFile != NULL) Close ();
369	}
370
371	void InvfStateCache::Open (char *filename) {
372	if (stateFile != NULL) Close();
373
374	// open the state file
375	char path[512];
376	sprintf (path, FILE_NAME_FORMAT ".%d", get_basepath (), filename,
377	".invf.state", (mg_s_long) getpid ());
378	if (!(stateFile = fopen (path, "wb+"))) {
379	Message ("Unable to create \"%s\"", path);
380	exit (1);
381	}
382	unlink (path); // file will be deleted after it is closed
383	// reset the buffer
384	startNum = 0;
385	ClearCache();
386	}
387
388	void InvfStateCache::Close () {
389	if (stateFile == NULL) return;
390	fclose (stateFile);
391	stateFile = NULL;
392	startNum = 0;
393	}
394
395	InvfStateRec &InvfStateCache::GetRec (mg_u_long num) {
396	// see if cached
397	if ((num >= startNum) && (num < startNum + ISR_SIZE))
398	return recCache[num-startNum];
399
400	// not cached, write out this lot of records and read in
401	fseek (stateFile, startNum*sizeof (InvfStateRec), SEEK_SET);
402	fwrite ((char *) recCache, sizeof (InvfStateRec), ISR_SIZE, stateFile);
403
404	// read in the new set of records
405	ClearCache ();
406	startNum = num - (num % ISR_SIZE);
407	fseek (stateFile, startNum*sizeof (InvfStateRec), SEEK_SET);
408	size_t numbytes = fread ((char *) recCache, sizeof (InvfStateRec), ISR_SIZE, stateFile);
409	if (numbytes != ISR_SIZE){
410	#ifdef DEBUG
411	fprintf(stderr, "[mgpp/text/ivf.pass2.cpp L410] number of bytes read by fread does not match the requested amount\n");
412	#endif
413	}
414
415	return recCache[num-startNum];
416	}
417
418
419
420	static void ClearCharBuf (char *buf, mg_u_long size) {
421	char *end = buf + size;
422	while (buf != end) *buf++ = 0;
423	}
424
425	static void ReadWordDict (char *filename) {
426	// read in the perfect hash function for the word dictionary
427	FILE *wordHashFile = open_file (filename, INVF_DICT_HASH_SUFFIX, "rb",
428	MAGIC_HASH, MG_ABORT);
429	if (!(wordHashDict = read_perf_hash_data (wordHashFile))) {
430	FatalError (1, "Unable to read in hash data for word dictionary");
431	}
432	fclose (wordHashFile);
433	}
434
435	static void ReadTagDict (char *filename, invf_dict_header &_idh) {
436	// open the file
437	FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
438	MAGIC_STEM_BUILD, MG_ABORT);
439
440	// seek to the start of the tag dictionary
441	fseek (dictFile, _idh.tag_dict_start, SEEK_SET);
442
443	mg_u_long tagNum;
444	dict_el thisEl;
445	for (tagNum = 0; tagNum < _idh.tag_dict_size; ++tagNum) {
446	thisEl.Read (dictFile);
447	tagMapDict[thisEl.el].tagNum = tagNum;
448	}
449
450	fclose (dictFile);
451	}
452
453	static void ReadLevelFile (char *filename) {
454	FILE *f;
455	f = open_file (filename, INVF_LEVEL_SUFFIX, "rb",
456	MAGIC_INVF_LEVELS, MG_ABORT);
457	ivfLevel.Read (f);
458	fclose (f);
459	}
460
461	void CheckIntOverflow (mg_ullong totalIBits, mg_ullong lastTotalIBits) {
462	if (totalIBits < lastTotalIBits) {
463	fprintf(stderr, "ERROR: The totalIBits counter (%lu byte unsigned integer) has overflowed.\n", sizeof (mg_ullong));
464	if (sizeof (mg_ullong) < 8) {
465	fprintf(stderr, " Try compiling with GCC to enable use of 8 bytes for this counter.\n");
466	}
467	fprintf(stderr, " Build aborted.\n");
468	exit(1);
469	}
470	}
471
472	// assumes the inverted file state file has been opened
473	static void InitInvfState (char *filename,
474	invf_dict_header &_idh,
475	InvfStateCache &_invfState,
476	mg_ullong &totalIBits,
477	bool wordLevelIndex) {
478	// read in the dictionary, setting inverted state information
479	FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
480	MAGIC_STEM_BUILD, MG_ABORT);
481
482	// seek to the start of the word dictionary
483	fseek (dictFile, _idh.word_dict_start, SEEK_SET);
484
485	// add the word entries
486	word_dict_el wordEl;
487	wordEl.SetNumLevels (_idh.num_levels);
488	mg_u_long dictWordNum, p;
489	mg_ullong lastTotalIBits;
490	mg_u_long N = _idh.num_frags;
491	for (dictWordNum=0; dictWordNum<_idh.word_dict_size; ++dictWordNum) {
492	// lastTotalIBits is used to detect integer overflow
493	lastTotalIBits = totalIBits;
494
495	// read the next word and associated information
496	wordEl.Read (dictFile, _idh.num_levels);
497
498	// update the state record
499	p = wordEl.frag_occur;
500	InvfStateRec &wisr = _invfState.GetRec (dictWordNum);
501	wisr.start = totalIBits;
502	wisr.here = totalIBits;
503	wisr.B = BIO_Bblock_Init (N, p);
504
505	// add the length of the fragment numbers
506	totalIBits += BIO_Bblock_Bound_b (N, p, wisr.B);
507
508	// if needed, add the length of the fragment frequency information
509	if (!wordLevelIndex)
510	totalIBits += BIO_Gamma_Bound (wordEl.freq, wordEl.frag_occur);
511
512	// align next byte
513	#ifdef USE_LONG_LONG
514	totalIBits = (totalIBits + 7ull) & 0xfffffffffffffff8ull;
515	#else
516	totalIBits = (totalIBits + 7ul) & 0xfffffff8ul;
517	#endif
518
519	CheckIntOverflow (totalIBits, lastTotalIBits);
520	}
521
522
523	// seek to the start of the tag dictionary
524	fseek (dictFile, _idh.tag_dict_start, SEEK_SET);
525
526	// add the tag entries
527	dict_el tagEl;
528	mg_u_long dictTagNum;
529	N = _idh.num_frags;
530	for (dictTagNum=0; dictTagNum<_idh.tag_dict_size; ++dictTagNum) {
531	// lastTotalIBits is used to detect integer overflow
532	lastTotalIBits = totalIBits;
533
534	// read the next tag and associated information
535	tagEl.Read (dictFile);
536
537	// update the state record
538	p = tagEl.frag_occur*2;
539	InvfStateRec &tisr = _invfState.GetRec (dictTagNum + _idh.word_dict_size);
540	tisr.start = totalIBits;
541	tisr.here = totalIBits;
542	tisr.B = BIO_Bblock_Init (N+p, p);
543
544	// add the length of the fragment numbers (two numbers for each
545	// tag, one for start and one for end)
546	totalIBits += BIO_Bblock_Bound_b (N+p, p, tisr.B);
547
548	// align next byte
549	#ifdef USE_LONG_LONG
550	totalIBits = (totalIBits + 7ull) & 0xfffffffffffffff8ull;
551	#else
552	totalIBits = (totalIBits + 7ul) & 0xfffffff8ul;
553	#endif
554
555	CheckIntOverflow (totalIBits, lastTotalIBits);
556	}
557
558	fclose (dictFile);
559	}
560
561	/*
562	// assumes the chunk tag information has been placed in .first
563	static void PrintChunkInfo (mg_u_long chunkMem,
564	mg_u_long numChunkWords,
565	mg_u_long numChunkTags) {
566	static mg_u_long chunksRead = 0;
567	++chunksRead;
568	cout << "Chunk Number: " << chunksRead << "\n";
569	cout << "numChunkDocs " << numDocsInChunk << "\n";
570	cout << "numChunkFrags " << numFragsInChunk << "\n";
571	cout << "mem " << chunkMem << "\n";
572	cout << "numWords " << numChunkWords << "\n";
573	cout << "numTags " << numChunkTags << "\n\n";
574
575	TagMapDict::iterator tagMapHere = tagMapDict.begin();
576	TagMapDict::iterator tagMapEnd = tagMapDict.end();
577	while (tagMapHere != tagMapEnd) {
578	mg_u_long tagMapNum = (*tagMapHere).second.tagNum;
579	cout << (*tagMapHere).first << " " << tagMapNum << " "
580	<< bitPtrs.GetTagBitPtr(tagMapNum).here << "\n";
581	++tagMapHere;
582	}
583	}
584	*/
585
586	void ReadChunk (invf_dict_header &_idh, bool wordLevelIndex) {
587	// reset globals
588	numChunkDocs = 0;
589	chunkStartFragNum = numFrags;
590
591	// read in information about this chunk
592	numDocsInChunk = chunkBuf.gamma_decode (NULL) - 1;
593	if (numDocsInChunk == 0)
594	FatalError (1, "The number of docs in the current chunk is 0");
595
596	numFragsInChunk = chunkBuf.gamma_decode (NULL) - 1;
597	mg_u_long chunkMem = chunkBuf.gamma_decode (NULL) - 1;
598
599	if (chunkMem > ivfMemBufSize)
600	FatalError (1, "Chunk memory size is greater than maximum");
601
602	mg_u_long numChunkWords = chunkBuf.gamma_decode (NULL) - 1;
603	mg_u_long numChunkTags = chunkBuf.gamma_decode (NULL) - 1;
604
605
606	// reset stuff
607	ClearCharBuf (ivfMemBuf, ivfMemBufSize);
608	bitPtrs.ResetPtrs();
609
610	// read in the entries in occurrence order storing the
611	// "chunkWordCount" in "start" and the "chunkFragCount"
612	// in "here"
613	mg_u_long numOccur;
614	mg_u_long wordNum;
615	for (numOccur=0; numOccur<numChunkWords; ++numOccur) {
616	wordNum = occurConvert.TranslateWord (numOccur);
617	BitPtr &wordPtr = bitPtrs.GetWordBitPtr (wordNum);
618	wordPtr.start = chunkBuf.gamma_decode (NULL) - 1;
619	if (wordPtr.start >= 2)
620	wordPtr.here = chunkBuf.gamma_decode (NULL);
621	else wordPtr.here = wordPtr.start;
622	}
623	mg_u_long tagNum;
624	for (numOccur=0; numOccur<numChunkTags; ++numOccur) {
625	tagNum = occurConvert.TranslateTag (numOccur);
626	BitPtr &tagPtr = bitPtrs.GetTagBitPtr (tagNum);
627	// only chunkFragCount is encoded for tags
628	tagPtr.start = chunkBuf.gamma_decode (NULL) - 1;
629	tagPtr.here = tagPtr.start;
630	}
631
632	/* PrintChunkInfo (chunkMem, numChunkWords, numChunkTags);*/
633
634	// create the bit ptrs in dictionary order
635	mg_u_long totalIBits = 0; // only dealing with memory
636	mg_u_long chunkWordCount, chunkFragCount;
637	for (wordNum=0; wordNum<_idh.word_dict_size; ++wordNum) {
638	BitPtr &wordPtr = bitPtrs.GetWordBitPtr (wordNum);
639	chunkWordCount = wordPtr.start;
640	chunkFragCount = wordPtr.here;
641	wordPtr.start = totalIBits;
642	wordPtr.here = totalIBits;
643	wordPtr.lastFragNum = chunkStartFragNum;
644	wordPtr.lgB = 0;
645	if (chunkWordCount > 0) {
646	wordPtr.lgB = floorlog_2 (BIO_Bblock_Init_W (numFragsInChunk,
647	chunkFragCount));
648	totalIBits += BIO_Bblock_Bound (numFragsInChunk, chunkFragCount);
649	// use unary encoding for memory buffer encoding of fragment freq
650	if (!wordLevelIndex) {
651	totalIBits += chunkWordCount;
652	}
653	}
654	}
655	for (tagNum=0; tagNum<_idh.tag_dict_size; ++tagNum) {
656	BitPtr &tagPtr = bitPtrs.GetTagBitPtr (tagNum);
657	chunkFragCount = tagPtr.here;
658	tagPtr.start = totalIBits;
659	tagPtr.here = totalIBits;
660	tagPtr.lastFragNum = chunkStartFragNum;
661	tagPtr.lgB = 0;
662	if (chunkFragCount > 0) {
663	mg_u_long pTag = chunkFragCount*2;
664	tagPtr.lgB = floorlog_2 (BIO_Bblock_Init_W (numFragsInChunk+pTag,
665	pTag));
666	mg_u_long bLen = BIO_Bblock_Bound (numFragsInChunk+pTag,
667	pTag);
668	// cout << tagNum + _idh.word_dict_size << " ";
669	// cout << "numFrags: " << numFragsInChunk
670	// << " chunkFragCount: " << chunkFragCount
671	// << " B: " << 1 << tagPtr.lgB
672	// << " blen: " << blen << "\n";
673	totalIBits += bLen;
674	}
675	}
676	bitPtrs.GetEndStart() = totalIBits;
677	bitPtrs.GetEndHere() = totalIBits;
678
679	if ((totalIBits + 7ul) >> 3ul > chunkMem) {
680	cerr << "totalIBits: " << totalIBits << "\n";
681	cerr << "bytes: " << ((totalIBits + 7ul) >> 3ul) << "\n";
682	cerr << "chunkMem: " << chunkMem << "\n";
683	FatalError (1, "Pointers exceed buffer size");
684	}
685	}
686
687
688
689
690	int init_ivf_2 (const TagInfo &/tagInfo/, char *filename) {
691	// read in compressed dictionary header
692	FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
693	MAGIC_STEM_BUILD, MG_ABORT);
694	idh.Read (dictFile);
695	fclose (dictFile);
696
697	// set the size of the bit ptrs
698	bitPtrs.SetSize (idh.word_dict_size, idh.tag_dict_size);
699
700	// open the chunk file and read in the maximum memory needed
701	// for the inverted memory buffer
702	chunkFile = open_file (filename, INVF_CHUNK_SUFFIX, "rb",
703	MAGIC_CHUNK, MG_ABORT);
704	ReadUL (chunkFile, ivfMemBufSize);
705	chunkBuf.attachFile (chunkFile);
706
707	// allocate memory for the inverted buffer
708	ivfMemBuf = new char [ivfMemBufSize];
709	ClearCharBuf (ivfMemBuf, ivfMemBufSize);
710
711	// read in the word dictionary
712	ReadWordDict (filename);
713
714	// read in the tag dictionary
715	ReadTagDict (filename, idh);
716
717	// read in the level information
718	ReadLevelFile (filename);
719	bool wordLevelIndex = ivfLevel.indexLevel.empty();
720
721	// set up the translation table file
722	occurConvert.Open (filename, idh.word_dict_size, idh.tag_dict_size);
723
724	// reset some globals
725	numDocs = 0;
726	numChunkDocs = 0;
727	numDocsInChunk = 0;
728	numFrags = 0;
729	numFragsInChunk = 0;
730	chunkStartFragNum = 0;
731
732	strcpy (collectFilename, filename);
733
734
735	// create the inverted file
736	mg_ullong totalIBits = 0;
737	FILE *invfFile = create_file (filename, INVF_SUFFIX, "wb",
738	MAGIC_INVF, MG_ABORT);
739	totalIBits += sizeof (mg_u_long) * 8; // magic number
740	totalIBits += 8 * 200; // 200 byte gap -- why??????
741	fclose (invfFile);
742
743	// init the inverted file state cache
744	invfState.Open (filename);
745	InitInvfState (filename, idh, invfState, totalIBits, wordLevelIndex);
746
747	return COMPALLOK;
748	}
749
750	static void CloseTextTag (IP2TagInfo &tInfo, const UCArray &/tagName/) {
751	if (!tInfo.inTag) return;
752
753	// add this tag to the inverted list
754	BitPtr &tagBitPtr = bitPtrs.GetTagBitPtr (tInfo.tagNum);
755	mg_u_long endFrag = numFrags;
756	int b = 1 << tagBitPtr.lgB;
757
758	/*
759	cout << (tInfo.tagNum+idh.word_dict_size) << " \"<" << tagName << ">\" "
760	<< tInfo.startFrag << " " << endFrag << "\n";
761	*/
762
763	mems_bitio_buffer buffer ((u_char *) ivfMemBuf, tagBitPtr.here);
764	buffer.bblock_encode (tInfo.startFrag - tagBitPtr.lastFragNum + 1,
765	b, NULL);
766	buffer.bblock_encode (endFrag - tInfo.startFrag + 1, b, NULL);
767	tagBitPtr.lastFragNum = endFrag;
768	tagBitPtr.here = buffer.position();
769	buffer.encodeDone();
770
771	// check for buffer overrun
772	bitPtrs.CheckTagBufOverrun (tInfo.tagNum);
773
774	// reset information about this tag
775	tInfo.inTag = false;
776	tInfo.startFrag = 0;
777	}
778
779	static void ProcessOpenTag (const TextEl &el, bool &inFrag) {
780	// close tag if open
781	IP2TagInfo &tInfo = tagMapDict[el.tagName];
782	if (tInfo.inTag) CloseTextTag (tInfo, el.tagName);
783
784	// open this tag
785	tInfo.inTag = true;
786	tInfo.startFrag = numFrags;
787
788	// check for start of next fragment
789	bool wordLevelIndex = ivfLevel.indexLevel.empty();
790	if (!wordLevelIndex && el.tagName == ivfLevel.indexLevel) {
791	++numFrags;
792	inFrag = true;
793	}
794	}
795
796	static void ProcessCloseTag (const TextEl &el, bool &inFrag) {
797	// check for end of fragment
798	bool wordLevelIndex = ivfLevel.indexLevel.empty();
799	if (!wordLevelIndex && el.tagName == ivfLevel.indexLevel) {
800	inFrag = false;
801	}
802
803	IP2TagInfo &tInfo = tagMapDict[el.tagName];
804	CloseTextTag (tInfo, el.tagName);
805	}
806
807	static void ProcessText (const TextEl &el, bool &inFrag) {
808	// make sure this text is to be indexed
809	bool wordLevelIndex = ivfLevel.indexLevel.empty();
810	if (!wordLevelIndex && !inFrag) return;
811
812	const unsigned char *textHere = &(el.text[0]);
813	const unsigned char *textEnd = &(el.text[el.text.size() - 1]);
814	unsigned char mgWord[MAXSTEMLEN + 1];
815
816	if (!inaword_mgpp (textHere, textEnd))
817	ParseNonindexWord (textHere, textEnd);
818
819
820	// Alternately parse off words and non-words from the input
821
822	while (textHere <= textEnd) {
823	textHere = ParseIndexMGWord (textHere, textEnd, mgWord);
824	textHere = ParseNonindexWord (textHere, textEnd);
825
826	if (mgWord[0] > 0) {
827	if (wordLevelIndex) ++numFrags;
828
829	mg_u_long wordNum = perf_hash (wordHashDict, mgWord);
830
831	/*
832	cout << wordNum << " \"";
833	cout.write (mgWord+1, *mgWord);
834	cout << "\" " << numFrags << "\n";
835	*/
836
837	// add this word to the inverted list
838	BitPtr &wordBitPtr = bitPtrs.GetWordBitPtr (wordNum);
839	mg_u_long fragNum = numFrags;
840	int b = 1 << wordBitPtr.lgB;
841
842	mems_bitio_buffer buffer ((u_char *) ivfMemBuf, wordBitPtr.here);
843
844	// note: this assumes that fragments don't carry over between
845	// chunks (which they don't because all tags are closed at the
846	// end of each document and chunks are based on document
847	// boundaries), i.e. the first fragment number must be greater
848	// than the starting fragment number of the chunk.
849	if (fragNum > wordBitPtr.lastFragNum) {
850	buffer.bblock_encode ((fragNum - wordBitPtr.lastFragNum - 1) + 1,
851	b, NULL);
852	if (!wordLevelIndex) buffer.encodeBit (1); // freq = 1
853
854	} else if (!wordLevelIndex) {
855	// add one to the frequency count for this word
856	buffer.seek (buffer.position()-1);
857	buffer.encodeBit (0); // unary encoding -- last = 1
858	buffer.encodeBit (1);
859	}
860
861	wordBitPtr.lastFragNum = fragNum;
862	wordBitPtr.here = buffer.position();
863	buffer.encodeDone();
864
865	// check for buffer overrun
866	bitPtrs.CheckWordBufOverrun (wordNum);
867	}
868	}
869	}
870
871	// combine the in memory inverted buffer with the disk
872	// based inverted file
873	static void DiskMerge (char *filename) {
874	bool wordLevelIndex = ivfLevel.indexLevel.empty();
875
876	// make sure we have something to process
877	if (numChunkDocs <= 0) return;
878
879	// open the inverted file
880	FILE *invfFile = open_file (filename, INVF_SUFFIX, "rb+",
881	MAGIC_INVF, MG_ABORT);
882	random_bitio_buffer invfOutBuf (invfFile);
883
884	// set up to decode the entries in memory
885	mems_bitio_buffer memInBuf ((u_char *) ivfMemBuf, 0);
886
887	// write out the word information
888	mg_u_long wordNum;
889	int b;
890	mg_u_long currFragNum;
891	mg_u_long delta;
892	mg_u_long currFreq;
893	for (wordNum=0; wordNum<idh.word_dict_size; ++wordNum) {
894	// go to the end of the last inverted file entry
895	InvfStateRec &wordDiskState = invfState.GetRec (wordNum);
896	invfOutBuf.SEEK_X (wordDiskState.here);
897
898	// go to the start of the inverted chunk info in memory
899	BitPtr &wordBitPtr = bitPtrs.GetWordBitPtr (wordNum);
900	memInBuf.seek(wordBitPtr.start);
901
902	// decode each entry and re-write to disk
903	currFragNum = chunkStartFragNum;
904	while (memInBuf.position() < wordBitPtr.here) {
905	// decode word entry
906	b = 1 << wordBitPtr.lgB;
907	delta = memInBuf.bblock_decode (b, NULL);
908	currFragNum += delta;
909	if (!wordLevelIndex) currFreq = memInBuf.unary_decode (NULL);
910	else currFreq = 1;
911
912	// recode on disk
913	invfOutBuf.bblock_encode (currFragNum-wordDiskState.lastFragNum,
914	wordDiskState.B, NULL);
915	if (!wordLevelIndex) invfOutBuf.gamma_encode (currFreq, NULL);
916	wordDiskState.lastFragNum = currFragNum;
917	}
918
919	wordDiskState.here = invfOutBuf.TELL_X();
920	}
921
922	// write out the tag information
923	mg_u_long tagNum;
924	mg_u_long currTagStart;
925	mg_u_long currTagEnd;
926	for (tagNum=0; tagNum<idh.tag_dict_size; ++tagNum) {
927	// go to the end of the last inverted file entry
928	InvfStateRec &tagDiskState = invfState.GetRec (tagNum+idh.word_dict_size);
929	invfOutBuf.SEEK_X (tagDiskState.here);
930
931	// go to the start of the inverted chunk info in memory
932	BitPtr &tagBitPtr = bitPtrs.GetTagBitPtr (tagNum);
933	memInBuf.seek(tagBitPtr.start);
934
935	// decode each entry and re-write to disk
936	currTagEnd = chunkStartFragNum;
937	while (memInBuf.position() < tagBitPtr.here) {
938	// decode tag entry
939	b = 1 << tagBitPtr.lgB;
940	delta = memInBuf.bblock_decode (b, NULL) - 1;
941	currTagStart = currTagEnd + delta;
942	delta = memInBuf.bblock_decode (b, NULL) - 1;
943	currTagEnd = currTagStart + delta;
944
945	// recode on disk
946	invfOutBuf.bblock_encode (currTagStart-tagDiskState.lastFragNum+1,
947	tagDiskState.B, NULL);
948	invfOutBuf.bblock_encode (currTagEnd-currTagStart+1,
949	tagDiskState.B, NULL);
950
951	tagDiskState.lastFragNum = currTagEnd;
952	}
953
954	tagDiskState.here = invfOutBuf.TELL_X();
955	}
956
957	memInBuf.done();
958
959	invfOutBuf.encodeDone();
960	fclose (invfFile);
961	}
962
963
964	int process_ivf_2 (const TagInfo &/tagInfo/, const TextElArray &doc) {
965	bool wordLevelIndex = ivfLevel.indexLevel.empty();
966	bool inFrag = false;
967	if (wordLevelIndex) inFrag = true; // unconditional
968
969	// get next chunk information if need to. the chunk information
970	// is needed before the first document is processed
971	if (numChunkDocs >= numDocsInChunk) ReadChunk (idh, wordLevelIndex);
972
973	// process each text element
974	TextElArray::const_iterator here = doc.begin();
975	TextElArray::const_iterator end = doc.end();
976	while (here != end) {
977	// process this element
978	if ((here).elType == OpenTagE) ProcessOpenTag (here, inFrag);
979	else if ((here).elType == CloseTagE) ProcessCloseTag (here, inFrag);
980	else ProcessText (*here, inFrag);
981
982	++here;
983	}
984
985	// close off any unclosed tags
986	TagMapDict::iterator tdHere = tagMapDict.begin();
987	TagMapDict::iterator tdEnd = tagMapDict.end();
988	while (tdHere != tdEnd) {
989	CloseTextTag ((tdHere).second, (tdHere).first);
990	++tdHere;
991	}
992
993	// we've processed one more document
994	++numDocs;
995	++numChunkDocs;
996
997	// merge the memory based inverted file with the one on
998	// disk if this is the end of this chunk
999	if (numChunkDocs >= numDocsInChunk) DiskMerge (collectFilename);
1000
1001	return COMPALLOK;
1002	}
1003
1004
1005	static void CondenseInvfFile (char *filename, mg_u_long &bytesOutput) {
1006	FILE *inInvfFile = open_file (filename, INVF_SUFFIX, "rb",
1007	MAGIC_INVF, MG_ABORT);
1008	FILE *outInvfFile = open_file (filename, INVF_SUFFIX, "rb+",
1009	MAGIC_INVF, MG_ABORT);
1010
1011	// skip the magic number
1012	fseek (outInvfFile, sizeof (mg_u_long), SEEK_SET);
1013
1014	// write the inverted file header -- use defaults for most things
1015	invf_file_header ifh;
1016	ifh.no_of_words = idh.word_dict_size;
1017	ifh.no_of_tags = idh.tag_dict_size;
1018	ifh.word_level_index = (ivfLevel.indexLevel.empty()) ? 1 : 0;
1019	ifh.Write (outInvfFile);
1020
1021	bytesOutput = ftell (outInvfFile);
1022
1023	// process each meaningful byte in the file
1024	mg_u_long numEntries = ifh.no_of_words + ifh.no_of_tags;
1025	mg_u_long entryNum;
1026	mg_ullong lastStart = 0;
1027	for (entryNum = 0; entryNum < numEntries; ++entryNum) {
1028	InvfStateRec &stateRec = invfState.GetRec (entryNum);
1029
1030	// overrun check
1031	if (stateRec.start < lastStart)
1032	FatalError (1, "Inverted file Buffer overrun");
1033	lastStart = stateRec.start;
1034
1035	mg_u_long oldEntryStart = stateRec.start >> 3;
1036	mg_u_long oldEntryStartOver = stateRec.start & 7; // should be 0
1037	mg_u_long oldEntryEnd = (stateRec.here + 7) >> 3; // byte after end
1038	mg_u_long oldEntryEndOver = stateRec.here & 7;
1039
1040	fseek (inInvfFile, oldEntryStart, SEEK_SET);
1041
1042	stateRec.here -= stateRec.start;
1043	stateRec.start = (mg_ullong)bytesOutput * 8 + oldEntryStartOver;
1044	stateRec.here += stateRec.start;
1045	while (oldEntryStart < oldEntryEnd) {
1046	unsigned char c = getc (inInvfFile);
1047	if (oldEntryStart == oldEntryEnd - 1) {
1048	u_char ands[8] =
1049	{0xff, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe};
1050	c &= ands[oldEntryEndOver];
1051	}
1052	putc (c, outInvfFile);
1053	++bytesOutput;
1054	++oldEntryStart;
1055	}
1056	}
1057
1058	fclose (inInvfFile);
1059
1060	#ifdef __WIN32__
1061	if (_chsize (_fileno (outInvfFile), bytesOutput) != 0)
1062	Message ("Could not truncate invf.");
1063	#else
1064	if(ftruncate (fileno (outInvfFile), bytesOutput) != 0)
1065	{
1066	fprintf(stderr, "Could not truncate invf");
1067	}
1068	#endif
1069
1070	fclose (outInvfFile);
1071	}
1072
1073	static void OutputInvfIdx (char *filename, mg_u_long invfNumBytes) {
1074	FILE *invfIdxFile = create_file (filename, INVF_IDX_SUFFIX, "wb",
1075	MAGIC_INVI, MG_ABORT);
1076
1077	// process each meaningful byte in the file
1078	mg_u_long numEntries = idh.word_dict_size + idh.tag_dict_size;
1079	mg_u_long entryNum;
1080	for (entryNum = 0; entryNum < numEntries; ++entryNum) {
1081	InvfStateRec &stateRec = invfState.GetRec (entryNum);
1082
1083	// assumes that inverted entries start at beginning of each byte
1084	if (!WriteUL (invfIdxFile, (stateRec.start >> 3))) break;
1085	}
1086
1087	WriteUL (invfIdxFile, invfNumBytes);
1088
1089	fclose (invfIdxFile);
1090	}
1091
1092
1093	int done_ivf_2 (const TagInfo &/tagInfo/, char *filename) {
1094	// close most open files
1095	if (chunkFile != NULL) {
1096	chunkBuf.done();
1097	fclose (chunkFile);
1098	chunkFile = NULL;
1099	}
1100	occurConvert.Close();
1101
1102	// free allocated memory
1103	bitPtrs.Clear();
1104	if (ivfMemBuf != NULL) { delete [] ivfMemBuf; ivfMemBuf = NULL; }
1105	free_perf_hash (wordHashDict);
1106	wordHashDict = NULL;
1107	tagMapDict.erase (tagMapDict.begin(), tagMapDict.end());
1108
1109	// condense the inverted file and truncate it
1110	// this function also writes out the inverted header
1111	mg_u_long invfNumBytes = 0;
1112	CondenseInvfFile (filename, invfNumBytes);
1113
1114	OutputInvfIdx (filename, invfNumBytes);
1115
1116	// close the rest of the open files
1117	invfState.Close ();
1118
1119	return COMPALLOK;
1120	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/trunk/greenstone2/common-src/indexers/mgpp/text/ivf.pass2.cpp@ 25147

Download in other formats: