Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: trunk/gsdl/src/mgpp/text/ivf.pass2.cpp@ 879

Last change on this file since 879 was 856, checked in by sjboddie, 24 years ago
Rodgers new C++ mg
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 31.2 KB

Line
1	/**************************************************************************
2	*
3	* ivf.pass2.cpp -- Memory efficient pass 2 inversion
4	* Copyright (C) 1999 Rodger McNab
5	*
6	* This program is free software; you can redistribute it and/or modify
7	* it under the terms of the GNU General Public License as published by
8	* the Free Software Foundation; either version 2 of the License, or
9	* (at your option) any later version.
10	*
11	* This program is distributed in the hope that it will be useful,
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	* GNU General Public License for more details.
15	*
16	* You should have received a copy of the GNU General Public License
17	* along with this program; if not, write to the Free Software
18	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19	*
20	* $Id: ivf.pass2.cpp 856 2000-01-14 02:26:25Z sjboddie $
21	*
22	**************************************************************************/
23
24	/*
25	$Log$
26	Revision 1.1 2000/01/14 02:26:07 sjboddie
27	Rodgers new C++ mg
28
29	*/
30
31	#include <stdio.h>
32	#include <unistd.h>
33
34	#include "non_ansi.h"
35
36	#include "sysfuncs.h"
37
38	#include "mg_files.h"
39	#include "invf.h"
40	#include "mg.h"
41	#include "build.h"
42	#include "locallib.h"
43	#include "UCArray.h"
44	#include "bitio_m_random.h"
45	#include "bitio_m_stdio.h"
46	#include "bitio_m_mems.h"
47	#include "bitio_gen.h"
48	#include <stdio.h>
49	#include "words.h"
50	#include "messages.h"
51	#include "netorder.h"
52	#include "FIvfLevelInfo.h"
53	#include "perf_hash.h"
54	#include "string.h"
55
56	#include "longlong.h"
57
58	#ifdef __WIN32_
59	#include <io.h>
60	#endif
61
62	#if defined(GSDL_USE_OBJECTSPACE)
63	# include <ospace\std\map>
64	#elif defined(GSDL_USE_STL_H)
65	# include <map.h>
66	#else
67	# include <map>
68	#endif
69
70
71	#ifdef USE_LONG_LONG
72	#define SEEK_X seek_LL
73	#define TELL_X tell_LL
74	#else
75	#define SEEK_X seek
76	#define TELL_X tell
77	#endif
78
79	#ifndef RND_BUF_SIZE
80	#define RND_BUF_SIZE 8*1024
81	#endif
82
83
84	static unsigned long numDocs = 0;
85	static unsigned long numChunkDocs = 0;
86	static unsigned long numDocsInChunk = 0;
87
88	static unsigned long numFrags = 0;
89	static unsigned long numFragsInChunk = 0;
90	static unsigned long chunkStartFragNum = 0;
91
92
93
94
95	struct BitPtr {
96	unsigned long start;
97	unsigned long here;
98	unsigned long lastFragNum;
99	unsigned long lgB;
100
101	void Clear () { start = here = lastFragNum = lgB = 0; }
102	BitPtr () { Clear(); }
103	};
104
105	class WordBitPtrs {
106	protected:
107	unsigned long numWords;
108	unsigned long numTags;
109	unsigned long size;
110	BitPtr *wordBitPtrs;
111
112	void CheckBufOverrun (unsigned long num) {
113	if (wordBitPtrs[num].here > wordBitPtrs[num+1].start) {
114	cerr << "numDocs: " << numDocs << "\n";
115	cerr << "numChunkDocs: " << numChunkDocs << "\n";
116	cerr << "numDocsInChunk: " << numDocsInChunk << "\n";
117	cerr << "numFrags: " << numFrags << "\n";
118	cerr << "numFragsInChunk: " << numFragsInChunk << "\n";
119	cerr << "chunkStartFragNum: " << chunkStartFragNum << "\n";
120	cerr << "num: " << num << "\n";
121	cerr << "[num].start: " << wordBitPtrs[num].start << "\n";
122	cerr << "[num].here: " << wordBitPtrs[num].here << "\n";
123	cerr << "[num+1].start: " << wordBitPtrs[num+1].start << "\n";
124	FatalError (1, "Bit buffer overrun");
125	}
126	}
127
128	public:
129	void Clear ();
130	WordBitPtrs () { wordBitPtrs = NULL; Clear(); }
131	~WordBitPtrs ();
132	void SetSize (unsigned long _numWords,
133	unsigned long _numTags);
134
135	void ResetPtrs () {
136	if (wordBitPtrs == NULL) return;
137	unsigned long i;
138	for (i=0; i<size; i++) wordBitPtrs[i].Clear();
139	}
140
141	BitPtr &GetWordBitPtr (unsigned long wordNum)
142	{ return wordBitPtrs[wordNum]; }
143	unsigned long &GetWordStart (unsigned long wordNum)
144	{ return wordBitPtrs[wordNum].start; }
145	unsigned long &GetWordHere (unsigned long wordNum)
146	{ return wordBitPtrs[wordNum].here; }
147	void CheckWordBufOverrun (unsigned long wordNum)
148	{ CheckBufOverrun (wordNum); }
149
150	BitPtr &GetTagBitPtr (unsigned long tagNum)
151	{ return wordBitPtrs[tagNum + numWords]; }
152	unsigned long &GetTagStart (unsigned long tagNum)
153	{ return wordBitPtrs[tagNum + numWords].start; }
154	unsigned long &GetTagHere (unsigned long tagNum)
155	{ return wordBitPtrs[tagNum + numWords].here; }
156	void CheckTagBufOverrun (unsigned long tagNum)
157	{ CheckBufOverrun (tagNum + numWords); }
158
159	BitPtr &GetEndBitPtr ()
160	{ return wordBitPtrs[size-1]; }
161	unsigned long &GetEndStart ()
162	{ return wordBitPtrs[size-1].start; }
163	unsigned long &GetEndHere ()
164	{ return wordBitPtrs[size-1].here; }
165	};
166
167
168	struct IP2TagInfo {
169	bool inTag;
170	unsigned long startFrag;
171	unsigned long tagNum;
172
173	IP2TagInfo () {
174	inTag = false;
175	startFrag = 0;
176	tagNum = 0;
177	}
178	};
179
180	// maps tags to tag information
181	typedef map<UCArray, IP2TagInfo, DictLTUCArray> TagMapDict;
182
183
184	// class to handle the translation of occurrence order
185	// to dictionary order for words and tags
186	class OccurToDictConverter {
187	protected:
188	unsigned long pos;
189	unsigned long val;
190	FILE *transFile;
191	random_bitio_buffer rbs;
192
193	unsigned long wordDictSize;
194	unsigned long tagDictSize;
195
196	void SeekStart ();
197	unsigned long TranslateNum (unsigned long num);
198
199	public:
200	OccurToDictConverter ();
201	~OccurToDictConverter ();
202
203	void Open (char *filename, unsigned long _wordDictSize,
204	unsigned long _tagDictSize);
205
206	// Close frees all allocated memory
207	void Close ();
208
209	unsigned long TranslateWord (unsigned long occurNum)
210	{ return TranslateNum (occurNum); }
211	unsigned long TranslateTag (unsigned long occurNum)
212	{ return TranslateNum (occurNum+wordDictSize); }
213	};
214
215
216	struct InvfStateRec {
217	mg_ullong start;
218	mg_ullong here;
219	unsigned long lastFragNum;
220	unsigned long B;
221
222	void Clear () {
223	start = here = 0;
224	lastFragNum = B = 0;
225	}
226	InvfStateRec () { Clear (); }
227	};
228
229
230	#define ISR_SIZE 1024
231
232	class InvfStateCache {
233	protected:
234	InvfStateRec recCache [ISR_SIZE];
235	unsigned long startNum;
236
237	FILE *stateFile;
238
239	void ClearCache () {
240	unsigned int i = 0;
241	for (i=0; i<ISR_SIZE; i++) recCache[i].Clear();
242	}
243
244	public:
245	InvfStateCache ();
246	~InvfStateCache ();
247
248	void Open (char *filename);
249	void Close ();
250
251	// previous references to state records may be
252	// invalidated calling GetRec
253	InvfStateRec &GetRec (unsigned long num);
254	};
255
256
257	static invf_dict_header idh;
258	static WordBitPtrs bitPtrs;
259
260	static FILE *chunkFile = NULL;
261	static stdio_bitio_buffer chunkBuf;
262
263	static unsigned long ivfMemBufSize = 0;
264	static char *ivfMemBuf = NULL;
265
266	// word and tag dictionaries. a map is used for the tag dictionary
267	// as it should never be very big (and the perfect hash function
268	// sometimes has trouble with small values).
269	static perf_hash_data *wordHashDict = NULL;
270	static TagMapDict tagMapDict;
271
272	// information about all the different levels
273	static FIvfLevel ivfLevel;
274
275	static OccurToDictConverter occurConvert;
276
277	// information about the state of the inverted file
278	static InvfStateCache invfState;
279
280	static char collectFilename[512];
281
282
283	void WordBitPtrs::Clear () {
284	numWords = 0;
285	numTags = 0;
286	size=0;
287	if (wordBitPtrs != NULL) delete [] wordBitPtrs;
288	wordBitPtrs = NULL;
289	}
290
291	WordBitPtrs::~WordBitPtrs () {
292	if (wordBitPtrs != NULL) delete [] wordBitPtrs;
293	}
294
295	void WordBitPtrs::SetSize (unsigned long _numWords,
296	unsigned long _numTags){
297	Clear();
298	numWords = _numWords;
299	numTags = _numTags;
300	size = numWords + numTags + 1;
301	wordBitPtrs = new BitPtr [size];
302	}
303
304
305	void OccurToDictConverter::SeekStart () {
306	if (transFile == NULL) return;
307	rbs.SEEK_X (sizeof (unsigned long) * 8);
308	pos = 0;
309	}
310
311	unsigned long OccurToDictConverter::TranslateNum (unsigned long num) {
312	if (num < pos) SeekStart ();
313	while (pos <= num) {
314	if (pos < wordDictSize)
315	val = rbs.binary_decode (wordDictSize + 1, NULL) - 1;
316	else
317	val = rbs.binary_decode (tagDictSize + 1, NULL) - 1;
318	pos++;
319	}
320	return val;
321	}
322
323	OccurToDictConverter::OccurToDictConverter () {
324	pos = 0;
325	val = 0;
326	transFile = NULL;
327	wordDictSize = 0;
328	tagDictSize = 0;
329	}
330
331	OccurToDictConverter::~OccurToDictConverter () {
332	if (transFile != NULL) Close ();
333	}
334
335	void OccurToDictConverter::Open (char *filename, unsigned long _wordDictSize,
336	unsigned long _tagDictSize) {
337	if (transFile != NULL) Close ();
338
339	wordDictSize = _wordDictSize;
340	tagDictSize = _tagDictSize;
341
342	transFile = open_file (filename, INVF_CHUNK_TRANS_SUFFIX, "rb",
343	MAGIC_CHUNK_TRANS, MG_ABORT);
344	rbs.attachFile (transFile, RND_BUF_SIZE);
345	SeekStart ();
346	val = 0;
347	}
348
349	void OccurToDictConverter::Close () {
350	if (transFile == NULL) return;
351
352	rbs.done ();
353	fclose (transFile);
354	transFile = NULL;
355	pos = 0;
356	val = 0;
357
358	wordDictSize = 0;
359	tagDictSize = 0;
360	}
361
362
363
364
365	InvfStateCache::InvfStateCache () {
366	startNum = 0;
367	stateFile = NULL;
368	}
369
370	InvfStateCache::~InvfStateCache () {
371	if (stateFile != NULL) Close ();
372	}
373
374	void InvfStateCache::Open (char *filename) {
375	if (stateFile != NULL) Close();
376
377	// open the state file
378	char path[512];
379	sprintf (path, FILE_NAME_FORMAT ".%ld", get_basepath (), filename,
380	".invf.state", (long) getpid ());
381	if (!(stateFile = fopen (path, "wb+"))) {
382	Message ("Unable to create \"%s\"", path);
383	exit (1);
384	}
385	unlink (path); // file will be deleted after it is closed
386
387	// reset the buffer
388	startNum = 0;
389	ClearCache();
390	}
391
392	void InvfStateCache::Close () {
393	if (stateFile == NULL) return;
394	fclose (stateFile);
395	stateFile = NULL;
396	startNum = 0;
397	}
398
399	InvfStateRec &InvfStateCache::GetRec (unsigned long num) {
400	// see if cached
401	if ((num >= startNum) && (num < startNum + ISR_SIZE))
402	return recCache[num-startNum];
403
404	// not cached, write out this lot of records and read in
405	fseek (stateFile, startNum*sizeof (InvfStateRec), SEEK_SET);
406	fwrite ((char *) recCache, sizeof (InvfStateRec), ISR_SIZE, stateFile);
407
408	// read in the new set of records
409	ClearCache ();
410	startNum = num - (num % ISR_SIZE);
411	fseek (stateFile, startNum*sizeof (InvfStateRec), SEEK_SET);
412	fread ((char *) recCache, sizeof (InvfStateRec), ISR_SIZE, stateFile);
413
414	return recCache[num-startNum];
415	}
416
417
418
419	static void ClearCharBuf (char *buf, unsigned long size) {
420	char *end = buf + size;
421	while (buf != end) *buf++ = 0;
422	}
423
424	static void ReadWordDict (char *filename) {
425	// read in the perfect hash function for the word dictionary
426	FILE *wordHashFile = open_file (filename, INVF_DICT_HASH_SUFFIX, "rb",
427	MAGIC_HASH, MG_ABORT);
428	if (!(wordHashDict = read_perf_hash_data (wordHashFile))) {
429	FatalError (1, "Unable to read in hash data for word dictionary");
430	}
431	fclose (wordHashFile);
432	}
433
434	static void ReadTagDict (char *filename, invf_dict_header &_idh) {
435	// open the file
436	FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
437	MAGIC_STEM_BUILD, MG_ABORT);
438
439	// seek to the start of the tag dictionary
440	fseek (dictFile, _idh.tag_dict_start, SEEK_SET);
441
442	unsigned long tagNum;
443	dict_el thisEl;
444	for (tagNum = 0; tagNum < _idh.tag_dict_size; tagNum++) {
445	thisEl.Read (dictFile);
446	tagMapDict[thisEl.el].tagNum = tagNum;
447	}
448
449	fclose (dictFile);
450	}
451
452	static void ReadLevelFile (char *filename) {
453	FILE *f;
454	f = open_file (filename, INVF_LEVEL_SUFFIX, "rb",
455	MAGIC_INVF_LEVELS, MG_ABORT);
456	ivfLevel.Read (f);
457	fclose (f);
458	}
459
460	void CheckIntOverflow (mg_ullong totalIBits, mg_ullong lastTotalIBits) {
461	if (totalIBits < lastTotalIBits) {
462	fprintf(stderr, "ERROR: The totalIBits counter (%d byte unsigned integer) has overflowed.\n", sizeof (mg_ullong));
463	if (sizeof (mg_ullong) < 8) {
464	fprintf(stderr, " Try compiling with GCC to enable use of 8 bytes for this counter.\n");
465	}
466	fprintf(stderr, " Build aborted.\n");
467	exit(1);
468	}
469	}
470
471	// assumes the inverted file state file has been opened
472	static void InitInvfState (char *filename,
473	invf_dict_header &_idh,
474	InvfStateCache &_invfState,
475	mg_ullong &totalIBits,
476	bool wordLevelIndex) {
477	// read in the dictionary, setting inverted state information
478	FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
479	MAGIC_STEM_BUILD, MG_ABORT);
480
481	// seek to the start of the word dictionary
482	fseek (dictFile, _idh.word_dict_start, SEEK_SET);
483
484	// add the word entries
485	word_dict_el wordEl;
486	wordEl.SetNumLevels (_idh.num_levels);
487	unsigned long dictWordNum, p;
488	mg_ullong lastTotalIBits;
489	unsigned long N = _idh.num_frags;
490	for (dictWordNum=0; dictWordNum<_idh.word_dict_size; dictWordNum++) {
491	// lastTotalIBits is used to detect integer overflow
492	lastTotalIBits = totalIBits;
493
494	// read the next word and associated information
495	wordEl.Read (dictFile, _idh.num_levels);
496
497	// update the state record
498	p = wordEl.frag_occur;
499	InvfStateRec &wisr = _invfState.GetRec (dictWordNum);
500	wisr.start = totalIBits;
501	wisr.here = totalIBits;
502	wisr.B = BIO_Bblock_Init (N, p);
503
504	// add the length of the fragment numbers
505	totalIBits += BIO_Bblock_Bound_b (N, p, wisr.B);
506
507	// if needed, add the length of the fragment frequency information
508	if (!wordLevelIndex)
509	totalIBits += BIO_Gamma_Bound (wordEl.freq, wordEl.frag_occur);
510
511	// align next byte
512	#ifdef USE_LONG_LONG
513	totalIBits = (totalIBits + 7ull) & 0xfffffffffffffff8ull;
514	#else
515	totalIBits = (totalIBits + 7ul) & 0xfffffff8ul;
516	#endif
517
518	CheckIntOverflow (totalIBits, lastTotalIBits);
519	}
520
521
522	// seek to the start of the tag dictionary
523	fseek (dictFile, _idh.tag_dict_start, SEEK_SET);
524
525	// add the tag entries
526	dict_el tagEl;
527	unsigned long dictTagNum;
528	N = _idh.num_frags;
529	for (dictTagNum=0; dictTagNum<_idh.tag_dict_size; dictTagNum++) {
530	// lastTotalIBits is used to detect integer overflow
531	lastTotalIBits = totalIBits;
532
533	// read the next tag and associated information
534	tagEl.Read (dictFile);
535
536	// update the state record
537	p = tagEl.frag_occur*2;
538	InvfStateRec &tisr = _invfState.GetRec (dictTagNum + _idh.word_dict_size);
539	tisr.start = totalIBits;
540	tisr.here = totalIBits;
541	tisr.B = BIO_Bblock_Init (N+p, p);
542
543	// add the length of the fragment numbers (two numbers for each
544	// tag, one for start and one for end)
545	totalIBits += BIO_Bblock_Bound_b (N+p, p, tisr.B);
546
547	// align next byte
548	#ifdef USE_LONG_LONG
549	totalIBits = (totalIBits + 7ull) & 0xfffffffffffffff8ull;
550	#else
551	totalIBits = (totalIBits + 7ul) & 0xfffffff8ul;
552	#endif
553
554	CheckIntOverflow (totalIBits, lastTotalIBits);
555	}
556
557	fclose (dictFile);
558	}
559
560	/*
561	// assumes the chunk tag information has been placed in .first
562	static void PrintChunkInfo (unsigned long chunkMem,
563	unsigned long numChunkWords,
564	unsigned long numChunkTags) {
565	static unsigned long chunksRead = 0;
566	chunksRead++;
567	cout << "Chunk Number: " << chunksRead << "\n";
568	cout << "numChunkDocs " << numDocsInChunk << "\n";
569	cout << "numChunkFrags " << numFragsInChunk << "\n";
570	cout << "mem " << chunkMem << "\n";
571	cout << "numWords " << numChunkWords << "\n";
572	cout << "numTags " << numChunkTags << "\n\n";
573
574	TagMapDict::iterator tagMapHere = tagMapDict.begin();
575	TagMapDict::iterator tagMapEnd = tagMapDict.end();
576	while (tagMapHere != tagMapEnd) {
577	unsigned long tagMapNum = (*tagMapHere).second.tagNum;
578	cout << (*tagMapHere).first << " " << tagMapNum << " "
579	<< bitPtrs.GetTagBitPtr(tagMapNum).here << "\n";
580	tagMapHere++;
581	}
582	}
583	*/
584
585	void ReadChunk (invf_dict_header &_idh, bool wordLevelIndex) {
586	// reset globals
587	numChunkDocs = 0;
588	chunkStartFragNum = numFrags;
589
590	// read in information about this chunk
591	numDocsInChunk = chunkBuf.gamma_decode (NULL) - 1;
592	if (numDocsInChunk == 0)
593	FatalError (1, "The number of docs in the current chunk is 0");
594
595	numFragsInChunk = chunkBuf.gamma_decode (NULL) - 1;
596	unsigned long chunkMem = chunkBuf.gamma_decode (NULL) - 1;
597
598	if (chunkMem > ivfMemBufSize)
599	FatalError (1, "Chunk memory size is greater than maximum");
600
601	unsigned long numChunkWords = chunkBuf.gamma_decode (NULL) - 1;
602	unsigned long numChunkTags = chunkBuf.gamma_decode (NULL) - 1;
603
604
605	// reset stuff
606	ClearCharBuf (ivfMemBuf, ivfMemBufSize);
607	bitPtrs.ResetPtrs();
608
609	// read in the entries in occurrence order storing the
610	// "chunkWordCount" in "start" and the "chunkFragCount"
611	// in "here"
612	unsigned long numOccur;
613	unsigned long wordNum;
614	for (numOccur=0; numOccur<numChunkWords; numOccur++) {
615	wordNum = occurConvert.TranslateWord (numOccur);
616	BitPtr &wordPtr = bitPtrs.GetWordBitPtr (wordNum);
617	wordPtr.start = chunkBuf.gamma_decode (NULL) - 1;
618	if (wordPtr.start >= 2)
619	wordPtr.here = chunkBuf.gamma_decode (NULL);
620	else wordPtr.here = wordPtr.start;
621	}
622	unsigned long tagNum;
623	for (numOccur=0; numOccur<numChunkTags; numOccur++) {
624	tagNum = occurConvert.TranslateTag (numOccur);
625	BitPtr &tagPtr = bitPtrs.GetTagBitPtr (tagNum);
626	// only chunkFragCount is encoded for tags
627	tagPtr.start = chunkBuf.gamma_decode (NULL) - 1;
628	tagPtr.here = tagPtr.start;
629	}
630
631	/* PrintChunkInfo (chunkMem, numChunkWords, numChunkTags);*/
632
633	// create the bit ptrs in dictionary order
634	unsigned long totalIBits = 0; // only dealing with memory
635	unsigned long chunkWordCount, chunkFragCount;
636	for (wordNum=0; wordNum<_idh.word_dict_size; wordNum++) {
637	BitPtr &wordPtr = bitPtrs.GetWordBitPtr (wordNum);
638	chunkWordCount = wordPtr.start;
639	chunkFragCount = wordPtr.here;
640	wordPtr.start = totalIBits;
641	wordPtr.here = totalIBits;
642	wordPtr.lastFragNum = chunkStartFragNum;
643	wordPtr.lgB = 0;
644	if (chunkWordCount > 0) {
645	wordPtr.lgB = floorlog_2 (BIO_Bblock_Init_W (numFragsInChunk,
646	chunkFragCount));
647	totalIBits += BIO_Bblock_Bound (numFragsInChunk, chunkFragCount);
648	// use unary encoding for memory buffer encoding of fragment freq
649	if (!wordLevelIndex) {
650	totalIBits += chunkWordCount;
651	}
652	}
653	}
654	for (tagNum=0; tagNum<_idh.tag_dict_size; tagNum++) {
655	BitPtr &tagPtr = bitPtrs.GetTagBitPtr (tagNum);
656	chunkFragCount = tagPtr.here;
657	tagPtr.start = totalIBits;
658	tagPtr.here = totalIBits;
659	tagPtr.lastFragNum = chunkStartFragNum;
660	tagPtr.lgB = 0;
661	if (chunkFragCount > 0) {
662	unsigned long pTag = chunkFragCount*2;
663	tagPtr.lgB = floorlog_2 (BIO_Bblock_Init_W (numFragsInChunk+pTag,
664	pTag));
665	unsigned long bLen = BIO_Bblock_Bound (numFragsInChunk+pTag,
666	pTag);
667	// cout << tagNum + _idh.word_dict_size << " ";
668	// cout << "numFrags: " << numFragsInChunk
669	// << " chunkFragCount: " << chunkFragCount
670	// << " B: " << 1 << tagPtr.lgB
671	// << " blen: " << blen << "\n";
672	totalIBits += bLen;
673	}
674	}
675	bitPtrs.GetEndStart() = totalIBits;
676	bitPtrs.GetEndHere() = totalIBits;
677
678	if ((totalIBits + 7ul) >> 3ul > chunkMem) {
679	cerr << "totalIBits: " << totalIBits << "\n";
680	cerr << "bytes: " << ((totalIBits + 7ul) >> 3ul) << "\n";
681	cerr << "chunkMem: " << chunkMem << "\n";
682	FatalError (1, "Pointers exceed buffer size");
683	}
684	}
685
686
687
688
689	int init_ivf_2 (const TagInfo &/tagInfo/, char *filename) {
690	// read in compressed dictionary header
691	FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
692	MAGIC_STEM_BUILD, MG_ABORT);
693	idh.Read (dictFile);
694	fclose (dictFile);
695
696	// set the size of the bit ptrs
697	bitPtrs.SetSize (idh.word_dict_size, idh.tag_dict_size);
698
699	// open the chunk file and read in the maximum memory needed
700	// for the inverted memory buffer
701	chunkFile = open_file (filename, INVF_CHUNK_SUFFIX, "rb",
702	MAGIC_CHUNK, MG_ABORT);
703	ReadUL (chunkFile, ivfMemBufSize);
704	chunkBuf.attachFile (chunkFile);
705
706	// allocate memory for the inverted buffer
707	ivfMemBuf = new char [ivfMemBufSize];
708	ClearCharBuf (ivfMemBuf, ivfMemBufSize);
709
710	// read in the word dictionary
711	ReadWordDict (filename);
712
713	// read in the tag dictionary
714	ReadTagDict (filename, idh);
715
716	// read in the level information
717	ReadLevelFile (filename);
718	bool wordLevelIndex = ivfLevel.indexLevel.empty();
719
720	// set up the translation table file
721	occurConvert.Open (filename, idh.word_dict_size, idh.tag_dict_size);
722
723	// reset some globals
724	numDocs = 0;
725	numChunkDocs = 0;
726	numDocsInChunk = 0;
727	numFrags = 0;
728	numFragsInChunk = 0;
729	chunkStartFragNum = 0;
730
731	strcpy (collectFilename, filename);
732
733
734	// create the inverted file
735	mg_ullong totalIBits = 0;
736	FILE *invfFile = create_file (filename, INVF_SUFFIX, "wb",
737	MAGIC_INVF, MG_ABORT);
738	totalIBits += sizeof (unsigned long) * 8; // magic number
739	totalIBits += 8 * 200; // 200 byte gap -- why??????
740	fclose (invfFile);
741
742	// init the inverted file state cache
743	invfState.Open (filename);
744	InitInvfState (filename, idh, invfState, totalIBits, wordLevelIndex);
745
746	return COMPALLOK;
747	}
748
749	static void CloseTextTag (IP2TagInfo &tInfo, const UCArray &/tagName/) {
750	if (!tInfo.inTag) return;
751
752	// add this tag to the inverted list
753	BitPtr &tagBitPtr = bitPtrs.GetTagBitPtr (tInfo.tagNum);
754	unsigned long endFrag = numFrags;
755	int b = 1 << tagBitPtr.lgB;
756
757	/*
758	cout << (tInfo.tagNum+idh.word_dict_size) << " \"<" << tagName << ">\" "
759	<< tInfo.startFrag << " " << endFrag << "\n";
760	*/
761
762	mems_bitio_buffer buffer ((u_char *) ivfMemBuf, tagBitPtr.here);
763	buffer.bblock_encode (tInfo.startFrag - tagBitPtr.lastFragNum + 1,
764	b, NULL);
765	buffer.bblock_encode (endFrag - tInfo.startFrag + 1, b, NULL);
766	tagBitPtr.lastFragNum = endFrag;
767	tagBitPtr.here = buffer.position();
768	buffer.encodeDone();
769
770	// check for buffer overrun
771	bitPtrs.CheckTagBufOverrun (tInfo.tagNum);
772
773	// reset information about this tag
774	tInfo.inTag = false;
775	tInfo.startFrag = 0;
776	}
777
778	static void ProcessOpenTag (const TextEl &el, bool &inFrag) {
779	// close tag if open
780	IP2TagInfo &tInfo = tagMapDict[el.tagName];
781	if (tInfo.inTag) CloseTextTag (tInfo, el.tagName);
782
783	// open this tag
784	tInfo.inTag = true;
785	tInfo.startFrag = numFrags;
786
787	// check for start of next fragment
788	bool wordLevelIndex = ivfLevel.indexLevel.empty();
789	if (!wordLevelIndex && el.tagName == ivfLevel.indexLevel) {
790	numFrags++;
791	inFrag = true;
792	}
793	}
794
795	static void ProcessCloseTag (const TextEl &el, bool &inFrag) {
796	// check for end of fragment
797	bool wordLevelIndex = ivfLevel.indexLevel.empty();
798	if (!wordLevelIndex && el.tagName == ivfLevel.indexLevel) {
799	inFrag = false;
800	}
801
802	IP2TagInfo &tInfo = tagMapDict[el.tagName];
803	CloseTextTag (tInfo, el.tagName);
804	}
805
806	static void ProcessText (const TextEl &el, bool &inFrag) {
807	// make sure this text is to be indexed
808	bool wordLevelIndex = ivfLevel.indexLevel.empty();
809	if (!wordLevelIndex && !inFrag) return;
810
811	const unsigned char *textHere = el.text.begin();
812	const unsigned char *textEnd = el.text.end() - 1;
813	unsigned char mgWord[MAXSTEMLEN + 1];
814
815	if (!inaword (textHere, textEnd))
816	ParseNonindexWord (textHere, textEnd);
817
818
819	// Alternately parse off words and non-words from the input
820
821	while (textHere <= textEnd) {
822	textHere = ParseIndexMGWord (textHere, textEnd, mgWord);
823	textHere = ParseNonindexWord (textHere, textEnd);
824
825	if (mgWord[0] > 0) {
826	if (wordLevelIndex) numFrags++;
827
828	unsigned long wordNum = perf_hash (wordHashDict, mgWord);
829
830	/*
831	cout << wordNum << " \"";
832	cout.write (mgWord+1, *mgWord);
833	cout << "\" " << numFrags << "\n";
834	*/
835
836	// add this word to the inverted list
837	BitPtr &wordBitPtr = bitPtrs.GetWordBitPtr (wordNum);
838	unsigned long fragNum = numFrags;
839	int b = 1 << wordBitPtr.lgB;
840
841	mems_bitio_buffer buffer ((u_char *) ivfMemBuf, wordBitPtr.here);
842
843	// note: this assumes that fragments don't carry over between
844	// chunks (which they don't because all tags are closed at the
845	// end of each document and chunks are based on document
846	// boundaries), i.e. the first fragment number must be greater
847	// than the starting fragment number of the chunk.
848	if (fragNum > wordBitPtr.lastFragNum) {
849	buffer.bblock_encode ((fragNum - wordBitPtr.lastFragNum - 1) + 1,
850	b, NULL);
851	if (!wordLevelIndex) buffer.encodeBit (1); // freq = 1
852
853	} else if (!wordLevelIndex) {
854	// add one to the frequency count for this word
855	buffer.seek (buffer.position()-1);
856	buffer.encodeBit (0); // unary encoding -- last = 1
857	buffer.encodeBit (1);
858	}
859
860	wordBitPtr.lastFragNum = fragNum;
861	wordBitPtr.here = buffer.position();
862	buffer.encodeDone();
863
864	// check for buffer overrun
865	bitPtrs.CheckWordBufOverrun (wordNum);
866	}
867	}
868	}
869
870	// combine the in memory inverted buffer with the disk
871	// based inverted file
872	static void DiskMerge (char *filename) {
873	bool wordLevelIndex = ivfLevel.indexLevel.empty();
874
875	// make sure we have something to process
876	if (numChunkDocs <= 0) return;
877
878	// open the inverted file
879	FILE *invfFile = open_file (filename, INVF_SUFFIX, "rb+",
880	MAGIC_INVF, MG_ABORT);
881	random_bitio_buffer invfOutBuf (invfFile);
882
883	// set up to decode the entries in memory
884	mems_bitio_buffer memInBuf ((u_char *) ivfMemBuf, 0);
885
886	// write out the word information
887	unsigned long wordNum;
888	int b;
889	unsigned long currFragNum;
890	unsigned long delta;
891	unsigned long currFreq;
892	for (wordNum=0; wordNum<idh.word_dict_size; wordNum++) {
893	// go to the end of the last inverted file entry
894	InvfStateRec &wordDiskState = invfState.GetRec (wordNum);
895	invfOutBuf.SEEK_X (wordDiskState.here);
896
897	// go to the start of the inverted chunk info in memory
898	BitPtr &wordBitPtr = bitPtrs.GetWordBitPtr (wordNum);
899	memInBuf.seek(wordBitPtr.start);
900
901	// decode each entry and re-write to disk
902	currFragNum = chunkStartFragNum;
903	while (memInBuf.position() < wordBitPtr.here) {
904	// decode word entry
905	b = 1 << wordBitPtr.lgB;
906	delta = memInBuf.bblock_decode (b, NULL);
907	currFragNum += delta;
908	if (!wordLevelIndex) currFreq = memInBuf.unary_decode (NULL);
909	else currFreq = 1;
910
911	// recode on disk
912	invfOutBuf.bblock_encode (currFragNum-wordDiskState.lastFragNum,
913	wordDiskState.B, NULL);
914	if (!wordLevelIndex) invfOutBuf.gamma_encode (currFreq, NULL);
915	wordDiskState.lastFragNum = currFragNum;
916	}
917
918	wordDiskState.here = invfOutBuf.TELL_X();
919	}
920
921	// write out the tag information
922	unsigned long tagNum;
923	unsigned long currTagStart;
924	unsigned long currTagEnd;
925	for (tagNum=0; tagNum<idh.tag_dict_size; tagNum++) {
926	// go to the end of the last inverted file entry
927	InvfStateRec &tagDiskState = invfState.GetRec (tagNum+idh.word_dict_size);
928	invfOutBuf.SEEK_X (tagDiskState.here);
929
930	// go to the start of the inverted chunk info in memory
931	BitPtr &tagBitPtr = bitPtrs.GetTagBitPtr (tagNum);
932	memInBuf.seek(tagBitPtr.start);
933
934	// decode each entry and re-write to disk
935	currTagEnd = chunkStartFragNum;
936	while (memInBuf.position() < tagBitPtr.here) {
937	// decode tag entry
938	b = 1 << tagBitPtr.lgB;
939	delta = memInBuf.bblock_decode (b, NULL) - 1;
940	currTagStart = currTagEnd + delta;
941	delta = memInBuf.bblock_decode (b, NULL) - 1;
942	currTagEnd = currTagStart + delta;
943
944	// recode on disk
945	invfOutBuf.bblock_encode (currTagStart-tagDiskState.lastFragNum+1,
946	tagDiskState.B, NULL);
947	invfOutBuf.bblock_encode (currTagEnd-currTagStart+1,
948	tagDiskState.B, NULL);
949
950	tagDiskState.lastFragNum = currTagEnd;
951	}
952
953	tagDiskState.here = invfOutBuf.TELL_X();
954	}
955
956	memInBuf.done();
957
958	invfOutBuf.encodeDone();
959	fclose (invfFile);
960	}
961
962
963	int process_ivf_2 (const TagInfo &/tagInfo/, const TextElArray &doc) {
964	bool wordLevelIndex = ivfLevel.indexLevel.empty();
965	bool inFrag = false;
966	if (wordLevelIndex) inFrag = true; // unconditional
967
968	// get next chunk information if need to. the chunk information
969	// is needed before the first document is processed
970	if (numChunkDocs >= numDocsInChunk) ReadChunk (idh, wordLevelIndex);
971
972	// process each text element
973	TextElArray::const_iterator here = doc.begin();
974	TextElArray::const_iterator end = doc.end();
975	while (here != end) {
976	// process this element
977	if ((here).elType == OpenTagE) ProcessOpenTag (here, inFrag);
978	else if ((here).elType == CloseTagE) ProcessCloseTag (here, inFrag);
979	else ProcessText (*here, inFrag);
980
981	here++;
982	}
983
984	// close off any unclosed tags
985	TagMapDict::iterator tdHere = tagMapDict.begin();
986	TagMapDict::iterator tdEnd = tagMapDict.end();
987	while (tdHere != tdEnd) {
988	CloseTextTag ((tdHere).second, (tdHere).first);
989	tdHere++;
990	}
991
992	// we've processed one more document
993	numDocs++;
994	numChunkDocs++;
995
996	// merge the memory based inverted file with the one on
997	// disk if this is the end of this chunk
998	if (numChunkDocs >= numDocsInChunk) DiskMerge (collectFilename);
999
1000	return COMPALLOK;
1001	}
1002
1003
1004	static void CondenseInvfFile (char *filename, unsigned long &bytesOutput) {
1005	FILE *inInvfFile = open_file (filename, INVF_SUFFIX, "rb",
1006	MAGIC_INVF, MG_ABORT);
1007	FILE *outInvfFile = open_file (filename, INVF_SUFFIX, "rb+",
1008	MAGIC_INVF, MG_ABORT);
1009
1010	// skip the magic number
1011	fseek (outInvfFile, sizeof (unsigned long), SEEK_SET);
1012
1013	// write the inverted file header -- use defaults for most things
1014	invf_file_header ifh;
1015	ifh.no_of_words = idh.word_dict_size;
1016	ifh.no_of_tags = idh.tag_dict_size;
1017	ifh.word_level_index = (ivfLevel.indexLevel.empty()) ? 1 : 0;
1018	ifh.Write (outInvfFile);
1019
1020	bytesOutput = ftell (outInvfFile);
1021
1022	// process each meaningful byte in the file
1023	unsigned long numEntries = ifh.no_of_words + ifh.no_of_tags;
1024	unsigned long entryNum;
1025	mg_ullong lastStart = 0;
1026	for (entryNum = 0; entryNum < numEntries; entryNum++) {
1027	InvfStateRec &stateRec = invfState.GetRec (entryNum);
1028
1029	// overrun check
1030	if (stateRec.start < lastStart)
1031	FatalError (1, "Inverted file Buffer overrun");
1032	lastStart = stateRec.start;
1033
1034	unsigned long oldEntryStart = stateRec.start >> 3;
1035	unsigned long oldEntryStartOver = stateRec.start & 7; // should be 0
1036	unsigned long oldEntryEnd = (stateRec.here + 7) >> 3; // byte after end
1037	unsigned long oldEntryEndOver = stateRec.here & 7;
1038
1039	fseek (inInvfFile, oldEntryStart, SEEK_SET);
1040
1041	stateRec.here -= stateRec.start;
1042	stateRec.start = bytesOutput * 8 + oldEntryStartOver;
1043	stateRec.here += stateRec.start;
1044	while (oldEntryStart < oldEntryEnd) {
1045	unsigned char c = getc (inInvfFile);
1046	if (oldEntryStart == oldEntryEnd - 1) {
1047	u_char ands[8] =
1048	{0xff, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe};
1049	c &= ands[oldEntryEndOver];
1050	}
1051	putc (c, outInvfFile);
1052	bytesOutput++;
1053	oldEntryStart++;
1054	}
1055	}
1056
1057	fclose (inInvfFile);
1058
1059	#ifdef __WIN32__
1060	if (!(_chsize (_fileno (outInvfFile), bytesOutput)))
1061	Message ("Could not truncate invf.");
1062	#else
1063	ftruncate (fileno (outInvfFile), bytesOutput);
1064	#endif
1065
1066	fclose (outInvfFile);
1067	}
1068
1069	static void OutputInvfIdx (char *filename, unsigned long invfNumBytes) {
1070	FILE *invfIdxFile = create_file (filename, INVF_IDX_SUFFIX, "wb",
1071	MAGIC_INVI, MG_ABORT);
1072
1073	// process each meaningful byte in the file
1074	unsigned long numEntries = idh.word_dict_size + idh.tag_dict_size;
1075	unsigned long entryNum;
1076	for (entryNum = 0; entryNum < numEntries; entryNum++) {
1077	InvfStateRec &stateRec = invfState.GetRec (entryNum);
1078
1079	// assumes that inverted entries start at beginning of each byte
1080	if (!WriteUL (invfIdxFile, (stateRec.start >> 3))) break;
1081	}
1082
1083	WriteUL (invfIdxFile, invfNumBytes);
1084
1085	fclose (invfIdxFile);
1086	}
1087
1088
1089	int done_ivf_2 (const TagInfo &/tagInfo/, char *filename) {
1090	// close most open files
1091	if (chunkFile != NULL) {
1092	chunkBuf.done();
1093	fclose (chunkFile);
1094	chunkFile = NULL;
1095	}
1096	occurConvert.Close();
1097
1098	// free allocated memory
1099	bitPtrs.Clear();
1100	if (ivfMemBuf != NULL) { delete [] ivfMemBuf; ivfMemBuf = NULL; }
1101	free_perf_hash (wordHashDict);
1102	wordHashDict = NULL;
1103	tagMapDict.erase (tagMapDict.begin(), tagMapDict.end());
1104
1105	// condense the inverted file and truncate it
1106	// this function also writes out the inverted header
1107	unsigned long invfNumBytes = 0;
1108	CondenseInvfFile (filename, invfNumBytes);
1109
1110	OutputInvfIdx (filename, invfNumBytes);
1111
1112	// close the rest of the open files
1113	invfState.Close ();
1114
1115	return COMPALLOK;
1116	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: