Context Navigation

source: indexers/trunk/mgpp/text/ivf.pass2.cpp@ 20847

Last change on this file since 20847 was 19822, checked in by mdewsnip, 15 years ago

Commented out all occurrences of

#define _XOPEN_SOURCE_EXTENDED 1

This was allegedly added for compilation on Solaris, but it just causes errors for me (on the NLNZ Solaris machines).

Property svn:executable set to *
Property svn:keywords set to Author Date Id Revision

File size: 31.4 KB

Line
1	/**************************************************************************
2	*
3	* ivf.pass2.cpp -- Memory efficient pass 2 inversion
4	* Copyright (C) 1999 Rodger McNab
5	*
6	* This program is free software; you can redistribute it and/or modify
7	* it under the terms of the GNU General Public License as published by
8	* the Free Software Foundation; either version 2 of the License, or
9	* (at your option) any later version.
10	*
11	* This program is distributed in the hope that it will be useful,
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	* GNU General Public License for more details.
15	*
16	* You should have received a copy of the GNU General Public License
17	* along with this program; if not, write to the Free Software
18	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19	*
20	**************************************************************************/
21
22	#define _XOPEN_SOURCE 1
23	// This was added for Solaris, but it makes things worse on Solaris for me...
24	// #define _XOPEN_SOURCE_EXTENDED 1
25
26	#include <stdio.h>
27
28	#if defined __WIN32__
29	# include <process.h>
30	# include <io.h>
31	# define getpid _getpid
32	# define unlink _unlink
33	#else
34	# include <unistd.h>
35	# include "non_ansi.h"
36	#endif
37
38	#include "UCArray.h"
39	#include "sysfuncs.h"
40	#include "mg_files.h"
41	#include "invf.h"
42	#include "mg.h"
43	#include "build.h"
44	#include "locallib.h"
45	#include "bitio_m_random.h"
46	#include "bitio_m_stdio.h"
47	#include "bitio_m_mems.h"
48	#include "bitio_gen.h"
49	#include <stdio.h>
50	#include "words.h"
51	#include "messages.h"
52	#include "netorder.h"
53	#include "FIvfLevelInfo.h"
54	#include "perf_hash.h"
55	#include "string.h"
56
57	#include "longlong.h"
58
59	#if defined(GSDL_USE_OBJECTSPACE)
60	# include <ospace\std\map>
61	#elif defined(GSDL_USE_STL_H)
62	# include <map.h>
63	#else
64	# include <map>
65	#endif
66
67
68	#ifdef USE_LONG_LONG
69	#define SEEK_X seek_LL
70	#define TELL_X tell_LL
71	#else
72	#define SEEK_X seek
73	#define TELL_X tell
74	#endif
75
76	#ifndef RND_BUF_SIZE
77	#define RND_BUF_SIZE 8*1024
78	#endif
79
80
81	static unsigned long numDocs = 0;
82	static unsigned long numChunkDocs = 0;
83	static unsigned long numDocsInChunk = 0;
84
85	static unsigned long numFrags = 0;
86	static unsigned long numFragsInChunk = 0;
87	static unsigned long chunkStartFragNum = 0;
88
89
90
91
92	struct BitPtr {
93	unsigned long start;
94	unsigned long here;
95	unsigned long lastFragNum;
96	unsigned long lgB;
97
98	void Clear () { start = here = lastFragNum = lgB = 0; }
99	BitPtr () { Clear(); }
100	};
101
102	class WordBitPtrs {
103	protected:
104	unsigned long numWords;
105	unsigned long numTags;
106	unsigned long size;
107	BitPtr *wordBitPtrs;
108
109	void CheckBufOverrun (unsigned long num) {
110	if (wordBitPtrs[num].here > wordBitPtrs[num+1].start) {
111	cerr << "numDocs: " << numDocs << "\n";
112	cerr << "numChunkDocs: " << numChunkDocs << "\n";
113	cerr << "numDocsInChunk: " << numDocsInChunk << "\n";
114	cerr << "numFrags: " << numFrags << "\n";
115	cerr << "numFragsInChunk: " << numFragsInChunk << "\n";
116	cerr << "chunkStartFragNum: " << chunkStartFragNum << "\n";
117	cerr << "num: " << num << "\n";
118	cerr << "[num].start: " << wordBitPtrs[num].start << "\n";
119	cerr << "[num].here: " << wordBitPtrs[num].here << "\n";
120	cerr << "[num+1].start: " << wordBitPtrs[num+1].start << "\n";
121	FatalError (1, "Bit buffer overrun");
122	}
123	}
124
125	public:
126	void Clear ();
127	WordBitPtrs () { wordBitPtrs = NULL; Clear(); }
128	~WordBitPtrs ();
129	void SetSize (unsigned long _numWords,
130	unsigned long _numTags);
131
132	void ResetPtrs () {
133	if (wordBitPtrs == NULL) return;
134	unsigned long i;
135	for (i=0; i<size; ++i) wordBitPtrs[i].Clear();
136	}
137
138	BitPtr &GetWordBitPtr (unsigned long wordNum)
139	{ return wordBitPtrs[wordNum]; }
140	unsigned long &GetWordStart (unsigned long wordNum)
141	{ return wordBitPtrs[wordNum].start; }
142	unsigned long &GetWordHere (unsigned long wordNum)
143	{ return wordBitPtrs[wordNum].here; }
144	void CheckWordBufOverrun (unsigned long wordNum)
145	{ CheckBufOverrun (wordNum); }
146
147	BitPtr &GetTagBitPtr (unsigned long tagNum)
148	{ return wordBitPtrs[tagNum + numWords]; }
149	unsigned long &GetTagStart (unsigned long tagNum)
150	{ return wordBitPtrs[tagNum + numWords].start; }
151	unsigned long &GetTagHere (unsigned long tagNum)
152	{ return wordBitPtrs[tagNum + numWords].here; }
153	void CheckTagBufOverrun (unsigned long tagNum)
154	{ CheckBufOverrun (tagNum + numWords); }
155
156	BitPtr &GetEndBitPtr ()
157	{ return wordBitPtrs[size-1]; }
158	unsigned long &GetEndStart ()
159	{ return wordBitPtrs[size-1].start; }
160	unsigned long &GetEndHere ()
161	{ return wordBitPtrs[size-1].here; }
162	};
163
164
165	struct IP2TagInfo {
166	bool inTag;
167	unsigned long startFrag;
168	unsigned long tagNum;
169
170	IP2TagInfo () {
171	inTag = false;
172	startFrag = 0;
173	tagNum = 0;
174	}
175	};
176
177	// maps tags to tag information
178	typedef map<UCArray, IP2TagInfo, DictLTUCArray> TagMapDict;
179
180
181	// class to handle the translation of occurrence order
182	// to dictionary order for words and tags
183	class OccurToDictConverter {
184	protected:
185	unsigned long pos;
186	unsigned long val;
187	FILE *transFile;
188	random_bitio_buffer rbs;
189
190	unsigned long wordDictSize;
191	unsigned long tagDictSize;
192
193	void SeekStart ();
194	unsigned long TranslateNum (unsigned long num);
195
196	public:
197	OccurToDictConverter ();
198	~OccurToDictConverter ();
199
200	void Open (char *filename, unsigned long _wordDictSize,
201	unsigned long _tagDictSize);
202
203	// Close frees all allocated memory
204	void Close ();
205
206	unsigned long TranslateWord (unsigned long occurNum)
207	{ return TranslateNum (occurNum); }
208	unsigned long TranslateTag (unsigned long occurNum)
209	{ return TranslateNum (occurNum+wordDictSize); }
210	};
211
212
213	struct InvfStateRec {
214	mg_ullong start;
215	mg_ullong here;
216	unsigned long lastFragNum;
217	unsigned long B;
218
219	void Clear () {
220	start = here = 0;
221	lastFragNum = B = 0;
222	}
223	InvfStateRec () { Clear (); }
224	};
225
226
227	#define ISR_SIZE 1024
228
229	class InvfStateCache {
230	protected:
231	InvfStateRec recCache [ISR_SIZE];
232	unsigned long startNum;
233
234	FILE *stateFile;
235
236	void ClearCache () {
237	unsigned int i = 0;
238	for (i=0; i<ISR_SIZE; ++i) recCache[i].Clear();
239	}
240
241	public:
242	InvfStateCache ();
243	~InvfStateCache ();
244
245	void Open (char *filename);
246	void Close ();
247
248	// previous references to state records may be
249	// invalidated calling GetRec
250	InvfStateRec &GetRec (unsigned long num);
251	};
252
253
254	static invf_dict_header idh;
255	static WordBitPtrs bitPtrs;
256
257	static FILE *chunkFile = NULL;
258	static stdio_bitio_buffer chunkBuf;
259
260	static unsigned long ivfMemBufSize = 0;
261	static char *ivfMemBuf = NULL;
262
263	// word and tag dictionaries. a map is used for the tag dictionary
264	// as it should never be very big (and the perfect hash function
265	// sometimes has trouble with small values).
266	static perf_hash_data *wordHashDict = NULL;
267	static TagMapDict tagMapDict;
268
269	// information about all the different levels
270	static FIvfLevel ivfLevel;
271
272	static OccurToDictConverter occurConvert;
273
274	// information about the state of the inverted file
275	static InvfStateCache invfState;
276
277	static char collectFilename[512];
278
279
280	void WordBitPtrs::Clear () {
281	numWords = 0;
282	numTags = 0;
283	size=0;
284	if (wordBitPtrs != NULL) delete [] wordBitPtrs;
285	wordBitPtrs = NULL;
286	}
287
288	WordBitPtrs::~WordBitPtrs () {
289	if (wordBitPtrs != NULL) delete [] wordBitPtrs;
290	}
291
292	void WordBitPtrs::SetSize (unsigned long _numWords,
293	unsigned long _numTags){
294	Clear();
295	numWords = _numWords;
296	numTags = _numTags;
297	size = numWords + numTags + 1;
298	wordBitPtrs = new BitPtr [size];
299	}
300
301
302	void OccurToDictConverter::SeekStart () {
303	if (transFile == NULL) return;
304	rbs.SEEK_X (sizeof (unsigned long) * 8);
305	pos = 0;
306	}
307
308	unsigned long OccurToDictConverter::TranslateNum (unsigned long num) {
309	if (num < pos) SeekStart ();
310	while (pos <= num) {
311	if (pos < wordDictSize)
312	val = rbs.binary_decode (wordDictSize + 1, NULL) - 1;
313	else
314	val = rbs.binary_decode (tagDictSize + 1, NULL) - 1;
315	++pos;
316	}
317	return val;
318	}
319
320	OccurToDictConverter::OccurToDictConverter () {
321	pos = 0;
322	val = 0;
323	transFile = NULL;
324	wordDictSize = 0;
325	tagDictSize = 0;
326	}
327
328	OccurToDictConverter::~OccurToDictConverter () {
329	if (transFile != NULL) Close ();
330	}
331
332	void OccurToDictConverter::Open (char *filename, unsigned long _wordDictSize,
333	unsigned long _tagDictSize) {
334	if (transFile != NULL) Close ();
335
336	wordDictSize = _wordDictSize;
337	tagDictSize = _tagDictSize;
338
339	transFile = open_file (filename, INVF_CHUNK_TRANS_SUFFIX, "rb",
340	MAGIC_CHUNK_TRANS, MG_ABORT);
341	rbs.attachFile (transFile, RND_BUF_SIZE);
342	SeekStart ();
343	val = 0;
344	}
345
346	void OccurToDictConverter::Close () {
347	if (transFile == NULL) return;
348
349	rbs.done ();
350	fclose (transFile);
351	transFile = NULL;
352	pos = 0;
353	val = 0;
354
355	wordDictSize = 0;
356	tagDictSize = 0;
357	}
358
359
360
361
362	InvfStateCache::InvfStateCache () {
363	startNum = 0;
364	stateFile = NULL;
365	}
366
367	InvfStateCache::~InvfStateCache () {
368	if (stateFile != NULL) Close ();
369	}
370
371	void InvfStateCache::Open (char *filename) {
372	if (stateFile != NULL) Close();
373
374	// open the state file
375	char path[512];
376	sprintf (path, FILE_NAME_FORMAT ".%ld", get_basepath (), filename,
377	".invf.state", (long) getpid ());
378	if (!(stateFile = fopen (path, "wb+"))) {
379	Message ("Unable to create \"%s\"", path);
380	exit (1);
381	}
382	unlink (path); // file will be deleted after it is closed
383	// reset the buffer
384	startNum = 0;
385	ClearCache();
386	}
387
388	void InvfStateCache::Close () {
389	if (stateFile == NULL) return;
390	fclose (stateFile);
391	stateFile = NULL;
392	startNum = 0;
393	}
394
395	InvfStateRec &InvfStateCache::GetRec (unsigned long num) {
396	// see if cached
397	if ((num >= startNum) && (num < startNum + ISR_SIZE))
398	return recCache[num-startNum];
399
400	// not cached, write out this lot of records and read in
401	fseek (stateFile, startNum*sizeof (InvfStateRec), SEEK_SET);
402	fwrite ((char *) recCache, sizeof (InvfStateRec), ISR_SIZE, stateFile);
403
404	// read in the new set of records
405	ClearCache ();
406	startNum = num - (num % ISR_SIZE);
407	fseek (stateFile, startNum*sizeof (InvfStateRec), SEEK_SET);
408	fread ((char *) recCache, sizeof (InvfStateRec), ISR_SIZE, stateFile);
409
410	return recCache[num-startNum];
411	}
412
413
414
415	static void ClearCharBuf (char *buf, unsigned long size) {
416	char *end = buf + size;
417	while (buf != end) *buf++ = 0;
418	}
419
420	static void ReadWordDict (char *filename) {
421	// read in the perfect hash function for the word dictionary
422	FILE *wordHashFile = open_file (filename, INVF_DICT_HASH_SUFFIX, "rb",
423	MAGIC_HASH, MG_ABORT);
424	if (!(wordHashDict = read_perf_hash_data (wordHashFile))) {
425	FatalError (1, "Unable to read in hash data for word dictionary");
426	}
427	fclose (wordHashFile);
428	}
429
430	static void ReadTagDict (char *filename, invf_dict_header &_idh) {
431	// open the file
432	FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
433	MAGIC_STEM_BUILD, MG_ABORT);
434
435	// seek to the start of the tag dictionary
436	fseek (dictFile, _idh.tag_dict_start, SEEK_SET);
437
438	unsigned long tagNum;
439	dict_el thisEl;
440	for (tagNum = 0; tagNum < _idh.tag_dict_size; ++tagNum) {
441	thisEl.Read (dictFile);
442	tagMapDict[thisEl.el].tagNum = tagNum;
443	}
444
445	fclose (dictFile);
446	}
447
448	static void ReadLevelFile (char *filename) {
449	FILE *f;
450	f = open_file (filename, INVF_LEVEL_SUFFIX, "rb",
451	MAGIC_INVF_LEVELS, MG_ABORT);
452	ivfLevel.Read (f);
453	fclose (f);
454	}
455
456	void CheckIntOverflow (mg_ullong totalIBits, mg_ullong lastTotalIBits) {
457	if (totalIBits < lastTotalIBits) {
458	fprintf(stderr, "ERROR: The totalIBits counter (%d byte unsigned integer) has overflowed.\n", sizeof (mg_ullong));
459	if (sizeof (mg_ullong) < 8) {
460	fprintf(stderr, " Try compiling with GCC to enable use of 8 bytes for this counter.\n");
461	}
462	fprintf(stderr, " Build aborted.\n");
463	exit(1);
464	}
465	}
466
467	// assumes the inverted file state file has been opened
468	static void InitInvfState (char *filename,
469	invf_dict_header &_idh,
470	InvfStateCache &_invfState,
471	mg_ullong &totalIBits,
472	bool wordLevelIndex) {
473	// read in the dictionary, setting inverted state information
474	FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
475	MAGIC_STEM_BUILD, MG_ABORT);
476
477	// seek to the start of the word dictionary
478	fseek (dictFile, _idh.word_dict_start, SEEK_SET);
479
480	// add the word entries
481	word_dict_el wordEl;
482	wordEl.SetNumLevels (_idh.num_levels);
483	unsigned long dictWordNum, p;
484	mg_ullong lastTotalIBits;
485	unsigned long N = _idh.num_frags;
486	for (dictWordNum=0; dictWordNum<_idh.word_dict_size; ++dictWordNum) {
487	// lastTotalIBits is used to detect integer overflow
488	lastTotalIBits = totalIBits;
489
490	// read the next word and associated information
491	wordEl.Read (dictFile, _idh.num_levels);
492
493	// update the state record
494	p = wordEl.frag_occur;
495	InvfStateRec &wisr = _invfState.GetRec (dictWordNum);
496	wisr.start = totalIBits;
497	wisr.here = totalIBits;
498	wisr.B = BIO_Bblock_Init (N, p);
499
500	// add the length of the fragment numbers
501	totalIBits += BIO_Bblock_Bound_b (N, p, wisr.B);
502
503	// if needed, add the length of the fragment frequency information
504	if (!wordLevelIndex)
505	totalIBits += BIO_Gamma_Bound (wordEl.freq, wordEl.frag_occur);
506
507	// align next byte
508	#ifdef USE_LONG_LONG
509	totalIBits = (totalIBits + 7ull) & 0xfffffffffffffff8ull;
510	#else
511	totalIBits = (totalIBits + 7ul) & 0xfffffff8ul;
512	#endif
513
514	CheckIntOverflow (totalIBits, lastTotalIBits);
515	}
516
517
518	// seek to the start of the tag dictionary
519	fseek (dictFile, _idh.tag_dict_start, SEEK_SET);
520
521	// add the tag entries
522	dict_el tagEl;
523	unsigned long dictTagNum;
524	N = _idh.num_frags;
525	for (dictTagNum=0; dictTagNum<_idh.tag_dict_size; ++dictTagNum) {
526	// lastTotalIBits is used to detect integer overflow
527	lastTotalIBits = totalIBits;
528
529	// read the next tag and associated information
530	tagEl.Read (dictFile);
531
532	// update the state record
533	p = tagEl.frag_occur*2;
534	InvfStateRec &tisr = _invfState.GetRec (dictTagNum + _idh.word_dict_size);
535	tisr.start = totalIBits;
536	tisr.here = totalIBits;
537	tisr.B = BIO_Bblock_Init (N+p, p);
538
539	// add the length of the fragment numbers (two numbers for each
540	// tag, one for start and one for end)
541	totalIBits += BIO_Bblock_Bound_b (N+p, p, tisr.B);
542
543	// align next byte
544	#ifdef USE_LONG_LONG
545	totalIBits = (totalIBits + 7ull) & 0xfffffffffffffff8ull;
546	#else
547	totalIBits = (totalIBits + 7ul) & 0xfffffff8ul;
548	#endif
549
550	CheckIntOverflow (totalIBits, lastTotalIBits);
551	}
552
553	fclose (dictFile);
554	}
555
556	/*
557	// assumes the chunk tag information has been placed in .first
558	static void PrintChunkInfo (unsigned long chunkMem,
559	unsigned long numChunkWords,
560	unsigned long numChunkTags) {
561	static unsigned long chunksRead = 0;
562	++chunksRead;
563	cout << "Chunk Number: " << chunksRead << "\n";
564	cout << "numChunkDocs " << numDocsInChunk << "\n";
565	cout << "numChunkFrags " << numFragsInChunk << "\n";
566	cout << "mem " << chunkMem << "\n";
567	cout << "numWords " << numChunkWords << "\n";
568	cout << "numTags " << numChunkTags << "\n\n";
569
570	TagMapDict::iterator tagMapHere = tagMapDict.begin();
571	TagMapDict::iterator tagMapEnd = tagMapDict.end();
572	while (tagMapHere != tagMapEnd) {
573	unsigned long tagMapNum = (*tagMapHere).second.tagNum;
574	cout << (*tagMapHere).first << " " << tagMapNum << " "
575	<< bitPtrs.GetTagBitPtr(tagMapNum).here << "\n";
576	++tagMapHere;
577	}
578	}
579	*/
580
581	void ReadChunk (invf_dict_header &_idh, bool wordLevelIndex) {
582	// reset globals
583	numChunkDocs = 0;
584	chunkStartFragNum = numFrags;
585
586	// read in information about this chunk
587	numDocsInChunk = chunkBuf.gamma_decode (NULL) - 1;
588	if (numDocsInChunk == 0)
589	FatalError (1, "The number of docs in the current chunk is 0");
590
591	numFragsInChunk = chunkBuf.gamma_decode (NULL) - 1;
592	unsigned long chunkMem = chunkBuf.gamma_decode (NULL) - 1;
593
594	if (chunkMem > ivfMemBufSize)
595	FatalError (1, "Chunk memory size is greater than maximum");
596
597	unsigned long numChunkWords = chunkBuf.gamma_decode (NULL) - 1;
598	unsigned long numChunkTags = chunkBuf.gamma_decode (NULL) - 1;
599
600
601	// reset stuff
602	ClearCharBuf (ivfMemBuf, ivfMemBufSize);
603	bitPtrs.ResetPtrs();
604
605	// read in the entries in occurrence order storing the
606	// "chunkWordCount" in "start" and the "chunkFragCount"
607	// in "here"
608	unsigned long numOccur;
609	unsigned long wordNum;
610	for (numOccur=0; numOccur<numChunkWords; ++numOccur) {
611	wordNum = occurConvert.TranslateWord (numOccur);
612	BitPtr &wordPtr = bitPtrs.GetWordBitPtr (wordNum);
613	wordPtr.start = chunkBuf.gamma_decode (NULL) - 1;
614	if (wordPtr.start >= 2)
615	wordPtr.here = chunkBuf.gamma_decode (NULL);
616	else wordPtr.here = wordPtr.start;
617	}
618	unsigned long tagNum;
619	for (numOccur=0; numOccur<numChunkTags; ++numOccur) {
620	tagNum = occurConvert.TranslateTag (numOccur);
621	BitPtr &tagPtr = bitPtrs.GetTagBitPtr (tagNum);
622	// only chunkFragCount is encoded for tags
623	tagPtr.start = chunkBuf.gamma_decode (NULL) - 1;
624	tagPtr.here = tagPtr.start;
625	}
626
627	/* PrintChunkInfo (chunkMem, numChunkWords, numChunkTags);*/
628
629	// create the bit ptrs in dictionary order
630	unsigned long totalIBits = 0; // only dealing with memory
631	unsigned long chunkWordCount, chunkFragCount;
632	for (wordNum=0; wordNum<_idh.word_dict_size; ++wordNum) {
633	BitPtr &wordPtr = bitPtrs.GetWordBitPtr (wordNum);
634	chunkWordCount = wordPtr.start;
635	chunkFragCount = wordPtr.here;
636	wordPtr.start = totalIBits;
637	wordPtr.here = totalIBits;
638	wordPtr.lastFragNum = chunkStartFragNum;
639	wordPtr.lgB = 0;
640	if (chunkWordCount > 0) {
641	wordPtr.lgB = floorlog_2 (BIO_Bblock_Init_W (numFragsInChunk,
642	chunkFragCount));
643	totalIBits += BIO_Bblock_Bound (numFragsInChunk, chunkFragCount);
644	// use unary encoding for memory buffer encoding of fragment freq
645	if (!wordLevelIndex) {
646	totalIBits += chunkWordCount;
647	}
648	}
649	}
650	for (tagNum=0; tagNum<_idh.tag_dict_size; ++tagNum) {
651	BitPtr &tagPtr = bitPtrs.GetTagBitPtr (tagNum);
652	chunkFragCount = tagPtr.here;
653	tagPtr.start = totalIBits;
654	tagPtr.here = totalIBits;
655	tagPtr.lastFragNum = chunkStartFragNum;
656	tagPtr.lgB = 0;
657	if (chunkFragCount > 0) {
658	unsigned long pTag = chunkFragCount*2;
659	tagPtr.lgB = floorlog_2 (BIO_Bblock_Init_W (numFragsInChunk+pTag,
660	pTag));
661	unsigned long bLen = BIO_Bblock_Bound (numFragsInChunk+pTag,
662	pTag);
663	// cout << tagNum + _idh.word_dict_size << " ";
664	// cout << "numFrags: " << numFragsInChunk
665	// << " chunkFragCount: " << chunkFragCount
666	// << " B: " << 1 << tagPtr.lgB
667	// << " blen: " << blen << "\n";
668	totalIBits += bLen;
669	}
670	}
671	bitPtrs.GetEndStart() = totalIBits;
672	bitPtrs.GetEndHere() = totalIBits;
673
674	if ((totalIBits + 7ul) >> 3ul > chunkMem) {
675	cerr << "totalIBits: " << totalIBits << "\n";
676	cerr << "bytes: " << ((totalIBits + 7ul) >> 3ul) << "\n";
677	cerr << "chunkMem: " << chunkMem << "\n";
678	FatalError (1, "Pointers exceed buffer size");
679	}
680	}
681
682
683
684
685	int init_ivf_2 (const TagInfo &/tagInfo/, char *filename) {
686	// read in compressed dictionary header
687	FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
688	MAGIC_STEM_BUILD, MG_ABORT);
689	idh.Read (dictFile);
690	fclose (dictFile);
691
692	// set the size of the bit ptrs
693	bitPtrs.SetSize (idh.word_dict_size, idh.tag_dict_size);
694
695	// open the chunk file and read in the maximum memory needed
696	// for the inverted memory buffer
697	chunkFile = open_file (filename, INVF_CHUNK_SUFFIX, "rb",
698	MAGIC_CHUNK, MG_ABORT);
699	ReadUL (chunkFile, ivfMemBufSize);
700	chunkBuf.attachFile (chunkFile);
701
702	// allocate memory for the inverted buffer
703	ivfMemBuf = new char [ivfMemBufSize];
704	ClearCharBuf (ivfMemBuf, ivfMemBufSize);
705
706	// read in the word dictionary
707	ReadWordDict (filename);
708
709	// read in the tag dictionary
710	ReadTagDict (filename, idh);
711
712	// read in the level information
713	ReadLevelFile (filename);
714	bool wordLevelIndex = ivfLevel.indexLevel.empty();
715
716	// set up the translation table file
717	occurConvert.Open (filename, idh.word_dict_size, idh.tag_dict_size);
718
719	// reset some globals
720	numDocs = 0;
721	numChunkDocs = 0;
722	numDocsInChunk = 0;
723	numFrags = 0;
724	numFragsInChunk = 0;
725	chunkStartFragNum = 0;
726
727	strcpy (collectFilename, filename);
728
729
730	// create the inverted file
731	mg_ullong totalIBits = 0;
732	FILE *invfFile = create_file (filename, INVF_SUFFIX, "wb",
733	MAGIC_INVF, MG_ABORT);
734	totalIBits += sizeof (unsigned long) * 8; // magic number
735	totalIBits += 8 * 200; // 200 byte gap -- why??????
736	fclose (invfFile);
737
738	// init the inverted file state cache
739	invfState.Open (filename);
740	InitInvfState (filename, idh, invfState, totalIBits, wordLevelIndex);
741
742	return COMPALLOK;
743	}
744
745	static void CloseTextTag (IP2TagInfo &tInfo, const UCArray &/tagName/) {
746	if (!tInfo.inTag) return;
747
748	// add this tag to the inverted list
749	BitPtr &tagBitPtr = bitPtrs.GetTagBitPtr (tInfo.tagNum);
750	unsigned long endFrag = numFrags;
751	int b = 1 << tagBitPtr.lgB;
752
753	/*
754	cout << (tInfo.tagNum+idh.word_dict_size) << " \"<" << tagName << ">\" "
755	<< tInfo.startFrag << " " << endFrag << "\n";
756	*/
757
758	mems_bitio_buffer buffer ((u_char *) ivfMemBuf, tagBitPtr.here);
759	buffer.bblock_encode (tInfo.startFrag - tagBitPtr.lastFragNum + 1,
760	b, NULL);
761	buffer.bblock_encode (endFrag - tInfo.startFrag + 1, b, NULL);
762	tagBitPtr.lastFragNum = endFrag;
763	tagBitPtr.here = buffer.position();
764	buffer.encodeDone();
765
766	// check for buffer overrun
767	bitPtrs.CheckTagBufOverrun (tInfo.tagNum);
768
769	// reset information about this tag
770	tInfo.inTag = false;
771	tInfo.startFrag = 0;
772	}
773
774	static void ProcessOpenTag (const TextEl &el, bool &inFrag) {
775	// close tag if open
776	IP2TagInfo &tInfo = tagMapDict[el.tagName];
777	if (tInfo.inTag) CloseTextTag (tInfo, el.tagName);
778
779	// open this tag
780	tInfo.inTag = true;
781	tInfo.startFrag = numFrags;
782
783	// check for start of next fragment
784	bool wordLevelIndex = ivfLevel.indexLevel.empty();
785	if (!wordLevelIndex && el.tagName == ivfLevel.indexLevel) {
786	++numFrags;
787	inFrag = true;
788	}
789	}
790
791	static void ProcessCloseTag (const TextEl &el, bool &inFrag) {
792	// check for end of fragment
793	bool wordLevelIndex = ivfLevel.indexLevel.empty();
794	if (!wordLevelIndex && el.tagName == ivfLevel.indexLevel) {
795	inFrag = false;
796	}
797
798	IP2TagInfo &tInfo = tagMapDict[el.tagName];
799	CloseTextTag (tInfo, el.tagName);
800	}
801
802	static void ProcessText (const TextEl &el, bool &inFrag) {
803	// make sure this text is to be indexed
804	bool wordLevelIndex = ivfLevel.indexLevel.empty();
805	if (!wordLevelIndex && !inFrag) return;
806
807	const unsigned char *textHere = &(el.text[0]);
808	const unsigned char *textEnd = &(el.text[el.text.size() - 1]);
809	unsigned char mgWord[MAXSTEMLEN + 1];
810
811	if (!inaword_mgpp (textHere, textEnd))
812	ParseNonindexWord (textHere, textEnd);
813
814
815	// Alternately parse off words and non-words from the input
816
817	while (textHere <= textEnd) {
818	textHere = ParseIndexMGWord (textHere, textEnd, mgWord);
819	textHere = ParseNonindexWord (textHere, textEnd);
820
821	if (mgWord[0] > 0) {
822	if (wordLevelIndex) ++numFrags;
823
824	unsigned long wordNum = perf_hash (wordHashDict, mgWord);
825
826	/*
827	cout << wordNum << " \"";
828	cout.write (mgWord+1, *mgWord);
829	cout << "\" " << numFrags << "\n";
830	*/
831
832	// add this word to the inverted list
833	BitPtr &wordBitPtr = bitPtrs.GetWordBitPtr (wordNum);
834	unsigned long fragNum = numFrags;
835	int b = 1 << wordBitPtr.lgB;
836
837	mems_bitio_buffer buffer ((u_char *) ivfMemBuf, wordBitPtr.here);
838
839	// note: this assumes that fragments don't carry over between
840	// chunks (which they don't because all tags are closed at the
841	// end of each document and chunks are based on document
842	// boundaries), i.e. the first fragment number must be greater
843	// than the starting fragment number of the chunk.
844	if (fragNum > wordBitPtr.lastFragNum) {
845	buffer.bblock_encode ((fragNum - wordBitPtr.lastFragNum - 1) + 1,
846	b, NULL);
847	if (!wordLevelIndex) buffer.encodeBit (1); // freq = 1
848
849	} else if (!wordLevelIndex) {
850	// add one to the frequency count for this word
851	buffer.seek (buffer.position()-1);
852	buffer.encodeBit (0); // unary encoding -- last = 1
853	buffer.encodeBit (1);
854	}
855
856	wordBitPtr.lastFragNum = fragNum;
857	wordBitPtr.here = buffer.position();
858	buffer.encodeDone();
859
860	// check for buffer overrun
861	bitPtrs.CheckWordBufOverrun (wordNum);
862	}
863	}
864	}
865
866	// combine the in memory inverted buffer with the disk
867	// based inverted file
868	static void DiskMerge (char *filename) {
869	bool wordLevelIndex = ivfLevel.indexLevel.empty();
870
871	// make sure we have something to process
872	if (numChunkDocs <= 0) return;
873
874	// open the inverted file
875	FILE *invfFile = open_file (filename, INVF_SUFFIX, "rb+",
876	MAGIC_INVF, MG_ABORT);
877	random_bitio_buffer invfOutBuf (invfFile);
878
879	// set up to decode the entries in memory
880	mems_bitio_buffer memInBuf ((u_char *) ivfMemBuf, 0);
881
882	// write out the word information
883	unsigned long wordNum;
884	int b;
885	unsigned long currFragNum;
886	unsigned long delta;
887	unsigned long currFreq;
888	for (wordNum=0; wordNum<idh.word_dict_size; ++wordNum) {
889	// go to the end of the last inverted file entry
890	InvfStateRec &wordDiskState = invfState.GetRec (wordNum);
891	invfOutBuf.SEEK_X (wordDiskState.here);
892
893	// go to the start of the inverted chunk info in memory
894	BitPtr &wordBitPtr = bitPtrs.GetWordBitPtr (wordNum);
895	memInBuf.seek(wordBitPtr.start);
896
897	// decode each entry and re-write to disk
898	currFragNum = chunkStartFragNum;
899	while (memInBuf.position() < wordBitPtr.here) {
900	// decode word entry
901	b = 1 << wordBitPtr.lgB;
902	delta = memInBuf.bblock_decode (b, NULL);
903	currFragNum += delta;
904	if (!wordLevelIndex) currFreq = memInBuf.unary_decode (NULL);
905	else currFreq = 1;
906
907	// recode on disk
908	invfOutBuf.bblock_encode (currFragNum-wordDiskState.lastFragNum,
909	wordDiskState.B, NULL);
910	if (!wordLevelIndex) invfOutBuf.gamma_encode (currFreq, NULL);
911	wordDiskState.lastFragNum = currFragNum;
912	}
913
914	wordDiskState.here = invfOutBuf.TELL_X();
915	}
916
917	// write out the tag information
918	unsigned long tagNum;
919	unsigned long currTagStart;
920	unsigned long currTagEnd;
921	for (tagNum=0; tagNum<idh.tag_dict_size; ++tagNum) {
922	// go to the end of the last inverted file entry
923	InvfStateRec &tagDiskState = invfState.GetRec (tagNum+idh.word_dict_size);
924	invfOutBuf.SEEK_X (tagDiskState.here);
925
926	// go to the start of the inverted chunk info in memory
927	BitPtr &tagBitPtr = bitPtrs.GetTagBitPtr (tagNum);
928	memInBuf.seek(tagBitPtr.start);
929
930	// decode each entry and re-write to disk
931	currTagEnd = chunkStartFragNum;
932	while (memInBuf.position() < tagBitPtr.here) {
933	// decode tag entry
934	b = 1 << tagBitPtr.lgB;
935	delta = memInBuf.bblock_decode (b, NULL) - 1;
936	currTagStart = currTagEnd + delta;
937	delta = memInBuf.bblock_decode (b, NULL) - 1;
938	currTagEnd = currTagStart + delta;
939
940	// recode on disk
941	invfOutBuf.bblock_encode (currTagStart-tagDiskState.lastFragNum+1,
942	tagDiskState.B, NULL);
943	invfOutBuf.bblock_encode (currTagEnd-currTagStart+1,
944	tagDiskState.B, NULL);
945
946	tagDiskState.lastFragNum = currTagEnd;
947	}
948
949	tagDiskState.here = invfOutBuf.TELL_X();
950	}
951
952	memInBuf.done();
953
954	invfOutBuf.encodeDone();
955	fclose (invfFile);
956	}
957
958
959	int process_ivf_2 (const TagInfo &/tagInfo/, const TextElArray &doc) {
960	bool wordLevelIndex = ivfLevel.indexLevel.empty();
961	bool inFrag = false;
962	if (wordLevelIndex) inFrag = true; // unconditional
963
964	// get next chunk information if need to. the chunk information
965	// is needed before the first document is processed
966	if (numChunkDocs >= numDocsInChunk) ReadChunk (idh, wordLevelIndex);
967
968	// process each text element
969	TextElArray::const_iterator here = doc.begin();
970	TextElArray::const_iterator end = doc.end();
971	while (here != end) {
972	// process this element
973	if ((here).elType == OpenTagE) ProcessOpenTag (here, inFrag);
974	else if ((here).elType == CloseTagE) ProcessCloseTag (here, inFrag);
975	else ProcessText (*here, inFrag);
976
977	++here;
978	}
979
980	// close off any unclosed tags
981	TagMapDict::iterator tdHere = tagMapDict.begin();
982	TagMapDict::iterator tdEnd = tagMapDict.end();
983	while (tdHere != tdEnd) {
984	CloseTextTag ((tdHere).second, (tdHere).first);
985	++tdHere;
986	}
987
988	// we've processed one more document
989	++numDocs;
990	++numChunkDocs;
991
992	// merge the memory based inverted file with the one on
993	// disk if this is the end of this chunk
994	if (numChunkDocs >= numDocsInChunk) DiskMerge (collectFilename);
995
996	return COMPALLOK;
997	}
998
999
1000	static void CondenseInvfFile (char *filename, unsigned long &bytesOutput) {
1001	FILE *inInvfFile = open_file (filename, INVF_SUFFIX, "rb",
1002	MAGIC_INVF, MG_ABORT);
1003	FILE *outInvfFile = open_file (filename, INVF_SUFFIX, "rb+",
1004	MAGIC_INVF, MG_ABORT);
1005
1006	// skip the magic number
1007	fseek (outInvfFile, sizeof (unsigned long), SEEK_SET);
1008
1009	// write the inverted file header -- use defaults for most things
1010	invf_file_header ifh;
1011	ifh.no_of_words = idh.word_dict_size;
1012	ifh.no_of_tags = idh.tag_dict_size;
1013	ifh.word_level_index = (ivfLevel.indexLevel.empty()) ? 1 : 0;
1014	ifh.Write (outInvfFile);
1015
1016	bytesOutput = ftell (outInvfFile);
1017
1018	// process each meaningful byte in the file
1019	unsigned long numEntries = ifh.no_of_words + ifh.no_of_tags;
1020	unsigned long entryNum;
1021	mg_ullong lastStart = 0;
1022	for (entryNum = 0; entryNum < numEntries; ++entryNum) {
1023	InvfStateRec &stateRec = invfState.GetRec (entryNum);
1024
1025	// overrun check
1026	if (stateRec.start < lastStart)
1027	FatalError (1, "Inverted file Buffer overrun");
1028	lastStart = stateRec.start;
1029
1030	unsigned long oldEntryStart = stateRec.start >> 3;
1031	unsigned long oldEntryStartOver = stateRec.start & 7; // should be 0
1032	unsigned long oldEntryEnd = (stateRec.here + 7) >> 3; // byte after end
1033	unsigned long oldEntryEndOver = stateRec.here & 7;
1034
1035	fseek (inInvfFile, oldEntryStart, SEEK_SET);
1036
1037	stateRec.here -= stateRec.start;
1038	stateRec.start = (mg_ullong)bytesOutput * 8 + oldEntryStartOver;
1039	stateRec.here += stateRec.start;
1040	while (oldEntryStart < oldEntryEnd) {
1041	unsigned char c = getc (inInvfFile);
1042	if (oldEntryStart == oldEntryEnd - 1) {
1043	u_char ands[8] =
1044	{0xff, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe};
1045	c &= ands[oldEntryEndOver];
1046	}
1047	putc (c, outInvfFile);
1048	++bytesOutput;
1049	++oldEntryStart;
1050	}
1051	}
1052
1053	fclose (inInvfFile);
1054
1055	#ifdef __WIN32__
1056	if (_chsize (_fileno (outInvfFile), bytesOutput) != 0)
1057	Message ("Could not truncate invf.");
1058	#else
1059	ftruncate (fileno (outInvfFile), bytesOutput);
1060	#endif
1061
1062	fclose (outInvfFile);
1063	}
1064
1065	static void OutputInvfIdx (char *filename, unsigned long invfNumBytes) {
1066	FILE *invfIdxFile = create_file (filename, INVF_IDX_SUFFIX, "wb",
1067	MAGIC_INVI, MG_ABORT);
1068
1069	// process each meaningful byte in the file
1070	unsigned long numEntries = idh.word_dict_size + idh.tag_dict_size;
1071	unsigned long entryNum;
1072	for (entryNum = 0; entryNum < numEntries; ++entryNum) {
1073	InvfStateRec &stateRec = invfState.GetRec (entryNum);
1074
1075	// assumes that inverted entries start at beginning of each byte
1076	if (!WriteUL (invfIdxFile, (stateRec.start >> 3))) break;
1077	}
1078
1079	WriteUL (invfIdxFile, invfNumBytes);
1080
1081	fclose (invfIdxFile);
1082	}
1083
1084
1085	int done_ivf_2 (const TagInfo &/tagInfo/, char *filename) {
1086	// close most open files
1087	if (chunkFile != NULL) {
1088	chunkBuf.done();
1089	fclose (chunkFile);
1090	chunkFile = NULL;
1091	}
1092	occurConvert.Close();
1093
1094	// free allocated memory
1095	bitPtrs.Clear();
1096	if (ivfMemBuf != NULL) { delete [] ivfMemBuf; ivfMemBuf = NULL; }
1097	free_perf_hash (wordHashDict);
1098	wordHashDict = NULL;
1099	tagMapDict.erase (tagMapDict.begin(), tagMapDict.end());
1100
1101	// condense the inverted file and truncate it
1102	// this function also writes out the inverted header
1103	unsigned long invfNumBytes = 0;
1104	CondenseInvfFile (filename, invfNumBytes);
1105
1106	OutputInvfIdx (filename, invfNumBytes);
1107
1108	// close the rest of the open files
1109	invfState.Close ();
1110
1111	return COMPALLOK;
1112	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: