source: main/trunk/greenstone2/common-src/indexers/mgpp/text/ivf.pass2.cpp@ 25147

Last change on this file since 25147 was 25147, checked in by kjdon, 12 years ago

merged 64_bit_Greenstone branch into trunk, rev 25139

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 31.3 KB
Line 
1/**************************************************************************
2 *
3 * ivf.pass2.cpp -- Memory efficient pass 2 inversion
4 * Copyright (C) 1999 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22#define _XOPEN_SOURCE 1
23// This was added for Solaris, but it makes things worse on Solaris for me...
24// #define _XOPEN_SOURCE_EXTENDED 1
25
26#include <stdio.h>
27
28#if defined __WIN32__
29# include <process.h>
30# include <io.h>
31# define getpid _getpid
32# define unlink _unlink
33#else
34# include <unistd.h>
35# include "non_ansi.h"
36#endif
37
38#include "UCArray.h"
39#include "sysfuncs.h"
40#include "mg_files.h"
41#include "invf.h"
42#include "mg.h"
43#include "build.h"
44#include "locallib.h"
45#include "bitio_m_random.h"
46#include "bitio_m_stdio.h"
47#include "bitio_m_mems.h"
48#include "bitio_gen.h"
49#include <stdio.h>
50#include "words.h"
51#include "messages.h"
52#include "netorder.h"
53#include "FIvfLevelInfo.h"
54#include "perf_hash.h"
55#include "string.h"
56
57#include "longlong.h"
58
59#if defined(GSDL_USE_OBJECTSPACE)
60# include <ospace\std\map>
61#elif defined(GSDL_USE_STL_H)
62# include <map.h>
63#else
64# include <map>
65#endif
66
67
68#ifdef USE_LONG_LONG
69#define SEEK_X seek_LL
70#define TELL_X tell_LL
71#else
72#define SEEK_X seek
73#define TELL_X tell
74#endif
75
76#ifndef RND_BUF_SIZE
77#define RND_BUF_SIZE 8*1024
78#endif
79
80
81static mg_u_long numDocs = 0;
82static mg_u_long numChunkDocs = 0;
83static mg_u_long numDocsInChunk = 0;
84
85static mg_u_long numFrags = 0;
86static mg_u_long numFragsInChunk = 0;
87static mg_u_long chunkStartFragNum = 0;
88
89
90
91
92struct BitPtr {
93 mg_u_long start;
94 mg_u_long here;
95 mg_u_long lastFragNum;
96 mg_u_long lgB;
97
98 void Clear () { start = here = lastFragNum = lgB = 0; }
99 BitPtr () { Clear(); }
100};
101
102class WordBitPtrs {
103protected:
104 mg_u_long numWords;
105 mg_u_long numTags;
106 mg_u_long size;
107 BitPtr *wordBitPtrs;
108
109 void CheckBufOverrun (mg_u_long num) {
110 if (wordBitPtrs[num].here > wordBitPtrs[num+1].start) {
111 cerr << "numDocs: " << numDocs << "\n";
112 cerr << "numChunkDocs: " << numChunkDocs << "\n";
113 cerr << "numDocsInChunk: " << numDocsInChunk << "\n";
114 cerr << "numFrags: " << numFrags << "\n";
115 cerr << "numFragsInChunk: " << numFragsInChunk << "\n";
116 cerr << "chunkStartFragNum: " << chunkStartFragNum << "\n";
117 cerr << "num: " << num << "\n";
118 cerr << "[num].start: " << wordBitPtrs[num].start << "\n";
119 cerr << "[num].here: " << wordBitPtrs[num].here << "\n";
120 cerr << "[num+1].start: " << wordBitPtrs[num+1].start << "\n";
121 FatalError (1, "Bit buffer overrun");
122 }
123 }
124
125public:
126 void Clear ();
127 WordBitPtrs () { wordBitPtrs = NULL; Clear(); }
128 ~WordBitPtrs ();
129 void SetSize (mg_u_long _numWords,
130 mg_u_long _numTags);
131
132 void ResetPtrs () {
133 if (wordBitPtrs == NULL) return;
134 mg_u_long i;
135 for (i=0; i<size; ++i) wordBitPtrs[i].Clear();
136 }
137
138 BitPtr &GetWordBitPtr (mg_u_long wordNum)
139 { return wordBitPtrs[wordNum]; }
140 mg_u_long &GetWordStart (mg_u_long wordNum)
141 { return wordBitPtrs[wordNum].start; }
142 mg_u_long &GetWordHere (mg_u_long wordNum)
143 { return wordBitPtrs[wordNum].here; }
144 void CheckWordBufOverrun (mg_u_long wordNum)
145 { CheckBufOverrun (wordNum); }
146
147 BitPtr &GetTagBitPtr (mg_u_long tagNum)
148 { return wordBitPtrs[tagNum + numWords]; }
149 mg_u_long &GetTagStart (mg_u_long tagNum)
150 { return wordBitPtrs[tagNum + numWords].start; }
151 mg_u_long &GetTagHere (mg_u_long tagNum)
152 { return wordBitPtrs[tagNum + numWords].here; }
153 void CheckTagBufOverrun (mg_u_long tagNum)
154 { CheckBufOverrun (tagNum + numWords); }
155
156 BitPtr &GetEndBitPtr ()
157 { return wordBitPtrs[size-1]; }
158 mg_u_long &GetEndStart ()
159 { return wordBitPtrs[size-1].start; }
160 mg_u_long &GetEndHere ()
161 { return wordBitPtrs[size-1].here; }
162};
163
164
165struct IP2TagInfo {
166 bool inTag;
167 mg_u_long startFrag;
168 mg_u_long tagNum;
169
170 IP2TagInfo () {
171 inTag = false;
172 startFrag = 0;
173 tagNum = 0;
174 }
175};
176
177// maps tags to tag information
178typedef map<UCArray, IP2TagInfo, DictLTUCArray> TagMapDict;
179
180
181// class to handle the translation of occurrence order
182// to dictionary order for words and tags
183class OccurToDictConverter {
184protected:
185 mg_u_long pos;
186 mg_u_long val;
187 FILE *transFile;
188 random_bitio_buffer rbs;
189
190 mg_u_long wordDictSize;
191 mg_u_long tagDictSize;
192
193 void SeekStart ();
194 mg_u_long TranslateNum (mg_u_long num);
195
196public:
197 OccurToDictConverter ();
198 ~OccurToDictConverter ();
199
200 void Open (char *filename, mg_u_long _wordDictSize,
201 mg_u_long _tagDictSize);
202
203 // Close frees all allocated memory
204 void Close ();
205
206 mg_u_long TranslateWord (mg_u_long occurNum)
207 { return TranslateNum (occurNum); }
208 mg_u_long TranslateTag (mg_u_long occurNum)
209 { return TranslateNum (occurNum+wordDictSize); }
210};
211
212
213struct InvfStateRec {
214 mg_ullong start;
215 mg_ullong here;
216 mg_u_long lastFragNum;
217 mg_u_long B;
218
219 void Clear () {
220 start = here = 0;
221 lastFragNum = B = 0;
222 }
223 InvfStateRec () { Clear (); }
224};
225
226
227#define ISR_SIZE 1024
228
229class InvfStateCache {
230protected:
231 InvfStateRec recCache [ISR_SIZE];
232 mg_u_long startNum;
233
234 FILE *stateFile;
235
236 void ClearCache () {
237 unsigned int i = 0;
238 for (i=0; i<ISR_SIZE; ++i) recCache[i].Clear();
239 }
240
241public:
242 InvfStateCache ();
243 ~InvfStateCache ();
244
245 void Open (char *filename);
246 void Close ();
247
248 // previous references to state records may be
249 // invalidated calling GetRec
250 InvfStateRec &GetRec (mg_u_long num);
251};
252
253
254static invf_dict_header idh;
255static WordBitPtrs bitPtrs;
256
257static FILE *chunkFile = NULL;
258static stdio_bitio_buffer chunkBuf;
259
260static mg_u_long ivfMemBufSize = 0;
261static char *ivfMemBuf = NULL;
262
263// word and tag dictionaries. a map is used for the tag dictionary
264// as it should never be very big (and the perfect hash function
265// sometimes has trouble with small values).
266static perf_hash_data *wordHashDict = NULL;
267static TagMapDict tagMapDict;
268
269// information about all the different levels
270static FIvfLevel ivfLevel;
271
272static OccurToDictConverter occurConvert;
273
274// information about the state of the inverted file
275static InvfStateCache invfState;
276
277static char collectFilename[512];
278
279
280void WordBitPtrs::Clear () {
281 numWords = 0;
282 numTags = 0;
283 size=0;
284 if (wordBitPtrs != NULL) delete [] wordBitPtrs;
285 wordBitPtrs = NULL;
286}
287
288WordBitPtrs::~WordBitPtrs () {
289 if (wordBitPtrs != NULL) delete [] wordBitPtrs;
290}
291
292void WordBitPtrs::SetSize (mg_u_long _numWords,
293 mg_u_long _numTags){
294 Clear();
295 numWords = _numWords;
296 numTags = _numTags;
297 size = numWords + numTags + 1;
298 wordBitPtrs = new BitPtr [size];
299}
300
301
302void OccurToDictConverter::SeekStart () {
303 if (transFile == NULL) return;
304 rbs.SEEK_X (sizeof (mg_u_long) * 8);
305 pos = 0;
306}
307
308mg_u_long OccurToDictConverter::TranslateNum (mg_u_long num) {
309 if (num < pos) SeekStart ();
310 while (pos <= num) {
311 if (pos < wordDictSize)
312 val = rbs.binary_decode (wordDictSize + 1, NULL) - 1;
313 else
314 val = rbs.binary_decode (tagDictSize + 1, NULL) - 1;
315 ++pos;
316 }
317 return val;
318}
319
320OccurToDictConverter::OccurToDictConverter () {
321 pos = 0;
322 val = 0;
323 transFile = NULL;
324 wordDictSize = 0;
325 tagDictSize = 0;
326}
327
328OccurToDictConverter::~OccurToDictConverter () {
329 if (transFile != NULL) Close ();
330}
331
332void OccurToDictConverter::Open (char *filename, mg_u_long _wordDictSize,
333 mg_u_long _tagDictSize) {
334 if (transFile != NULL) Close ();
335
336 wordDictSize = _wordDictSize;
337 tagDictSize = _tagDictSize;
338
339 transFile = open_file (filename, INVF_CHUNK_TRANS_SUFFIX, "rb",
340 MAGIC_CHUNK_TRANS, MG_ABORT);
341 rbs.attachFile (transFile, RND_BUF_SIZE);
342 SeekStart ();
343 val = 0;
344}
345
346void OccurToDictConverter::Close () {
347 if (transFile == NULL) return;
348
349 rbs.done ();
350 fclose (transFile);
351 transFile = NULL;
352 pos = 0;
353 val = 0;
354
355 wordDictSize = 0;
356 tagDictSize = 0;
357}
358
359
360
361
362InvfStateCache::InvfStateCache () {
363 startNum = 0;
364 stateFile = NULL;
365}
366
367InvfStateCache::~InvfStateCache () {
368 if (stateFile != NULL) Close ();
369}
370
371void InvfStateCache::Open (char *filename) {
372 if (stateFile != NULL) Close();
373
374 // open the state file
375 char path[512];
376 sprintf (path, FILE_NAME_FORMAT ".%d", get_basepath (), filename,
377 ".invf.state", (mg_s_long) getpid ());
378 if (!(stateFile = fopen (path, "wb+"))) {
379 Message ("Unable to create \"%s\"", path);
380 exit (1);
381 }
382 unlink (path); // file will be deleted after it is closed
383 // reset the buffer
384 startNum = 0;
385 ClearCache();
386}
387
388void InvfStateCache::Close () {
389 if (stateFile == NULL) return;
390 fclose (stateFile);
391 stateFile = NULL;
392 startNum = 0;
393}
394
395InvfStateRec &InvfStateCache::GetRec (mg_u_long num) {
396 // see if cached
397 if ((num >= startNum) && (num < startNum + ISR_SIZE))
398 return recCache[num-startNum];
399
400 // not cached, write out this lot of records and read in
401 fseek (stateFile, startNum*sizeof (InvfStateRec), SEEK_SET);
402 fwrite ((char *) recCache, sizeof (InvfStateRec), ISR_SIZE, stateFile);
403
404 // read in the new set of records
405 ClearCache ();
406 startNum = num - (num % ISR_SIZE);
407 fseek (stateFile, startNum*sizeof (InvfStateRec), SEEK_SET);
408 size_t numbytes = fread ((char *) recCache, sizeof (InvfStateRec), ISR_SIZE, stateFile);
409 if (numbytes != ISR_SIZE){
410#ifdef DEBUG
411 fprintf(stderr, "[mgpp/text/ivf.pass2.cpp L410] number of bytes read by fread does not match the requested amount\n");
412#endif
413 }
414
415 return recCache[num-startNum];
416}
417
418
419
420static void ClearCharBuf (char *buf, mg_u_long size) {
421 char *end = buf + size;
422 while (buf != end) *buf++ = 0;
423}
424
425static void ReadWordDict (char *filename) {
426 // read in the perfect hash function for the word dictionary
427 FILE *wordHashFile = open_file (filename, INVF_DICT_HASH_SUFFIX, "rb",
428 MAGIC_HASH, MG_ABORT);
429 if (!(wordHashDict = read_perf_hash_data (wordHashFile))) {
430 FatalError (1, "Unable to read in hash data for word dictionary");
431 }
432 fclose (wordHashFile);
433}
434
435static void ReadTagDict (char *filename, invf_dict_header &_idh) {
436 // open the file
437 FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
438 MAGIC_STEM_BUILD, MG_ABORT);
439
440 // seek to the start of the tag dictionary
441 fseek (dictFile, _idh.tag_dict_start, SEEK_SET);
442
443 mg_u_long tagNum;
444 dict_el thisEl;
445 for (tagNum = 0; tagNum < _idh.tag_dict_size; ++tagNum) {
446 thisEl.Read (dictFile);
447 tagMapDict[thisEl.el].tagNum = tagNum;
448 }
449
450 fclose (dictFile);
451}
452
453static void ReadLevelFile (char *filename) {
454 FILE *f;
455 f = open_file (filename, INVF_LEVEL_SUFFIX, "rb",
456 MAGIC_INVF_LEVELS, MG_ABORT);
457 ivfLevel.Read (f);
458 fclose (f);
459}
460
461void CheckIntOverflow (mg_ullong totalIBits, mg_ullong lastTotalIBits) {
462 if (totalIBits < lastTotalIBits) {
463 fprintf(stderr, "ERROR: The totalIBits counter (%lu byte unsigned integer) has overflowed.\n", sizeof (mg_ullong));
464 if (sizeof (mg_ullong) < 8) {
465 fprintf(stderr, " Try compiling with GCC to enable use of 8 bytes for this counter.\n");
466 }
467 fprintf(stderr, " Build aborted.\n");
468 exit(1);
469 }
470}
471
472// assumes the inverted file state file has been opened
473static void InitInvfState (char *filename,
474 invf_dict_header &_idh,
475 InvfStateCache &_invfState,
476 mg_ullong &totalIBits,
477 bool wordLevelIndex) {
478 // read in the dictionary, setting inverted state information
479 FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
480 MAGIC_STEM_BUILD, MG_ABORT);
481
482 // seek to the start of the word dictionary
483 fseek (dictFile, _idh.word_dict_start, SEEK_SET);
484
485 // add the word entries
486 word_dict_el wordEl;
487 wordEl.SetNumLevels (_idh.num_levels);
488 mg_u_long dictWordNum, p;
489 mg_ullong lastTotalIBits;
490 mg_u_long N = _idh.num_frags;
491 for (dictWordNum=0; dictWordNum<_idh.word_dict_size; ++dictWordNum) {
492 // lastTotalIBits is used to detect integer overflow
493 lastTotalIBits = totalIBits;
494
495 // read the next word and associated information
496 wordEl.Read (dictFile, _idh.num_levels);
497
498 // update the state record
499 p = wordEl.frag_occur;
500 InvfStateRec &wisr = _invfState.GetRec (dictWordNum);
501 wisr.start = totalIBits;
502 wisr.here = totalIBits;
503 wisr.B = BIO_Bblock_Init (N, p);
504
505 // add the length of the fragment numbers
506 totalIBits += BIO_Bblock_Bound_b (N, p, wisr.B);
507
508 // if needed, add the length of the fragment frequency information
509 if (!wordLevelIndex)
510 totalIBits += BIO_Gamma_Bound (wordEl.freq, wordEl.frag_occur);
511
512 // align next byte
513#ifdef USE_LONG_LONG
514 totalIBits = (totalIBits + 7ull) & 0xfffffffffffffff8ull;
515#else
516 totalIBits = (totalIBits + 7ul) & 0xfffffff8ul;
517#endif
518
519 CheckIntOverflow (totalIBits, lastTotalIBits);
520 }
521
522
523 // seek to the start of the tag dictionary
524 fseek (dictFile, _idh.tag_dict_start, SEEK_SET);
525
526 // add the tag entries
527 dict_el tagEl;
528 mg_u_long dictTagNum;
529 N = _idh.num_frags;
530 for (dictTagNum=0; dictTagNum<_idh.tag_dict_size; ++dictTagNum) {
531 // lastTotalIBits is used to detect integer overflow
532 lastTotalIBits = totalIBits;
533
534 // read the next tag and associated information
535 tagEl.Read (dictFile);
536
537 // update the state record
538 p = tagEl.frag_occur*2;
539 InvfStateRec &tisr = _invfState.GetRec (dictTagNum + _idh.word_dict_size);
540 tisr.start = totalIBits;
541 tisr.here = totalIBits;
542 tisr.B = BIO_Bblock_Init (N+p, p);
543
544 // add the length of the fragment numbers (two numbers for each
545 // tag, one for start and one for end)
546 totalIBits += BIO_Bblock_Bound_b (N+p, p, tisr.B);
547
548 // align next byte
549#ifdef USE_LONG_LONG
550 totalIBits = (totalIBits + 7ull) & 0xfffffffffffffff8ull;
551#else
552 totalIBits = (totalIBits + 7ul) & 0xfffffff8ul;
553#endif
554
555 CheckIntOverflow (totalIBits, lastTotalIBits);
556 }
557
558 fclose (dictFile);
559}
560
561/*
562// assumes the chunk tag information has been placed in .first
563static void PrintChunkInfo (mg_u_long chunkMem,
564 mg_u_long numChunkWords,
565 mg_u_long numChunkTags) {
566 static mg_u_long chunksRead = 0;
567 ++chunksRead;
568 cout << "Chunk Number: " << chunksRead << "\n";
569 cout << "numChunkDocs " << numDocsInChunk << "\n";
570 cout << "numChunkFrags " << numFragsInChunk << "\n";
571 cout << "mem " << chunkMem << "\n";
572 cout << "numWords " << numChunkWords << "\n";
573 cout << "numTags " << numChunkTags << "\n\n";
574
575 TagMapDict::iterator tagMapHere = tagMapDict.begin();
576 TagMapDict::iterator tagMapEnd = tagMapDict.end();
577 while (tagMapHere != tagMapEnd) {
578 mg_u_long tagMapNum = (*tagMapHere).second.tagNum;
579 cout << (*tagMapHere).first << " " << tagMapNum << " "
580 << bitPtrs.GetTagBitPtr(tagMapNum).here << "\n";
581 ++tagMapHere;
582 }
583}
584*/
585
586void ReadChunk (invf_dict_header &_idh, bool wordLevelIndex) {
587 // reset globals
588 numChunkDocs = 0;
589 chunkStartFragNum = numFrags;
590
591 // read in information about this chunk
592 numDocsInChunk = chunkBuf.gamma_decode (NULL) - 1;
593 if (numDocsInChunk == 0)
594 FatalError (1, "The number of docs in the current chunk is 0");
595
596 numFragsInChunk = chunkBuf.gamma_decode (NULL) - 1;
597 mg_u_long chunkMem = chunkBuf.gamma_decode (NULL) - 1;
598
599 if (chunkMem > ivfMemBufSize)
600 FatalError (1, "Chunk memory size is greater than maximum");
601
602 mg_u_long numChunkWords = chunkBuf.gamma_decode (NULL) - 1;
603 mg_u_long numChunkTags = chunkBuf.gamma_decode (NULL) - 1;
604
605
606 // reset stuff
607 ClearCharBuf (ivfMemBuf, ivfMemBufSize);
608 bitPtrs.ResetPtrs();
609
610 // read in the entries in occurrence order storing the
611 // "chunkWordCount" in "start" and the "chunkFragCount"
612 // in "here"
613 mg_u_long numOccur;
614 mg_u_long wordNum;
615 for (numOccur=0; numOccur<numChunkWords; ++numOccur) {
616 wordNum = occurConvert.TranslateWord (numOccur);
617 BitPtr &wordPtr = bitPtrs.GetWordBitPtr (wordNum);
618 wordPtr.start = chunkBuf.gamma_decode (NULL) - 1;
619 if (wordPtr.start >= 2)
620 wordPtr.here = chunkBuf.gamma_decode (NULL);
621 else wordPtr.here = wordPtr.start;
622 }
623 mg_u_long tagNum;
624 for (numOccur=0; numOccur<numChunkTags; ++numOccur) {
625 tagNum = occurConvert.TranslateTag (numOccur);
626 BitPtr &tagPtr = bitPtrs.GetTagBitPtr (tagNum);
627 // only chunkFragCount is encoded for tags
628 tagPtr.start = chunkBuf.gamma_decode (NULL) - 1;
629 tagPtr.here = tagPtr.start;
630 }
631
632 /* PrintChunkInfo (chunkMem, numChunkWords, numChunkTags);*/
633
634 // create the bit ptrs in dictionary order
635 mg_u_long totalIBits = 0; // only dealing with memory
636 mg_u_long chunkWordCount, chunkFragCount;
637 for (wordNum=0; wordNum<_idh.word_dict_size; ++wordNum) {
638 BitPtr &wordPtr = bitPtrs.GetWordBitPtr (wordNum);
639 chunkWordCount = wordPtr.start;
640 chunkFragCount = wordPtr.here;
641 wordPtr.start = totalIBits;
642 wordPtr.here = totalIBits;
643 wordPtr.lastFragNum = chunkStartFragNum;
644 wordPtr.lgB = 0;
645 if (chunkWordCount > 0) {
646 wordPtr.lgB = floorlog_2 (BIO_Bblock_Init_W (numFragsInChunk,
647 chunkFragCount));
648 totalIBits += BIO_Bblock_Bound (numFragsInChunk, chunkFragCount);
649 // use unary encoding for memory buffer encoding of fragment freq
650 if (!wordLevelIndex) {
651 totalIBits += chunkWordCount;
652 }
653 }
654 }
655 for (tagNum=0; tagNum<_idh.tag_dict_size; ++tagNum) {
656 BitPtr &tagPtr = bitPtrs.GetTagBitPtr (tagNum);
657 chunkFragCount = tagPtr.here;
658 tagPtr.start = totalIBits;
659 tagPtr.here = totalIBits;
660 tagPtr.lastFragNum = chunkStartFragNum;
661 tagPtr.lgB = 0;
662 if (chunkFragCount > 0) {
663 mg_u_long pTag = chunkFragCount*2;
664 tagPtr.lgB = floorlog_2 (BIO_Bblock_Init_W (numFragsInChunk+pTag,
665 pTag));
666 mg_u_long bLen = BIO_Bblock_Bound (numFragsInChunk+pTag,
667 pTag);
668// cout << tagNum + _idh.word_dict_size << " ";
669// cout << "numFrags: " << numFragsInChunk
670// << " chunkFragCount: " << chunkFragCount
671// << " B: " << 1 << tagPtr.lgB
672// << " blen: " << blen << "\n";
673 totalIBits += bLen;
674 }
675 }
676 bitPtrs.GetEndStart() = totalIBits;
677 bitPtrs.GetEndHere() = totalIBits;
678
679 if ((totalIBits + 7ul) >> 3ul > chunkMem) {
680 cerr << "totalIBits: " << totalIBits << "\n";
681 cerr << "bytes: " << ((totalIBits + 7ul) >> 3ul) << "\n";
682 cerr << "chunkMem: " << chunkMem << "\n";
683 FatalError (1, "Pointers exceed buffer size");
684 }
685}
686
687
688
689
690int init_ivf_2 (const TagInfo &/*tagInfo*/, char *filename) {
691 // read in compressed dictionary header
692 FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
693 MAGIC_STEM_BUILD, MG_ABORT);
694 idh.Read (dictFile);
695 fclose (dictFile);
696
697 // set the size of the bit ptrs
698 bitPtrs.SetSize (idh.word_dict_size, idh.tag_dict_size);
699
700 // open the chunk file and read in the maximum memory needed
701 // for the inverted memory buffer
702 chunkFile = open_file (filename, INVF_CHUNK_SUFFIX, "rb",
703 MAGIC_CHUNK, MG_ABORT);
704 ReadUL (chunkFile, ivfMemBufSize);
705 chunkBuf.attachFile (chunkFile);
706
707 // allocate memory for the inverted buffer
708 ivfMemBuf = new char [ivfMemBufSize];
709 ClearCharBuf (ivfMemBuf, ivfMemBufSize);
710
711 // read in the word dictionary
712 ReadWordDict (filename);
713
714 // read in the tag dictionary
715 ReadTagDict (filename, idh);
716
717 // read in the level information
718 ReadLevelFile (filename);
719 bool wordLevelIndex = ivfLevel.indexLevel.empty();
720
721 // set up the translation table file
722 occurConvert.Open (filename, idh.word_dict_size, idh.tag_dict_size);
723
724 // reset some globals
725 numDocs = 0;
726 numChunkDocs = 0;
727 numDocsInChunk = 0;
728 numFrags = 0;
729 numFragsInChunk = 0;
730 chunkStartFragNum = 0;
731
732 strcpy (collectFilename, filename);
733
734
735 // create the inverted file
736 mg_ullong totalIBits = 0;
737 FILE *invfFile = create_file (filename, INVF_SUFFIX, "wb",
738 MAGIC_INVF, MG_ABORT);
739 totalIBits += sizeof (mg_u_long) * 8; // magic number
740 totalIBits += 8 * 200; // 200 byte gap -- why??????
741 fclose (invfFile);
742
743 // init the inverted file state cache
744 invfState.Open (filename);
745 InitInvfState (filename, idh, invfState, totalIBits, wordLevelIndex);
746
747 return COMPALLOK;
748}
749
750static void CloseTextTag (IP2TagInfo &tInfo, const UCArray &/*tagName*/) {
751 if (!tInfo.inTag) return;
752
753 // add this tag to the inverted list
754 BitPtr &tagBitPtr = bitPtrs.GetTagBitPtr (tInfo.tagNum);
755 mg_u_long endFrag = numFrags;
756 int b = 1 << tagBitPtr.lgB;
757
758 /*
759 cout << (tInfo.tagNum+idh.word_dict_size) << " \"<" << tagName << ">\" "
760 << tInfo.startFrag << " " << endFrag << "\n";
761 */
762
763 mems_bitio_buffer buffer ((u_char *) ivfMemBuf, tagBitPtr.here);
764 buffer.bblock_encode (tInfo.startFrag - tagBitPtr.lastFragNum + 1,
765 b, NULL);
766 buffer.bblock_encode (endFrag - tInfo.startFrag + 1, b, NULL);
767 tagBitPtr.lastFragNum = endFrag;
768 tagBitPtr.here = buffer.position();
769 buffer.encodeDone();
770
771 // check for buffer overrun
772 bitPtrs.CheckTagBufOverrun (tInfo.tagNum);
773
774 // reset information about this tag
775 tInfo.inTag = false;
776 tInfo.startFrag = 0;
777}
778
779static void ProcessOpenTag (const TextEl &el, bool &inFrag) {
780 // close tag if open
781 IP2TagInfo &tInfo = tagMapDict[el.tagName];
782 if (tInfo.inTag) CloseTextTag (tInfo, el.tagName);
783
784 // open this tag
785 tInfo.inTag = true;
786 tInfo.startFrag = numFrags;
787
788 // check for start of next fragment
789 bool wordLevelIndex = ivfLevel.indexLevel.empty();
790 if (!wordLevelIndex && el.tagName == ivfLevel.indexLevel) {
791 ++numFrags;
792 inFrag = true;
793 }
794}
795
796static void ProcessCloseTag (const TextEl &el, bool &inFrag) {
797 // check for end of fragment
798 bool wordLevelIndex = ivfLevel.indexLevel.empty();
799 if (!wordLevelIndex && el.tagName == ivfLevel.indexLevel) {
800 inFrag = false;
801 }
802
803 IP2TagInfo &tInfo = tagMapDict[el.tagName];
804 CloseTextTag (tInfo, el.tagName);
805}
806
807static void ProcessText (const TextEl &el, bool &inFrag) {
808 // make sure this text is to be indexed
809 bool wordLevelIndex = ivfLevel.indexLevel.empty();
810 if (!wordLevelIndex && !inFrag) return;
811
812 const unsigned char *textHere = &(el.text[0]);
813 const unsigned char *textEnd = &(el.text[el.text.size() - 1]);
814 unsigned char mgWord[MAXSTEMLEN + 1];
815
816 if (!inaword_mgpp (textHere, textEnd))
817 ParseNonindexWord (textHere, textEnd);
818
819
820 // Alternately parse off words and non-words from the input
821
822 while (textHere <= textEnd) {
823 textHere = ParseIndexMGWord (textHere, textEnd, mgWord);
824 textHere = ParseNonindexWord (textHere, textEnd);
825
826 if (mgWord[0] > 0) {
827 if (wordLevelIndex) ++numFrags;
828
829 mg_u_long wordNum = perf_hash (wordHashDict, mgWord);
830
831 /*
832 cout << wordNum << " \"";
833 cout.write (mgWord+1, *mgWord);
834 cout << "\" " << numFrags << "\n";
835 */
836
837 // add this word to the inverted list
838 BitPtr &wordBitPtr = bitPtrs.GetWordBitPtr (wordNum);
839 mg_u_long fragNum = numFrags;
840 int b = 1 << wordBitPtr.lgB;
841
842 mems_bitio_buffer buffer ((u_char *) ivfMemBuf, wordBitPtr.here);
843
844 // note: this assumes that fragments don't carry over between
845 // chunks (which they don't because all tags are closed at the
846 // end of each document and chunks are based on document
847 // boundaries), i.e. the first fragment number must be greater
848 // than the starting fragment number of the chunk.
849 if (fragNum > wordBitPtr.lastFragNum) {
850 buffer.bblock_encode ((fragNum - wordBitPtr.lastFragNum - 1) + 1,
851 b, NULL);
852 if (!wordLevelIndex) buffer.encodeBit (1); // freq = 1
853
854 } else if (!wordLevelIndex) {
855 // add one to the frequency count for this word
856 buffer.seek (buffer.position()-1);
857 buffer.encodeBit (0); // unary encoding -- last = 1
858 buffer.encodeBit (1);
859 }
860
861 wordBitPtr.lastFragNum = fragNum;
862 wordBitPtr.here = buffer.position();
863 buffer.encodeDone();
864
865 // check for buffer overrun
866 bitPtrs.CheckWordBufOverrun (wordNum);
867 }
868 }
869}
870
871// combine the in memory inverted buffer with the disk
872// based inverted file
873static void DiskMerge (char *filename) {
874 bool wordLevelIndex = ivfLevel.indexLevel.empty();
875
876 // make sure we have something to process
877 if (numChunkDocs <= 0) return;
878
879 // open the inverted file
880 FILE *invfFile = open_file (filename, INVF_SUFFIX, "rb+",
881 MAGIC_INVF, MG_ABORT);
882 random_bitio_buffer invfOutBuf (invfFile);
883
884 // set up to decode the entries in memory
885 mems_bitio_buffer memInBuf ((u_char *) ivfMemBuf, 0);
886
887 // write out the word information
888 mg_u_long wordNum;
889 int b;
890 mg_u_long currFragNum;
891 mg_u_long delta;
892 mg_u_long currFreq;
893 for (wordNum=0; wordNum<idh.word_dict_size; ++wordNum) {
894 // go to the end of the last inverted file entry
895 InvfStateRec &wordDiskState = invfState.GetRec (wordNum);
896 invfOutBuf.SEEK_X (wordDiskState.here);
897
898 // go to the start of the inverted chunk info in memory
899 BitPtr &wordBitPtr = bitPtrs.GetWordBitPtr (wordNum);
900 memInBuf.seek(wordBitPtr.start);
901
902 // decode each entry and re-write to disk
903 currFragNum = chunkStartFragNum;
904 while (memInBuf.position() < wordBitPtr.here) {
905 // decode word entry
906 b = 1 << wordBitPtr.lgB;
907 delta = memInBuf.bblock_decode (b, NULL);
908 currFragNum += delta;
909 if (!wordLevelIndex) currFreq = memInBuf.unary_decode (NULL);
910 else currFreq = 1;
911
912 // recode on disk
913 invfOutBuf.bblock_encode (currFragNum-wordDiskState.lastFragNum,
914 wordDiskState.B, NULL);
915 if (!wordLevelIndex) invfOutBuf.gamma_encode (currFreq, NULL);
916 wordDiskState.lastFragNum = currFragNum;
917 }
918
919 wordDiskState.here = invfOutBuf.TELL_X();
920 }
921
922 // write out the tag information
923 mg_u_long tagNum;
924 mg_u_long currTagStart;
925 mg_u_long currTagEnd;
926 for (tagNum=0; tagNum<idh.tag_dict_size; ++tagNum) {
927 // go to the end of the last inverted file entry
928 InvfStateRec &tagDiskState = invfState.GetRec (tagNum+idh.word_dict_size);
929 invfOutBuf.SEEK_X (tagDiskState.here);
930
931 // go to the start of the inverted chunk info in memory
932 BitPtr &tagBitPtr = bitPtrs.GetTagBitPtr (tagNum);
933 memInBuf.seek(tagBitPtr.start);
934
935 // decode each entry and re-write to disk
936 currTagEnd = chunkStartFragNum;
937 while (memInBuf.position() < tagBitPtr.here) {
938 // decode tag entry
939 b = 1 << tagBitPtr.lgB;
940 delta = memInBuf.bblock_decode (b, NULL) - 1;
941 currTagStart = currTagEnd + delta;
942 delta = memInBuf.bblock_decode (b, NULL) - 1;
943 currTagEnd = currTagStart + delta;
944
945 // recode on disk
946 invfOutBuf.bblock_encode (currTagStart-tagDiskState.lastFragNum+1,
947 tagDiskState.B, NULL);
948 invfOutBuf.bblock_encode (currTagEnd-currTagStart+1,
949 tagDiskState.B, NULL);
950
951 tagDiskState.lastFragNum = currTagEnd;
952 }
953
954 tagDiskState.here = invfOutBuf.TELL_X();
955 }
956
957 memInBuf.done();
958
959 invfOutBuf.encodeDone();
960 fclose (invfFile);
961}
962
963
964int process_ivf_2 (const TagInfo &/*tagInfo*/, const TextElArray &doc) {
965 bool wordLevelIndex = ivfLevel.indexLevel.empty();
966 bool inFrag = false;
967 if (wordLevelIndex) inFrag = true; // unconditional
968
969 // get next chunk information if need to. the chunk information
970 // is needed before the first document is processed
971 if (numChunkDocs >= numDocsInChunk) ReadChunk (idh, wordLevelIndex);
972
973 // process each text element
974 TextElArray::const_iterator here = doc.begin();
975 TextElArray::const_iterator end = doc.end();
976 while (here != end) {
977 // process this element
978 if ((*here).elType == OpenTagE) ProcessOpenTag (*here, inFrag);
979 else if ((*here).elType == CloseTagE) ProcessCloseTag (*here, inFrag);
980 else ProcessText (*here, inFrag);
981
982 ++here;
983 }
984
985 // close off any unclosed tags
986 TagMapDict::iterator tdHere = tagMapDict.begin();
987 TagMapDict::iterator tdEnd = tagMapDict.end();
988 while (tdHere != tdEnd) {
989 CloseTextTag ((*tdHere).second, (*tdHere).first);
990 ++tdHere;
991 }
992
993 // we've processed one more document
994 ++numDocs;
995 ++numChunkDocs;
996
997 // merge the memory based inverted file with the one on
998 // disk if this is the end of this chunk
999 if (numChunkDocs >= numDocsInChunk) DiskMerge (collectFilename);
1000
1001 return COMPALLOK;
1002}
1003
1004
1005static void CondenseInvfFile (char *filename, mg_u_long &bytesOutput) {
1006 FILE *inInvfFile = open_file (filename, INVF_SUFFIX, "rb",
1007 MAGIC_INVF, MG_ABORT);
1008 FILE *outInvfFile = open_file (filename, INVF_SUFFIX, "rb+",
1009 MAGIC_INVF, MG_ABORT);
1010
1011 // skip the magic number
1012 fseek (outInvfFile, sizeof (mg_u_long), SEEK_SET);
1013
1014 // write the inverted file header -- use defaults for most things
1015 invf_file_header ifh;
1016 ifh.no_of_words = idh.word_dict_size;
1017 ifh.no_of_tags = idh.tag_dict_size;
1018 ifh.word_level_index = (ivfLevel.indexLevel.empty()) ? 1 : 0;
1019 ifh.Write (outInvfFile);
1020
1021 bytesOutput = ftell (outInvfFile);
1022
1023 // process each meaningful byte in the file
1024 mg_u_long numEntries = ifh.no_of_words + ifh.no_of_tags;
1025 mg_u_long entryNum;
1026 mg_ullong lastStart = 0;
1027 for (entryNum = 0; entryNum < numEntries; ++entryNum) {
1028 InvfStateRec &stateRec = invfState.GetRec (entryNum);
1029
1030 // overrun check
1031 if (stateRec.start < lastStart)
1032 FatalError (1, "Inverted file Buffer overrun");
1033 lastStart = stateRec.start;
1034
1035 mg_u_long oldEntryStart = stateRec.start >> 3;
1036 mg_u_long oldEntryStartOver = stateRec.start & 7; // should be 0
1037 mg_u_long oldEntryEnd = (stateRec.here + 7) >> 3; // byte after end
1038 mg_u_long oldEntryEndOver = stateRec.here & 7;
1039
1040 fseek (inInvfFile, oldEntryStart, SEEK_SET);
1041
1042 stateRec.here -= stateRec.start;
1043 stateRec.start = (mg_ullong)bytesOutput * 8 + oldEntryStartOver;
1044 stateRec.here += stateRec.start;
1045 while (oldEntryStart < oldEntryEnd) {
1046 unsigned char c = getc (inInvfFile);
1047 if (oldEntryStart == oldEntryEnd - 1) {
1048 u_char ands[8] =
1049 {0xff, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe};
1050 c &= ands[oldEntryEndOver];
1051 }
1052 putc (c, outInvfFile);
1053 ++bytesOutput;
1054 ++oldEntryStart;
1055 }
1056 }
1057
1058 fclose (inInvfFile);
1059
1060#ifdef __WIN32__
1061 if (_chsize (_fileno (outInvfFile), bytesOutput) != 0)
1062 Message ("Could not truncate invf.");
1063#else
1064 if(ftruncate (fileno (outInvfFile), bytesOutput) != 0)
1065 {
1066 fprintf(stderr, "Could not truncate invf");
1067 }
1068#endif
1069
1070 fclose (outInvfFile);
1071}
1072
1073static void OutputInvfIdx (char *filename, mg_u_long invfNumBytes) {
1074 FILE *invfIdxFile = create_file (filename, INVF_IDX_SUFFIX, "wb",
1075 MAGIC_INVI, MG_ABORT);
1076
1077 // process each meaningful byte in the file
1078 mg_u_long numEntries = idh.word_dict_size + idh.tag_dict_size;
1079 mg_u_long entryNum;
1080 for (entryNum = 0; entryNum < numEntries; ++entryNum) {
1081 InvfStateRec &stateRec = invfState.GetRec (entryNum);
1082
1083 // assumes that inverted entries start at beginning of each byte
1084 if (!WriteUL (invfIdxFile, (stateRec.start >> 3))) break;
1085 }
1086
1087 WriteUL (invfIdxFile, invfNumBytes);
1088
1089 fclose (invfIdxFile);
1090}
1091
1092
1093int done_ivf_2 (const TagInfo &/*tagInfo*/, char *filename) {
1094 // close most open files
1095 if (chunkFile != NULL) {
1096 chunkBuf.done();
1097 fclose (chunkFile);
1098 chunkFile = NULL;
1099 }
1100 occurConvert.Close();
1101
1102 // free allocated memory
1103 bitPtrs.Clear();
1104 if (ivfMemBuf != NULL) { delete [] ivfMemBuf; ivfMemBuf = NULL; }
1105 free_perf_hash (wordHashDict);
1106 wordHashDict = NULL;
1107 tagMapDict.erase (tagMapDict.begin(), tagMapDict.end());
1108
1109 // condense the inverted file and truncate it
1110 // this function also writes out the inverted header
1111 mg_u_long invfNumBytes = 0;
1112 CondenseInvfFile (filename, invfNumBytes);
1113
1114 OutputInvfIdx (filename, invfNumBytes);
1115
1116 // close the rest of the open files
1117 invfState.Close ();
1118
1119 return COMPALLOK;
1120}
Note: See TracBrowser for help on using the repository browser.