source: trunk/indexers/mgpp/text/ivf.pass2.cpp@ 13715

Last change on this file since 13715 was 13715, checked in by kjdon, 17 years ago

renamed inaword, isaspace and skipspace so they don't conflict with mg versions

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 31.3 KB
Line 
1/**************************************************************************
2 *
3 * ivf.pass2.cpp -- Memory efficient pass 2 inversion
4 * Copyright (C) 1999 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22#define _XOPEN_SOURCE 1
23#define _XOPEN_SOURCE_EXTENDED 1
24
25#include <stdio.h>
26
27#if defined __WIN32__
28# include <process.h>
29# include <io.h>
30# define getpid _getpid
31# define unlink _unlink
32#else
33# include <unistd.h>
34# include "non_ansi.h"
35#endif
36
37#include "UCArray.h"
38#include "sysfuncs.h"
39#include "mg_files.h"
40#include "invf.h"
41#include "mg.h"
42#include "build.h"
43#include "locallib.h"
44#include "bitio_m_random.h"
45#include "bitio_m_stdio.h"
46#include "bitio_m_mems.h"
47#include "bitio_gen.h"
48#include <stdio.h>
49#include "words.h"
50#include "messages.h"
51#include "netorder.h"
52#include "FIvfLevelInfo.h"
53#include "perf_hash.h"
54#include "string.h"
55
56#include "longlong.h"
57
58#if defined(GSDL_USE_OBJECTSPACE)
59# include <ospace\std\map>
60#elif defined(GSDL_USE_STL_H)
61# include <map.h>
62#else
63# include <map>
64#endif
65
66
67#ifdef USE_LONG_LONG
68#define SEEK_X seek_LL
69#define TELL_X tell_LL
70#else
71#define SEEK_X seek
72#define TELL_X tell
73#endif
74
75#ifndef RND_BUF_SIZE
76#define RND_BUF_SIZE 8*1024
77#endif
78
79
80static unsigned long numDocs = 0;
81static unsigned long numChunkDocs = 0;
82static unsigned long numDocsInChunk = 0;
83
84static unsigned long numFrags = 0;
85static unsigned long numFragsInChunk = 0;
86static unsigned long chunkStartFragNum = 0;
87
88
89
90
91struct BitPtr {
92 unsigned long start;
93 unsigned long here;
94 unsigned long lastFragNum;
95 unsigned long lgB;
96
97 void Clear () { start = here = lastFragNum = lgB = 0; }
98 BitPtr () { Clear(); }
99};
100
101class WordBitPtrs {
102protected:
103 unsigned long numWords;
104 unsigned long numTags;
105 unsigned long size;
106 BitPtr *wordBitPtrs;
107
108 void CheckBufOverrun (unsigned long num) {
109 if (wordBitPtrs[num].here > wordBitPtrs[num+1].start) {
110 cerr << "numDocs: " << numDocs << "\n";
111 cerr << "numChunkDocs: " << numChunkDocs << "\n";
112 cerr << "numDocsInChunk: " << numDocsInChunk << "\n";
113 cerr << "numFrags: " << numFrags << "\n";
114 cerr << "numFragsInChunk: " << numFragsInChunk << "\n";
115 cerr << "chunkStartFragNum: " << chunkStartFragNum << "\n";
116 cerr << "num: " << num << "\n";
117 cerr << "[num].start: " << wordBitPtrs[num].start << "\n";
118 cerr << "[num].here: " << wordBitPtrs[num].here << "\n";
119 cerr << "[num+1].start: " << wordBitPtrs[num+1].start << "\n";
120 FatalError (1, "Bit buffer overrun");
121 }
122 }
123
124public:
125 void Clear ();
126 WordBitPtrs () { wordBitPtrs = NULL; Clear(); }
127 ~WordBitPtrs ();
128 void SetSize (unsigned long _numWords,
129 unsigned long _numTags);
130
131 void ResetPtrs () {
132 if (wordBitPtrs == NULL) return;
133 unsigned long i;
134 for (i=0; i<size; ++i) wordBitPtrs[i].Clear();
135 }
136
137 BitPtr &GetWordBitPtr (unsigned long wordNum)
138 { return wordBitPtrs[wordNum]; }
139 unsigned long &GetWordStart (unsigned long wordNum)
140 { return wordBitPtrs[wordNum].start; }
141 unsigned long &GetWordHere (unsigned long wordNum)
142 { return wordBitPtrs[wordNum].here; }
143 void CheckWordBufOverrun (unsigned long wordNum)
144 { CheckBufOverrun (wordNum); }
145
146 BitPtr &GetTagBitPtr (unsigned long tagNum)
147 { return wordBitPtrs[tagNum + numWords]; }
148 unsigned long &GetTagStart (unsigned long tagNum)
149 { return wordBitPtrs[tagNum + numWords].start; }
150 unsigned long &GetTagHere (unsigned long tagNum)
151 { return wordBitPtrs[tagNum + numWords].here; }
152 void CheckTagBufOverrun (unsigned long tagNum)
153 { CheckBufOverrun (tagNum + numWords); }
154
155 BitPtr &GetEndBitPtr ()
156 { return wordBitPtrs[size-1]; }
157 unsigned long &GetEndStart ()
158 { return wordBitPtrs[size-1].start; }
159 unsigned long &GetEndHere ()
160 { return wordBitPtrs[size-1].here; }
161};
162
163
164struct IP2TagInfo {
165 bool inTag;
166 unsigned long startFrag;
167 unsigned long tagNum;
168
169 IP2TagInfo () {
170 inTag = false;
171 startFrag = 0;
172 tagNum = 0;
173 }
174};
175
176// maps tags to tag information
177typedef map<UCArray, IP2TagInfo, DictLTUCArray> TagMapDict;
178
179
180// class to handle the translation of occurrence order
181// to dictionary order for words and tags
182class OccurToDictConverter {
183protected:
184 unsigned long pos;
185 unsigned long val;
186 FILE *transFile;
187 random_bitio_buffer rbs;
188
189 unsigned long wordDictSize;
190 unsigned long tagDictSize;
191
192 void SeekStart ();
193 unsigned long TranslateNum (unsigned long num);
194
195public:
196 OccurToDictConverter ();
197 ~OccurToDictConverter ();
198
199 void Open (char *filename, unsigned long _wordDictSize,
200 unsigned long _tagDictSize);
201
202 // Close frees all allocated memory
203 void Close ();
204
205 unsigned long TranslateWord (unsigned long occurNum)
206 { return TranslateNum (occurNum); }
207 unsigned long TranslateTag (unsigned long occurNum)
208 { return TranslateNum (occurNum+wordDictSize); }
209};
210
211
212struct InvfStateRec {
213 mg_ullong start;
214 mg_ullong here;
215 unsigned long lastFragNum;
216 unsigned long B;
217
218 void Clear () {
219 start = here = 0;
220 lastFragNum = B = 0;
221 }
222 InvfStateRec () { Clear (); }
223};
224
225
226#define ISR_SIZE 1024
227
228class InvfStateCache {
229protected:
230 InvfStateRec recCache [ISR_SIZE];
231 unsigned long startNum;
232
233 FILE *stateFile;
234
235 void ClearCache () {
236 unsigned int i = 0;
237 for (i=0; i<ISR_SIZE; ++i) recCache[i].Clear();
238 }
239
240public:
241 InvfStateCache ();
242 ~InvfStateCache ();
243
244 void Open (char *filename);
245 void Close ();
246
247 // previous references to state records may be
248 // invalidated calling GetRec
249 InvfStateRec &GetRec (unsigned long num);
250};
251
252
253static invf_dict_header idh;
254static WordBitPtrs bitPtrs;
255
256static FILE *chunkFile = NULL;
257static stdio_bitio_buffer chunkBuf;
258
259static unsigned long ivfMemBufSize = 0;
260static char *ivfMemBuf = NULL;
261
262// word and tag dictionaries. a map is used for the tag dictionary
263// as it should never be very big (and the perfect hash function
264// sometimes has trouble with small values).
265static perf_hash_data *wordHashDict = NULL;
266static TagMapDict tagMapDict;
267
268// information about all the different levels
269static FIvfLevel ivfLevel;
270
271static OccurToDictConverter occurConvert;
272
273// information about the state of the inverted file
274static InvfStateCache invfState;
275
276static char collectFilename[512];
277
278
279void WordBitPtrs::Clear () {
280 numWords = 0;
281 numTags = 0;
282 size=0;
283 if (wordBitPtrs != NULL) delete [] wordBitPtrs;
284 wordBitPtrs = NULL;
285}
286
287WordBitPtrs::~WordBitPtrs () {
288 if (wordBitPtrs != NULL) delete [] wordBitPtrs;
289}
290
291void WordBitPtrs::SetSize (unsigned long _numWords,
292 unsigned long _numTags){
293 Clear();
294 numWords = _numWords;
295 numTags = _numTags;
296 size = numWords + numTags + 1;
297 wordBitPtrs = new BitPtr [size];
298}
299
300
301void OccurToDictConverter::SeekStart () {
302 if (transFile == NULL) return;
303 rbs.SEEK_X (sizeof (unsigned long) * 8);
304 pos = 0;
305}
306
307unsigned long OccurToDictConverter::TranslateNum (unsigned long num) {
308 if (num < pos) SeekStart ();
309 while (pos <= num) {
310 if (pos < wordDictSize)
311 val = rbs.binary_decode (wordDictSize + 1, NULL) - 1;
312 else
313 val = rbs.binary_decode (tagDictSize + 1, NULL) - 1;
314 ++pos;
315 }
316 return val;
317}
318
319OccurToDictConverter::OccurToDictConverter () {
320 pos = 0;
321 val = 0;
322 transFile = NULL;
323 wordDictSize = 0;
324 tagDictSize = 0;
325}
326
327OccurToDictConverter::~OccurToDictConverter () {
328 if (transFile != NULL) Close ();
329}
330
331void OccurToDictConverter::Open (char *filename, unsigned long _wordDictSize,
332 unsigned long _tagDictSize) {
333 if (transFile != NULL) Close ();
334
335 wordDictSize = _wordDictSize;
336 tagDictSize = _tagDictSize;
337
338 transFile = open_file (filename, INVF_CHUNK_TRANS_SUFFIX, "rb",
339 MAGIC_CHUNK_TRANS, MG_ABORT);
340 rbs.attachFile (transFile, RND_BUF_SIZE);
341 SeekStart ();
342 val = 0;
343}
344
345void OccurToDictConverter::Close () {
346 if (transFile == NULL) return;
347
348 rbs.done ();
349 fclose (transFile);
350 transFile = NULL;
351 pos = 0;
352 val = 0;
353
354 wordDictSize = 0;
355 tagDictSize = 0;
356}
357
358
359
360
361InvfStateCache::InvfStateCache () {
362 startNum = 0;
363 stateFile = NULL;
364}
365
366InvfStateCache::~InvfStateCache () {
367 if (stateFile != NULL) Close ();
368}
369
370void InvfStateCache::Open (char *filename) {
371 if (stateFile != NULL) Close();
372
373 // open the state file
374 char path[512];
375 sprintf (path, FILE_NAME_FORMAT ".%ld", get_basepath (), filename,
376 ".invf.state", (long) getpid ());
377 if (!(stateFile = fopen (path, "wb+"))) {
378 Message ("Unable to create \"%s\"", path);
379 exit (1);
380 }
381 unlink (path); // file will be deleted after it is closed
382 // reset the buffer
383 startNum = 0;
384 ClearCache();
385}
386
387void InvfStateCache::Close () {
388 if (stateFile == NULL) return;
389 fclose (stateFile);
390 stateFile = NULL;
391 startNum = 0;
392}
393
394InvfStateRec &InvfStateCache::GetRec (unsigned long num) {
395 // see if cached
396 if ((num >= startNum) && (num < startNum + ISR_SIZE))
397 return recCache[num-startNum];
398
399 // not cached, write out this lot of records and read in
400 fseek (stateFile, startNum*sizeof (InvfStateRec), SEEK_SET);
401 fwrite ((char *) recCache, sizeof (InvfStateRec), ISR_SIZE, stateFile);
402
403 // read in the new set of records
404 ClearCache ();
405 startNum = num - (num % ISR_SIZE);
406 fseek (stateFile, startNum*sizeof (InvfStateRec), SEEK_SET);
407 fread ((char *) recCache, sizeof (InvfStateRec), ISR_SIZE, stateFile);
408
409 return recCache[num-startNum];
410}
411
412
413
414static void ClearCharBuf (char *buf, unsigned long size) {
415 char *end = buf + size;
416 while (buf != end) *buf++ = 0;
417}
418
419static void ReadWordDict (char *filename) {
420 // read in the perfect hash function for the word dictionary
421 FILE *wordHashFile = open_file (filename, INVF_DICT_HASH_SUFFIX, "rb",
422 MAGIC_HASH, MG_ABORT);
423 if (!(wordHashDict = read_perf_hash_data (wordHashFile))) {
424 FatalError (1, "Unable to read in hash data for word dictionary");
425 }
426 fclose (wordHashFile);
427}
428
429static void ReadTagDict (char *filename, invf_dict_header &_idh) {
430 // open the file
431 FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
432 MAGIC_STEM_BUILD, MG_ABORT);
433
434 // seek to the start of the tag dictionary
435 fseek (dictFile, _idh.tag_dict_start, SEEK_SET);
436
437 unsigned long tagNum;
438 dict_el thisEl;
439 for (tagNum = 0; tagNum < _idh.tag_dict_size; ++tagNum) {
440 thisEl.Read (dictFile);
441 tagMapDict[thisEl.el].tagNum = tagNum;
442 }
443
444 fclose (dictFile);
445}
446
447static void ReadLevelFile (char *filename) {
448 FILE *f;
449 f = open_file (filename, INVF_LEVEL_SUFFIX, "rb",
450 MAGIC_INVF_LEVELS, MG_ABORT);
451 ivfLevel.Read (f);
452 fclose (f);
453}
454
455void CheckIntOverflow (mg_ullong totalIBits, mg_ullong lastTotalIBits) {
456 if (totalIBits < lastTotalIBits) {
457 fprintf(stderr, "ERROR: The totalIBits counter (%d byte unsigned integer) has overflowed.\n", sizeof (mg_ullong));
458 if (sizeof (mg_ullong) < 8) {
459 fprintf(stderr, " Try compiling with GCC to enable use of 8 bytes for this counter.\n");
460 }
461 fprintf(stderr, " Build aborted.\n");
462 exit(1);
463 }
464}
465
466// assumes the inverted file state file has been opened
467static void InitInvfState (char *filename,
468 invf_dict_header &_idh,
469 InvfStateCache &_invfState,
470 mg_ullong &totalIBits,
471 bool wordLevelIndex) {
472 // read in the dictionary, setting inverted state information
473 FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
474 MAGIC_STEM_BUILD, MG_ABORT);
475
476 // seek to the start of the word dictionary
477 fseek (dictFile, _idh.word_dict_start, SEEK_SET);
478
479 // add the word entries
480 word_dict_el wordEl;
481 wordEl.SetNumLevels (_idh.num_levels);
482 unsigned long dictWordNum, p;
483 mg_ullong lastTotalIBits;
484 unsigned long N = _idh.num_frags;
485 for (dictWordNum=0; dictWordNum<_idh.word_dict_size; ++dictWordNum) {
486 // lastTotalIBits is used to detect integer overflow
487 lastTotalIBits = totalIBits;
488
489 // read the next word and associated information
490 wordEl.Read (dictFile, _idh.num_levels);
491
492 // update the state record
493 p = wordEl.frag_occur;
494 InvfStateRec &wisr = _invfState.GetRec (dictWordNum);
495 wisr.start = totalIBits;
496 wisr.here = totalIBits;
497 wisr.B = BIO_Bblock_Init (N, p);
498
499 // add the length of the fragment numbers
500 totalIBits += BIO_Bblock_Bound_b (N, p, wisr.B);
501
502 // if needed, add the length of the fragment frequency information
503 if (!wordLevelIndex)
504 totalIBits += BIO_Gamma_Bound (wordEl.freq, wordEl.frag_occur);
505
506 // align next byte
507#ifdef USE_LONG_LONG
508 totalIBits = (totalIBits + 7ull) & 0xfffffffffffffff8ull;
509#else
510 totalIBits = (totalIBits + 7ul) & 0xfffffff8ul;
511#endif
512
513 CheckIntOverflow (totalIBits, lastTotalIBits);
514 }
515
516
517 // seek to the start of the tag dictionary
518 fseek (dictFile, _idh.tag_dict_start, SEEK_SET);
519
520 // add the tag entries
521 dict_el tagEl;
522 unsigned long dictTagNum;
523 N = _idh.num_frags;
524 for (dictTagNum=0; dictTagNum<_idh.tag_dict_size; ++dictTagNum) {
525 // lastTotalIBits is used to detect integer overflow
526 lastTotalIBits = totalIBits;
527
528 // read the next tag and associated information
529 tagEl.Read (dictFile);
530
531 // update the state record
532 p = tagEl.frag_occur*2;
533 InvfStateRec &tisr = _invfState.GetRec (dictTagNum + _idh.word_dict_size);
534 tisr.start = totalIBits;
535 tisr.here = totalIBits;
536 tisr.B = BIO_Bblock_Init (N+p, p);
537
538 // add the length of the fragment numbers (two numbers for each
539 // tag, one for start and one for end)
540 totalIBits += BIO_Bblock_Bound_b (N+p, p, tisr.B);
541
542 // align next byte
543#ifdef USE_LONG_LONG
544 totalIBits = (totalIBits + 7ull) & 0xfffffffffffffff8ull;
545#else
546 totalIBits = (totalIBits + 7ul) & 0xfffffff8ul;
547#endif
548
549 CheckIntOverflow (totalIBits, lastTotalIBits);
550 }
551
552 fclose (dictFile);
553}
554
555/*
556// assumes the chunk tag information has been placed in .first
557static void PrintChunkInfo (unsigned long chunkMem,
558 unsigned long numChunkWords,
559 unsigned long numChunkTags) {
560 static unsigned long chunksRead = 0;
561 ++chunksRead;
562 cout << "Chunk Number: " << chunksRead << "\n";
563 cout << "numChunkDocs " << numDocsInChunk << "\n";
564 cout << "numChunkFrags " << numFragsInChunk << "\n";
565 cout << "mem " << chunkMem << "\n";
566 cout << "numWords " << numChunkWords << "\n";
567 cout << "numTags " << numChunkTags << "\n\n";
568
569 TagMapDict::iterator tagMapHere = tagMapDict.begin();
570 TagMapDict::iterator tagMapEnd = tagMapDict.end();
571 while (tagMapHere != tagMapEnd) {
572 unsigned long tagMapNum = (*tagMapHere).second.tagNum;
573 cout << (*tagMapHere).first << " " << tagMapNum << " "
574 << bitPtrs.GetTagBitPtr(tagMapNum).here << "\n";
575 ++tagMapHere;
576 }
577}
578*/
579
580void ReadChunk (invf_dict_header &_idh, bool wordLevelIndex) {
581 // reset globals
582 numChunkDocs = 0;
583 chunkStartFragNum = numFrags;
584
585 // read in information about this chunk
586 numDocsInChunk = chunkBuf.gamma_decode (NULL) - 1;
587 if (numDocsInChunk == 0)
588 FatalError (1, "The number of docs in the current chunk is 0");
589
590 numFragsInChunk = chunkBuf.gamma_decode (NULL) - 1;
591 unsigned long chunkMem = chunkBuf.gamma_decode (NULL) - 1;
592
593 if (chunkMem > ivfMemBufSize)
594 FatalError (1, "Chunk memory size is greater than maximum");
595
596 unsigned long numChunkWords = chunkBuf.gamma_decode (NULL) - 1;
597 unsigned long numChunkTags = chunkBuf.gamma_decode (NULL) - 1;
598
599
600 // reset stuff
601 ClearCharBuf (ivfMemBuf, ivfMemBufSize);
602 bitPtrs.ResetPtrs();
603
604 // read in the entries in occurrence order storing the
605 // "chunkWordCount" in "start" and the "chunkFragCount"
606 // in "here"
607 unsigned long numOccur;
608 unsigned long wordNum;
609 for (numOccur=0; numOccur<numChunkWords; ++numOccur) {
610 wordNum = occurConvert.TranslateWord (numOccur);
611 BitPtr &wordPtr = bitPtrs.GetWordBitPtr (wordNum);
612 wordPtr.start = chunkBuf.gamma_decode (NULL) - 1;
613 if (wordPtr.start >= 2)
614 wordPtr.here = chunkBuf.gamma_decode (NULL);
615 else wordPtr.here = wordPtr.start;
616 }
617 unsigned long tagNum;
618 for (numOccur=0; numOccur<numChunkTags; ++numOccur) {
619 tagNum = occurConvert.TranslateTag (numOccur);
620 BitPtr &tagPtr = bitPtrs.GetTagBitPtr (tagNum);
621 // only chunkFragCount is encoded for tags
622 tagPtr.start = chunkBuf.gamma_decode (NULL) - 1;
623 tagPtr.here = tagPtr.start;
624 }
625
626 /* PrintChunkInfo (chunkMem, numChunkWords, numChunkTags);*/
627
628 // create the bit ptrs in dictionary order
629 unsigned long totalIBits = 0; // only dealing with memory
630 unsigned long chunkWordCount, chunkFragCount;
631 for (wordNum=0; wordNum<_idh.word_dict_size; ++wordNum) {
632 BitPtr &wordPtr = bitPtrs.GetWordBitPtr (wordNum);
633 chunkWordCount = wordPtr.start;
634 chunkFragCount = wordPtr.here;
635 wordPtr.start = totalIBits;
636 wordPtr.here = totalIBits;
637 wordPtr.lastFragNum = chunkStartFragNum;
638 wordPtr.lgB = 0;
639 if (chunkWordCount > 0) {
640 wordPtr.lgB = floorlog_2 (BIO_Bblock_Init_W (numFragsInChunk,
641 chunkFragCount));
642 totalIBits += BIO_Bblock_Bound (numFragsInChunk, chunkFragCount);
643 // use unary encoding for memory buffer encoding of fragment freq
644 if (!wordLevelIndex) {
645 totalIBits += chunkWordCount;
646 }
647 }
648 }
649 for (tagNum=0; tagNum<_idh.tag_dict_size; ++tagNum) {
650 BitPtr &tagPtr = bitPtrs.GetTagBitPtr (tagNum);
651 chunkFragCount = tagPtr.here;
652 tagPtr.start = totalIBits;
653 tagPtr.here = totalIBits;
654 tagPtr.lastFragNum = chunkStartFragNum;
655 tagPtr.lgB = 0;
656 if (chunkFragCount > 0) {
657 unsigned long pTag = chunkFragCount*2;
658 tagPtr.lgB = floorlog_2 (BIO_Bblock_Init_W (numFragsInChunk+pTag,
659 pTag));
660 unsigned long bLen = BIO_Bblock_Bound (numFragsInChunk+pTag,
661 pTag);
662// cout << tagNum + _idh.word_dict_size << " ";
663// cout << "numFrags: " << numFragsInChunk
664// << " chunkFragCount: " << chunkFragCount
665// << " B: " << 1 << tagPtr.lgB
666// << " blen: " << blen << "\n";
667 totalIBits += bLen;
668 }
669 }
670 bitPtrs.GetEndStart() = totalIBits;
671 bitPtrs.GetEndHere() = totalIBits;
672
673 if ((totalIBits + 7ul) >> 3ul > chunkMem) {
674 cerr << "totalIBits: " << totalIBits << "\n";
675 cerr << "bytes: " << ((totalIBits + 7ul) >> 3ul) << "\n";
676 cerr << "chunkMem: " << chunkMem << "\n";
677 FatalError (1, "Pointers exceed buffer size");
678 }
679}
680
681
682
683
684int init_ivf_2 (const TagInfo &/*tagInfo*/, char *filename) {
685 // read in compressed dictionary header
686 FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
687 MAGIC_STEM_BUILD, MG_ABORT);
688 idh.Read (dictFile);
689 fclose (dictFile);
690
691 // set the size of the bit ptrs
692 bitPtrs.SetSize (idh.word_dict_size, idh.tag_dict_size);
693
694 // open the chunk file and read in the maximum memory needed
695 // for the inverted memory buffer
696 chunkFile = open_file (filename, INVF_CHUNK_SUFFIX, "rb",
697 MAGIC_CHUNK, MG_ABORT);
698 ReadUL (chunkFile, ivfMemBufSize);
699 chunkBuf.attachFile (chunkFile);
700
701 // allocate memory for the inverted buffer
702 ivfMemBuf = new char [ivfMemBufSize];
703 ClearCharBuf (ivfMemBuf, ivfMemBufSize);
704
705 // read in the word dictionary
706 ReadWordDict (filename);
707
708 // read in the tag dictionary
709 ReadTagDict (filename, idh);
710
711 // read in the level information
712 ReadLevelFile (filename);
713 bool wordLevelIndex = ivfLevel.indexLevel.empty();
714
715 // set up the translation table file
716 occurConvert.Open (filename, idh.word_dict_size, idh.tag_dict_size);
717
718 // reset some globals
719 numDocs = 0;
720 numChunkDocs = 0;
721 numDocsInChunk = 0;
722 numFrags = 0;
723 numFragsInChunk = 0;
724 chunkStartFragNum = 0;
725
726 strcpy (collectFilename, filename);
727
728
729 // create the inverted file
730 mg_ullong totalIBits = 0;
731 FILE *invfFile = create_file (filename, INVF_SUFFIX, "wb",
732 MAGIC_INVF, MG_ABORT);
733 totalIBits += sizeof (unsigned long) * 8; // magic number
734 totalIBits += 8 * 200; // 200 byte gap -- why??????
735 fclose (invfFile);
736
737 // init the inverted file state cache
738 invfState.Open (filename);
739 InitInvfState (filename, idh, invfState, totalIBits, wordLevelIndex);
740
741 return COMPALLOK;
742}
743
744static void CloseTextTag (IP2TagInfo &tInfo, const UCArray &/*tagName*/) {
745 if (!tInfo.inTag) return;
746
747 // add this tag to the inverted list
748 BitPtr &tagBitPtr = bitPtrs.GetTagBitPtr (tInfo.tagNum);
749 unsigned long endFrag = numFrags;
750 int b = 1 << tagBitPtr.lgB;
751
752 /*
753 cout << (tInfo.tagNum+idh.word_dict_size) << " \"<" << tagName << ">\" "
754 << tInfo.startFrag << " " << endFrag << "\n";
755 */
756
757 mems_bitio_buffer buffer ((u_char *) ivfMemBuf, tagBitPtr.here);
758 buffer.bblock_encode (tInfo.startFrag - tagBitPtr.lastFragNum + 1,
759 b, NULL);
760 buffer.bblock_encode (endFrag - tInfo.startFrag + 1, b, NULL);
761 tagBitPtr.lastFragNum = endFrag;
762 tagBitPtr.here = buffer.position();
763 buffer.encodeDone();
764
765 // check for buffer overrun
766 bitPtrs.CheckTagBufOverrun (tInfo.tagNum);
767
768 // reset information about this tag
769 tInfo.inTag = false;
770 tInfo.startFrag = 0;
771}
772
773static void ProcessOpenTag (const TextEl &el, bool &inFrag) {
774 // close tag if open
775 IP2TagInfo &tInfo = tagMapDict[el.tagName];
776 if (tInfo.inTag) CloseTextTag (tInfo, el.tagName);
777
778 // open this tag
779 tInfo.inTag = true;
780 tInfo.startFrag = numFrags;
781
782 // check for start of next fragment
783 bool wordLevelIndex = ivfLevel.indexLevel.empty();
784 if (!wordLevelIndex && el.tagName == ivfLevel.indexLevel) {
785 ++numFrags;
786 inFrag = true;
787 }
788}
789
790static void ProcessCloseTag (const TextEl &el, bool &inFrag) {
791 // check for end of fragment
792 bool wordLevelIndex = ivfLevel.indexLevel.empty();
793 if (!wordLevelIndex && el.tagName == ivfLevel.indexLevel) {
794 inFrag = false;
795 }
796
797 IP2TagInfo &tInfo = tagMapDict[el.tagName];
798 CloseTextTag (tInfo, el.tagName);
799}
800
801static void ProcessText (const TextEl &el, bool &inFrag) {
802 // make sure this text is to be indexed
803 bool wordLevelIndex = ivfLevel.indexLevel.empty();
804 if (!wordLevelIndex && !inFrag) return;
805
806 const unsigned char *textHere = &(el.text[0]);
807 const unsigned char *textEnd = &(el.text[el.text.size() - 1]);
808 unsigned char mgWord[MAXSTEMLEN + 1];
809
810 if (!inaword_mgpp (textHere, textEnd))
811 ParseNonindexWord (textHere, textEnd);
812
813
814 // Alternately parse off words and non-words from the input
815
816 while (textHere <= textEnd) {
817 textHere = ParseIndexMGWord (textHere, textEnd, mgWord);
818 textHere = ParseNonindexWord (textHere, textEnd);
819
820 if (mgWord[0] > 0) {
821 if (wordLevelIndex) ++numFrags;
822
823 unsigned long wordNum = perf_hash (wordHashDict, mgWord);
824
825 /*
826 cout << wordNum << " \"";
827 cout.write (mgWord+1, *mgWord);
828 cout << "\" " << numFrags << "\n";
829 */
830
831 // add this word to the inverted list
832 BitPtr &wordBitPtr = bitPtrs.GetWordBitPtr (wordNum);
833 unsigned long fragNum = numFrags;
834 int b = 1 << wordBitPtr.lgB;
835
836 mems_bitio_buffer buffer ((u_char *) ivfMemBuf, wordBitPtr.here);
837
838 // note: this assumes that fragments don't carry over between
839 // chunks (which they don't because all tags are closed at the
840 // end of each document and chunks are based on document
841 // boundaries), i.e. the first fragment number must be greater
842 // than the starting fragment number of the chunk.
843 if (fragNum > wordBitPtr.lastFragNum) {
844 buffer.bblock_encode ((fragNum - wordBitPtr.lastFragNum - 1) + 1,
845 b, NULL);
846 if (!wordLevelIndex) buffer.encodeBit (1); // freq = 1
847
848 } else if (!wordLevelIndex) {
849 // add one to the frequency count for this word
850 buffer.seek (buffer.position()-1);
851 buffer.encodeBit (0); // unary encoding -- last = 1
852 buffer.encodeBit (1);
853 }
854
855 wordBitPtr.lastFragNum = fragNum;
856 wordBitPtr.here = buffer.position();
857 buffer.encodeDone();
858
859 // check for buffer overrun
860 bitPtrs.CheckWordBufOverrun (wordNum);
861 }
862 }
863}
864
865// combine the in memory inverted buffer with the disk
866// based inverted file
867static void DiskMerge (char *filename) {
868 bool wordLevelIndex = ivfLevel.indexLevel.empty();
869
870 // make sure we have something to process
871 if (numChunkDocs <= 0) return;
872
873 // open the inverted file
874 FILE *invfFile = open_file (filename, INVF_SUFFIX, "rb+",
875 MAGIC_INVF, MG_ABORT);
876 random_bitio_buffer invfOutBuf (invfFile);
877
878 // set up to decode the entries in memory
879 mems_bitio_buffer memInBuf ((u_char *) ivfMemBuf, 0);
880
881 // write out the word information
882 unsigned long wordNum;
883 int b;
884 unsigned long currFragNum;
885 unsigned long delta;
886 unsigned long currFreq;
887 for (wordNum=0; wordNum<idh.word_dict_size; ++wordNum) {
888 // go to the end of the last inverted file entry
889 InvfStateRec &wordDiskState = invfState.GetRec (wordNum);
890 invfOutBuf.SEEK_X (wordDiskState.here);
891
892 // go to the start of the inverted chunk info in memory
893 BitPtr &wordBitPtr = bitPtrs.GetWordBitPtr (wordNum);
894 memInBuf.seek(wordBitPtr.start);
895
896 // decode each entry and re-write to disk
897 currFragNum = chunkStartFragNum;
898 while (memInBuf.position() < wordBitPtr.here) {
899 // decode word entry
900 b = 1 << wordBitPtr.lgB;
901 delta = memInBuf.bblock_decode (b, NULL);
902 currFragNum += delta;
903 if (!wordLevelIndex) currFreq = memInBuf.unary_decode (NULL);
904 else currFreq = 1;
905
906 // recode on disk
907 invfOutBuf.bblock_encode (currFragNum-wordDiskState.lastFragNum,
908 wordDiskState.B, NULL);
909 if (!wordLevelIndex) invfOutBuf.gamma_encode (currFreq, NULL);
910 wordDiskState.lastFragNum = currFragNum;
911 }
912
913 wordDiskState.here = invfOutBuf.TELL_X();
914 }
915
916 // write out the tag information
917 unsigned long tagNum;
918 unsigned long currTagStart;
919 unsigned long currTagEnd;
920 for (tagNum=0; tagNum<idh.tag_dict_size; ++tagNum) {
921 // go to the end of the last inverted file entry
922 InvfStateRec &tagDiskState = invfState.GetRec (tagNum+idh.word_dict_size);
923 invfOutBuf.SEEK_X (tagDiskState.here);
924
925 // go to the start of the inverted chunk info in memory
926 BitPtr &tagBitPtr = bitPtrs.GetTagBitPtr (tagNum);
927 memInBuf.seek(tagBitPtr.start);
928
929 // decode each entry and re-write to disk
930 currTagEnd = chunkStartFragNum;
931 while (memInBuf.position() < tagBitPtr.here) {
932 // decode tag entry
933 b = 1 << tagBitPtr.lgB;
934 delta = memInBuf.bblock_decode (b, NULL) - 1;
935 currTagStart = currTagEnd + delta;
936 delta = memInBuf.bblock_decode (b, NULL) - 1;
937 currTagEnd = currTagStart + delta;
938
939 // recode on disk
940 invfOutBuf.bblock_encode (currTagStart-tagDiskState.lastFragNum+1,
941 tagDiskState.B, NULL);
942 invfOutBuf.bblock_encode (currTagEnd-currTagStart+1,
943 tagDiskState.B, NULL);
944
945 tagDiskState.lastFragNum = currTagEnd;
946 }
947
948 tagDiskState.here = invfOutBuf.TELL_X();
949 }
950
951 memInBuf.done();
952
953 invfOutBuf.encodeDone();
954 fclose (invfFile);
955}
956
957
958int process_ivf_2 (const TagInfo &/*tagInfo*/, const TextElArray &doc) {
959 bool wordLevelIndex = ivfLevel.indexLevel.empty();
960 bool inFrag = false;
961 if (wordLevelIndex) inFrag = true; // unconditional
962
963 // get next chunk information if need to. the chunk information
964 // is needed before the first document is processed
965 if (numChunkDocs >= numDocsInChunk) ReadChunk (idh, wordLevelIndex);
966
967 // process each text element
968 TextElArray::const_iterator here = doc.begin();
969 TextElArray::const_iterator end = doc.end();
970 while (here != end) {
971 // process this element
972 if ((*here).elType == OpenTagE) ProcessOpenTag (*here, inFrag);
973 else if ((*here).elType == CloseTagE) ProcessCloseTag (*here, inFrag);
974 else ProcessText (*here, inFrag);
975
976 ++here;
977 }
978
979 // close off any unclosed tags
980 TagMapDict::iterator tdHere = tagMapDict.begin();
981 TagMapDict::iterator tdEnd = tagMapDict.end();
982 while (tdHere != tdEnd) {
983 CloseTextTag ((*tdHere).second, (*tdHere).first);
984 ++tdHere;
985 }
986
987 // we've processed one more document
988 ++numDocs;
989 ++numChunkDocs;
990
991 // merge the memory based inverted file with the one on
992 // disk if this is the end of this chunk
993 if (numChunkDocs >= numDocsInChunk) DiskMerge (collectFilename);
994
995 return COMPALLOK;
996}
997
998
999static void CondenseInvfFile (char *filename, unsigned long &bytesOutput) {
1000 FILE *inInvfFile = open_file (filename, INVF_SUFFIX, "rb",
1001 MAGIC_INVF, MG_ABORT);
1002 FILE *outInvfFile = open_file (filename, INVF_SUFFIX, "rb+",
1003 MAGIC_INVF, MG_ABORT);
1004
1005 // skip the magic number
1006 fseek (outInvfFile, sizeof (unsigned long), SEEK_SET);
1007
1008 // write the inverted file header -- use defaults for most things
1009 invf_file_header ifh;
1010 ifh.no_of_words = idh.word_dict_size;
1011 ifh.no_of_tags = idh.tag_dict_size;
1012 ifh.word_level_index = (ivfLevel.indexLevel.empty()) ? 1 : 0;
1013 ifh.Write (outInvfFile);
1014
1015 bytesOutput = ftell (outInvfFile);
1016
1017 // process each meaningful byte in the file
1018 unsigned long numEntries = ifh.no_of_words + ifh.no_of_tags;
1019 unsigned long entryNum;
1020 mg_ullong lastStart = 0;
1021 for (entryNum = 0; entryNum < numEntries; ++entryNum) {
1022 InvfStateRec &stateRec = invfState.GetRec (entryNum);
1023
1024 // overrun check
1025 if (stateRec.start < lastStart)
1026 FatalError (1, "Inverted file Buffer overrun");
1027 lastStart = stateRec.start;
1028
1029 unsigned long oldEntryStart = stateRec.start >> 3;
1030 unsigned long oldEntryStartOver = stateRec.start & 7; // should be 0
1031 unsigned long oldEntryEnd = (stateRec.here + 7) >> 3; // byte after end
1032 unsigned long oldEntryEndOver = stateRec.here & 7;
1033
1034 fseek (inInvfFile, oldEntryStart, SEEK_SET);
1035
1036 stateRec.here -= stateRec.start;
1037 stateRec.start = (mg_ullong)bytesOutput * 8 + oldEntryStartOver;
1038 stateRec.here += stateRec.start;
1039 while (oldEntryStart < oldEntryEnd) {
1040 unsigned char c = getc (inInvfFile);
1041 if (oldEntryStart == oldEntryEnd - 1) {
1042 u_char ands[8] =
1043 {0xff, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe};
1044 c &= ands[oldEntryEndOver];
1045 }
1046 putc (c, outInvfFile);
1047 ++bytesOutput;
1048 ++oldEntryStart;
1049 }
1050 }
1051
1052 fclose (inInvfFile);
1053
1054#ifdef __WIN32__
1055 if (_chsize (_fileno (outInvfFile), bytesOutput) != 0)
1056 Message ("Could not truncate invf.");
1057#else
1058 ftruncate (fileno (outInvfFile), bytesOutput);
1059#endif
1060
1061 fclose (outInvfFile);
1062}
1063
1064static void OutputInvfIdx (char *filename, unsigned long invfNumBytes) {
1065 FILE *invfIdxFile = create_file (filename, INVF_IDX_SUFFIX, "wb",
1066 MAGIC_INVI, MG_ABORT);
1067
1068 // process each meaningful byte in the file
1069 unsigned long numEntries = idh.word_dict_size + idh.tag_dict_size;
1070 unsigned long entryNum;
1071 for (entryNum = 0; entryNum < numEntries; ++entryNum) {
1072 InvfStateRec &stateRec = invfState.GetRec (entryNum);
1073
1074 // assumes that inverted entries start at beginning of each byte
1075 if (!WriteUL (invfIdxFile, (stateRec.start >> 3))) break;
1076 }
1077
1078 WriteUL (invfIdxFile, invfNumBytes);
1079
1080 fclose (invfIdxFile);
1081}
1082
1083
1084int done_ivf_2 (const TagInfo &/*tagInfo*/, char *filename) {
1085 // close most open files
1086 if (chunkFile != NULL) {
1087 chunkBuf.done();
1088 fclose (chunkFile);
1089 chunkFile = NULL;
1090 }
1091 occurConvert.Close();
1092
1093 // free allocated memory
1094 bitPtrs.Clear();
1095 if (ivfMemBuf != NULL) { delete [] ivfMemBuf; ivfMemBuf = NULL; }
1096 free_perf_hash (wordHashDict);
1097 wordHashDict = NULL;
1098 tagMapDict.erase (tagMapDict.begin(), tagMapDict.end());
1099
1100 // condense the inverted file and truncate it
1101 // this function also writes out the inverted header
1102 unsigned long invfNumBytes = 0;
1103 CondenseInvfFile (filename, invfNumBytes);
1104
1105 OutputInvfIdx (filename, invfNumBytes);
1106
1107 // close the rest of the open files
1108 invfState.Close ();
1109
1110 return COMPALLOK;
1111}
Note: See TracBrowser for help on using the repository browser.