source: indexers/trunk/mgpp/text/ivf.pass2.cpp@ 20847

Last change on this file since 20847 was 19822, checked in by mdewsnip, 15 years ago

Commented out all occurrences of

#define _XOPEN_SOURCE_EXTENDED 1

This was allegedly added for compilation on Solaris, but it just causes errors for me (on the NLNZ Solaris machines).

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 31.4 KB
Line 
1/**************************************************************************
2 *
3 * ivf.pass2.cpp -- Memory efficient pass 2 inversion
4 * Copyright (C) 1999 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22#define _XOPEN_SOURCE 1
23// This was added for Solaris, but it makes things worse on Solaris for me...
24// #define _XOPEN_SOURCE_EXTENDED 1
25
26#include <stdio.h>
27
28#if defined __WIN32__
29# include <process.h>
30# include <io.h>
31# define getpid _getpid
32# define unlink _unlink
33#else
34# include <unistd.h>
35# include "non_ansi.h"
36#endif
37
38#include "UCArray.h"
39#include "sysfuncs.h"
40#include "mg_files.h"
41#include "invf.h"
42#include "mg.h"
43#include "build.h"
44#include "locallib.h"
45#include "bitio_m_random.h"
46#include "bitio_m_stdio.h"
47#include "bitio_m_mems.h"
48#include "bitio_gen.h"
49#include <stdio.h>
50#include "words.h"
51#include "messages.h"
52#include "netorder.h"
53#include "FIvfLevelInfo.h"
54#include "perf_hash.h"
55#include "string.h"
56
57#include "longlong.h"
58
59#if defined(GSDL_USE_OBJECTSPACE)
60# include <ospace\std\map>
61#elif defined(GSDL_USE_STL_H)
62# include <map.h>
63#else
64# include <map>
65#endif
66
67
68#ifdef USE_LONG_LONG
69#define SEEK_X seek_LL
70#define TELL_X tell_LL
71#else
72#define SEEK_X seek
73#define TELL_X tell
74#endif
75
76#ifndef RND_BUF_SIZE
77#define RND_BUF_SIZE 8*1024
78#endif
79
80
81static unsigned long numDocs = 0;
82static unsigned long numChunkDocs = 0;
83static unsigned long numDocsInChunk = 0;
84
85static unsigned long numFrags = 0;
86static unsigned long numFragsInChunk = 0;
87static unsigned long chunkStartFragNum = 0;
88
89
90
91
92struct BitPtr {
93 unsigned long start;
94 unsigned long here;
95 unsigned long lastFragNum;
96 unsigned long lgB;
97
98 void Clear () { start = here = lastFragNum = lgB = 0; }
99 BitPtr () { Clear(); }
100};
101
102class WordBitPtrs {
103protected:
104 unsigned long numWords;
105 unsigned long numTags;
106 unsigned long size;
107 BitPtr *wordBitPtrs;
108
109 void CheckBufOverrun (unsigned long num) {
110 if (wordBitPtrs[num].here > wordBitPtrs[num+1].start) {
111 cerr << "numDocs: " << numDocs << "\n";
112 cerr << "numChunkDocs: " << numChunkDocs << "\n";
113 cerr << "numDocsInChunk: " << numDocsInChunk << "\n";
114 cerr << "numFrags: " << numFrags << "\n";
115 cerr << "numFragsInChunk: " << numFragsInChunk << "\n";
116 cerr << "chunkStartFragNum: " << chunkStartFragNum << "\n";
117 cerr << "num: " << num << "\n";
118 cerr << "[num].start: " << wordBitPtrs[num].start << "\n";
119 cerr << "[num].here: " << wordBitPtrs[num].here << "\n";
120 cerr << "[num+1].start: " << wordBitPtrs[num+1].start << "\n";
121 FatalError (1, "Bit buffer overrun");
122 }
123 }
124
125public:
126 void Clear ();
127 WordBitPtrs () { wordBitPtrs = NULL; Clear(); }
128 ~WordBitPtrs ();
129 void SetSize (unsigned long _numWords,
130 unsigned long _numTags);
131
132 void ResetPtrs () {
133 if (wordBitPtrs == NULL) return;
134 unsigned long i;
135 for (i=0; i<size; ++i) wordBitPtrs[i].Clear();
136 }
137
138 BitPtr &GetWordBitPtr (unsigned long wordNum)
139 { return wordBitPtrs[wordNum]; }
140 unsigned long &GetWordStart (unsigned long wordNum)
141 { return wordBitPtrs[wordNum].start; }
142 unsigned long &GetWordHere (unsigned long wordNum)
143 { return wordBitPtrs[wordNum].here; }
144 void CheckWordBufOverrun (unsigned long wordNum)
145 { CheckBufOverrun (wordNum); }
146
147 BitPtr &GetTagBitPtr (unsigned long tagNum)
148 { return wordBitPtrs[tagNum + numWords]; }
149 unsigned long &GetTagStart (unsigned long tagNum)
150 { return wordBitPtrs[tagNum + numWords].start; }
151 unsigned long &GetTagHere (unsigned long tagNum)
152 { return wordBitPtrs[tagNum + numWords].here; }
153 void CheckTagBufOverrun (unsigned long tagNum)
154 { CheckBufOverrun (tagNum + numWords); }
155
156 BitPtr &GetEndBitPtr ()
157 { return wordBitPtrs[size-1]; }
158 unsigned long &GetEndStart ()
159 { return wordBitPtrs[size-1].start; }
160 unsigned long &GetEndHere ()
161 { return wordBitPtrs[size-1].here; }
162};
163
164
165struct IP2TagInfo {
166 bool inTag;
167 unsigned long startFrag;
168 unsigned long tagNum;
169
170 IP2TagInfo () {
171 inTag = false;
172 startFrag = 0;
173 tagNum = 0;
174 }
175};
176
177// maps tags to tag information
178typedef map<UCArray, IP2TagInfo, DictLTUCArray> TagMapDict;
179
180
181// class to handle the translation of occurrence order
182// to dictionary order for words and tags
183class OccurToDictConverter {
184protected:
185 unsigned long pos;
186 unsigned long val;
187 FILE *transFile;
188 random_bitio_buffer rbs;
189
190 unsigned long wordDictSize;
191 unsigned long tagDictSize;
192
193 void SeekStart ();
194 unsigned long TranslateNum (unsigned long num);
195
196public:
197 OccurToDictConverter ();
198 ~OccurToDictConverter ();
199
200 void Open (char *filename, unsigned long _wordDictSize,
201 unsigned long _tagDictSize);
202
203 // Close frees all allocated memory
204 void Close ();
205
206 unsigned long TranslateWord (unsigned long occurNum)
207 { return TranslateNum (occurNum); }
208 unsigned long TranslateTag (unsigned long occurNum)
209 { return TranslateNum (occurNum+wordDictSize); }
210};
211
212
213struct InvfStateRec {
214 mg_ullong start;
215 mg_ullong here;
216 unsigned long lastFragNum;
217 unsigned long B;
218
219 void Clear () {
220 start = here = 0;
221 lastFragNum = B = 0;
222 }
223 InvfStateRec () { Clear (); }
224};
225
226
227#define ISR_SIZE 1024
228
229class InvfStateCache {
230protected:
231 InvfStateRec recCache [ISR_SIZE];
232 unsigned long startNum;
233
234 FILE *stateFile;
235
236 void ClearCache () {
237 unsigned int i = 0;
238 for (i=0; i<ISR_SIZE; ++i) recCache[i].Clear();
239 }
240
241public:
242 InvfStateCache ();
243 ~InvfStateCache ();
244
245 void Open (char *filename);
246 void Close ();
247
248 // previous references to state records may be
249 // invalidated calling GetRec
250 InvfStateRec &GetRec (unsigned long num);
251};
252
253
254static invf_dict_header idh;
255static WordBitPtrs bitPtrs;
256
257static FILE *chunkFile = NULL;
258static stdio_bitio_buffer chunkBuf;
259
260static unsigned long ivfMemBufSize = 0;
261static char *ivfMemBuf = NULL;
262
263// word and tag dictionaries. a map is used for the tag dictionary
264// as it should never be very big (and the perfect hash function
265// sometimes has trouble with small values).
266static perf_hash_data *wordHashDict = NULL;
267static TagMapDict tagMapDict;
268
269// information about all the different levels
270static FIvfLevel ivfLevel;
271
272static OccurToDictConverter occurConvert;
273
274// information about the state of the inverted file
275static InvfStateCache invfState;
276
277static char collectFilename[512];
278
279
280void WordBitPtrs::Clear () {
281 numWords = 0;
282 numTags = 0;
283 size=0;
284 if (wordBitPtrs != NULL) delete [] wordBitPtrs;
285 wordBitPtrs = NULL;
286}
287
288WordBitPtrs::~WordBitPtrs () {
289 if (wordBitPtrs != NULL) delete [] wordBitPtrs;
290}
291
292void WordBitPtrs::SetSize (unsigned long _numWords,
293 unsigned long _numTags){
294 Clear();
295 numWords = _numWords;
296 numTags = _numTags;
297 size = numWords + numTags + 1;
298 wordBitPtrs = new BitPtr [size];
299}
300
301
302void OccurToDictConverter::SeekStart () {
303 if (transFile == NULL) return;
304 rbs.SEEK_X (sizeof (unsigned long) * 8);
305 pos = 0;
306}
307
308unsigned long OccurToDictConverter::TranslateNum (unsigned long num) {
309 if (num < pos) SeekStart ();
310 while (pos <= num) {
311 if (pos < wordDictSize)
312 val = rbs.binary_decode (wordDictSize + 1, NULL) - 1;
313 else
314 val = rbs.binary_decode (tagDictSize + 1, NULL) - 1;
315 ++pos;
316 }
317 return val;
318}
319
320OccurToDictConverter::OccurToDictConverter () {
321 pos = 0;
322 val = 0;
323 transFile = NULL;
324 wordDictSize = 0;
325 tagDictSize = 0;
326}
327
328OccurToDictConverter::~OccurToDictConverter () {
329 if (transFile != NULL) Close ();
330}
331
332void OccurToDictConverter::Open (char *filename, unsigned long _wordDictSize,
333 unsigned long _tagDictSize) {
334 if (transFile != NULL) Close ();
335
336 wordDictSize = _wordDictSize;
337 tagDictSize = _tagDictSize;
338
339 transFile = open_file (filename, INVF_CHUNK_TRANS_SUFFIX, "rb",
340 MAGIC_CHUNK_TRANS, MG_ABORT);
341 rbs.attachFile (transFile, RND_BUF_SIZE);
342 SeekStart ();
343 val = 0;
344}
345
346void OccurToDictConverter::Close () {
347 if (transFile == NULL) return;
348
349 rbs.done ();
350 fclose (transFile);
351 transFile = NULL;
352 pos = 0;
353 val = 0;
354
355 wordDictSize = 0;
356 tagDictSize = 0;
357}
358
359
360
361
362InvfStateCache::InvfStateCache () {
363 startNum = 0;
364 stateFile = NULL;
365}
366
367InvfStateCache::~InvfStateCache () {
368 if (stateFile != NULL) Close ();
369}
370
371void InvfStateCache::Open (char *filename) {
372 if (stateFile != NULL) Close();
373
374 // open the state file
375 char path[512];
376 sprintf (path, FILE_NAME_FORMAT ".%ld", get_basepath (), filename,
377 ".invf.state", (long) getpid ());
378 if (!(stateFile = fopen (path, "wb+"))) {
379 Message ("Unable to create \"%s\"", path);
380 exit (1);
381 }
382 unlink (path); // file will be deleted after it is closed
383 // reset the buffer
384 startNum = 0;
385 ClearCache();
386}
387
388void InvfStateCache::Close () {
389 if (stateFile == NULL) return;
390 fclose (stateFile);
391 stateFile = NULL;
392 startNum = 0;
393}
394
395InvfStateRec &InvfStateCache::GetRec (unsigned long num) {
396 // see if cached
397 if ((num >= startNum) && (num < startNum + ISR_SIZE))
398 return recCache[num-startNum];
399
400 // not cached, write out this lot of records and read in
401 fseek (stateFile, startNum*sizeof (InvfStateRec), SEEK_SET);
402 fwrite ((char *) recCache, sizeof (InvfStateRec), ISR_SIZE, stateFile);
403
404 // read in the new set of records
405 ClearCache ();
406 startNum = num - (num % ISR_SIZE);
407 fseek (stateFile, startNum*sizeof (InvfStateRec), SEEK_SET);
408 fread ((char *) recCache, sizeof (InvfStateRec), ISR_SIZE, stateFile);
409
410 return recCache[num-startNum];
411}
412
413
414
415static void ClearCharBuf (char *buf, unsigned long size) {
416 char *end = buf + size;
417 while (buf != end) *buf++ = 0;
418}
419
420static void ReadWordDict (char *filename) {
421 // read in the perfect hash function for the word dictionary
422 FILE *wordHashFile = open_file (filename, INVF_DICT_HASH_SUFFIX, "rb",
423 MAGIC_HASH, MG_ABORT);
424 if (!(wordHashDict = read_perf_hash_data (wordHashFile))) {
425 FatalError (1, "Unable to read in hash data for word dictionary");
426 }
427 fclose (wordHashFile);
428}
429
430static void ReadTagDict (char *filename, invf_dict_header &_idh) {
431 // open the file
432 FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
433 MAGIC_STEM_BUILD, MG_ABORT);
434
435 // seek to the start of the tag dictionary
436 fseek (dictFile, _idh.tag_dict_start, SEEK_SET);
437
438 unsigned long tagNum;
439 dict_el thisEl;
440 for (tagNum = 0; tagNum < _idh.tag_dict_size; ++tagNum) {
441 thisEl.Read (dictFile);
442 tagMapDict[thisEl.el].tagNum = tagNum;
443 }
444
445 fclose (dictFile);
446}
447
448static void ReadLevelFile (char *filename) {
449 FILE *f;
450 f = open_file (filename, INVF_LEVEL_SUFFIX, "rb",
451 MAGIC_INVF_LEVELS, MG_ABORT);
452 ivfLevel.Read (f);
453 fclose (f);
454}
455
456void CheckIntOverflow (mg_ullong totalIBits, mg_ullong lastTotalIBits) {
457 if (totalIBits < lastTotalIBits) {
458 fprintf(stderr, "ERROR: The totalIBits counter (%d byte unsigned integer) has overflowed.\n", sizeof (mg_ullong));
459 if (sizeof (mg_ullong) < 8) {
460 fprintf(stderr, " Try compiling with GCC to enable use of 8 bytes for this counter.\n");
461 }
462 fprintf(stderr, " Build aborted.\n");
463 exit(1);
464 }
465}
466
467// assumes the inverted file state file has been opened
468static void InitInvfState (char *filename,
469 invf_dict_header &_idh,
470 InvfStateCache &_invfState,
471 mg_ullong &totalIBits,
472 bool wordLevelIndex) {
473 // read in the dictionary, setting inverted state information
474 FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
475 MAGIC_STEM_BUILD, MG_ABORT);
476
477 // seek to the start of the word dictionary
478 fseek (dictFile, _idh.word_dict_start, SEEK_SET);
479
480 // add the word entries
481 word_dict_el wordEl;
482 wordEl.SetNumLevels (_idh.num_levels);
483 unsigned long dictWordNum, p;
484 mg_ullong lastTotalIBits;
485 unsigned long N = _idh.num_frags;
486 for (dictWordNum=0; dictWordNum<_idh.word_dict_size; ++dictWordNum) {
487 // lastTotalIBits is used to detect integer overflow
488 lastTotalIBits = totalIBits;
489
490 // read the next word and associated information
491 wordEl.Read (dictFile, _idh.num_levels);
492
493 // update the state record
494 p = wordEl.frag_occur;
495 InvfStateRec &wisr = _invfState.GetRec (dictWordNum);
496 wisr.start = totalIBits;
497 wisr.here = totalIBits;
498 wisr.B = BIO_Bblock_Init (N, p);
499
500 // add the length of the fragment numbers
501 totalIBits += BIO_Bblock_Bound_b (N, p, wisr.B);
502
503 // if needed, add the length of the fragment frequency information
504 if (!wordLevelIndex)
505 totalIBits += BIO_Gamma_Bound (wordEl.freq, wordEl.frag_occur);
506
507 // align next byte
508#ifdef USE_LONG_LONG
509 totalIBits = (totalIBits + 7ull) & 0xfffffffffffffff8ull;
510#else
511 totalIBits = (totalIBits + 7ul) & 0xfffffff8ul;
512#endif
513
514 CheckIntOverflow (totalIBits, lastTotalIBits);
515 }
516
517
518 // seek to the start of the tag dictionary
519 fseek (dictFile, _idh.tag_dict_start, SEEK_SET);
520
521 // add the tag entries
522 dict_el tagEl;
523 unsigned long dictTagNum;
524 N = _idh.num_frags;
525 for (dictTagNum=0; dictTagNum<_idh.tag_dict_size; ++dictTagNum) {
526 // lastTotalIBits is used to detect integer overflow
527 lastTotalIBits = totalIBits;
528
529 // read the next tag and associated information
530 tagEl.Read (dictFile);
531
532 // update the state record
533 p = tagEl.frag_occur*2;
534 InvfStateRec &tisr = _invfState.GetRec (dictTagNum + _idh.word_dict_size);
535 tisr.start = totalIBits;
536 tisr.here = totalIBits;
537 tisr.B = BIO_Bblock_Init (N+p, p);
538
539 // add the length of the fragment numbers (two numbers for each
540 // tag, one for start and one for end)
541 totalIBits += BIO_Bblock_Bound_b (N+p, p, tisr.B);
542
543 // align next byte
544#ifdef USE_LONG_LONG
545 totalIBits = (totalIBits + 7ull) & 0xfffffffffffffff8ull;
546#else
547 totalIBits = (totalIBits + 7ul) & 0xfffffff8ul;
548#endif
549
550 CheckIntOverflow (totalIBits, lastTotalIBits);
551 }
552
553 fclose (dictFile);
554}
555
556/*
557// assumes the chunk tag information has been placed in .first
558static void PrintChunkInfo (unsigned long chunkMem,
559 unsigned long numChunkWords,
560 unsigned long numChunkTags) {
561 static unsigned long chunksRead = 0;
562 ++chunksRead;
563 cout << "Chunk Number: " << chunksRead << "\n";
564 cout << "numChunkDocs " << numDocsInChunk << "\n";
565 cout << "numChunkFrags " << numFragsInChunk << "\n";
566 cout << "mem " << chunkMem << "\n";
567 cout << "numWords " << numChunkWords << "\n";
568 cout << "numTags " << numChunkTags << "\n\n";
569
570 TagMapDict::iterator tagMapHere = tagMapDict.begin();
571 TagMapDict::iterator tagMapEnd = tagMapDict.end();
572 while (tagMapHere != tagMapEnd) {
573 unsigned long tagMapNum = (*tagMapHere).second.tagNum;
574 cout << (*tagMapHere).first << " " << tagMapNum << " "
575 << bitPtrs.GetTagBitPtr(tagMapNum).here << "\n";
576 ++tagMapHere;
577 }
578}
579*/
580
581void ReadChunk (invf_dict_header &_idh, bool wordLevelIndex) {
582 // reset globals
583 numChunkDocs = 0;
584 chunkStartFragNum = numFrags;
585
586 // read in information about this chunk
587 numDocsInChunk = chunkBuf.gamma_decode (NULL) - 1;
588 if (numDocsInChunk == 0)
589 FatalError (1, "The number of docs in the current chunk is 0");
590
591 numFragsInChunk = chunkBuf.gamma_decode (NULL) - 1;
592 unsigned long chunkMem = chunkBuf.gamma_decode (NULL) - 1;
593
594 if (chunkMem > ivfMemBufSize)
595 FatalError (1, "Chunk memory size is greater than maximum");
596
597 unsigned long numChunkWords = chunkBuf.gamma_decode (NULL) - 1;
598 unsigned long numChunkTags = chunkBuf.gamma_decode (NULL) - 1;
599
600
601 // reset stuff
602 ClearCharBuf (ivfMemBuf, ivfMemBufSize);
603 bitPtrs.ResetPtrs();
604
605 // read in the entries in occurrence order storing the
606 // "chunkWordCount" in "start" and the "chunkFragCount"
607 // in "here"
608 unsigned long numOccur;
609 unsigned long wordNum;
610 for (numOccur=0; numOccur<numChunkWords; ++numOccur) {
611 wordNum = occurConvert.TranslateWord (numOccur);
612 BitPtr &wordPtr = bitPtrs.GetWordBitPtr (wordNum);
613 wordPtr.start = chunkBuf.gamma_decode (NULL) - 1;
614 if (wordPtr.start >= 2)
615 wordPtr.here = chunkBuf.gamma_decode (NULL);
616 else wordPtr.here = wordPtr.start;
617 }
618 unsigned long tagNum;
619 for (numOccur=0; numOccur<numChunkTags; ++numOccur) {
620 tagNum = occurConvert.TranslateTag (numOccur);
621 BitPtr &tagPtr = bitPtrs.GetTagBitPtr (tagNum);
622 // only chunkFragCount is encoded for tags
623 tagPtr.start = chunkBuf.gamma_decode (NULL) - 1;
624 tagPtr.here = tagPtr.start;
625 }
626
627 /* PrintChunkInfo (chunkMem, numChunkWords, numChunkTags);*/
628
629 // create the bit ptrs in dictionary order
630 unsigned long totalIBits = 0; // only dealing with memory
631 unsigned long chunkWordCount, chunkFragCount;
632 for (wordNum=0; wordNum<_idh.word_dict_size; ++wordNum) {
633 BitPtr &wordPtr = bitPtrs.GetWordBitPtr (wordNum);
634 chunkWordCount = wordPtr.start;
635 chunkFragCount = wordPtr.here;
636 wordPtr.start = totalIBits;
637 wordPtr.here = totalIBits;
638 wordPtr.lastFragNum = chunkStartFragNum;
639 wordPtr.lgB = 0;
640 if (chunkWordCount > 0) {
641 wordPtr.lgB = floorlog_2 (BIO_Bblock_Init_W (numFragsInChunk,
642 chunkFragCount));
643 totalIBits += BIO_Bblock_Bound (numFragsInChunk, chunkFragCount);
644 // use unary encoding for memory buffer encoding of fragment freq
645 if (!wordLevelIndex) {
646 totalIBits += chunkWordCount;
647 }
648 }
649 }
650 for (tagNum=0; tagNum<_idh.tag_dict_size; ++tagNum) {
651 BitPtr &tagPtr = bitPtrs.GetTagBitPtr (tagNum);
652 chunkFragCount = tagPtr.here;
653 tagPtr.start = totalIBits;
654 tagPtr.here = totalIBits;
655 tagPtr.lastFragNum = chunkStartFragNum;
656 tagPtr.lgB = 0;
657 if (chunkFragCount > 0) {
658 unsigned long pTag = chunkFragCount*2;
659 tagPtr.lgB = floorlog_2 (BIO_Bblock_Init_W (numFragsInChunk+pTag,
660 pTag));
661 unsigned long bLen = BIO_Bblock_Bound (numFragsInChunk+pTag,
662 pTag);
663// cout << tagNum + _idh.word_dict_size << " ";
664// cout << "numFrags: " << numFragsInChunk
665// << " chunkFragCount: " << chunkFragCount
666// << " B: " << 1 << tagPtr.lgB
667// << " blen: " << blen << "\n";
668 totalIBits += bLen;
669 }
670 }
671 bitPtrs.GetEndStart() = totalIBits;
672 bitPtrs.GetEndHere() = totalIBits;
673
674 if ((totalIBits + 7ul) >> 3ul > chunkMem) {
675 cerr << "totalIBits: " << totalIBits << "\n";
676 cerr << "bytes: " << ((totalIBits + 7ul) >> 3ul) << "\n";
677 cerr << "chunkMem: " << chunkMem << "\n";
678 FatalError (1, "Pointers exceed buffer size");
679 }
680}
681
682
683
684
685int init_ivf_2 (const TagInfo &/*tagInfo*/, char *filename) {
686 // read in compressed dictionary header
687 FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
688 MAGIC_STEM_BUILD, MG_ABORT);
689 idh.Read (dictFile);
690 fclose (dictFile);
691
692 // set the size of the bit ptrs
693 bitPtrs.SetSize (idh.word_dict_size, idh.tag_dict_size);
694
695 // open the chunk file and read in the maximum memory needed
696 // for the inverted memory buffer
697 chunkFile = open_file (filename, INVF_CHUNK_SUFFIX, "rb",
698 MAGIC_CHUNK, MG_ABORT);
699 ReadUL (chunkFile, ivfMemBufSize);
700 chunkBuf.attachFile (chunkFile);
701
702 // allocate memory for the inverted buffer
703 ivfMemBuf = new char [ivfMemBufSize];
704 ClearCharBuf (ivfMemBuf, ivfMemBufSize);
705
706 // read in the word dictionary
707 ReadWordDict (filename);
708
709 // read in the tag dictionary
710 ReadTagDict (filename, idh);
711
712 // read in the level information
713 ReadLevelFile (filename);
714 bool wordLevelIndex = ivfLevel.indexLevel.empty();
715
716 // set up the translation table file
717 occurConvert.Open (filename, idh.word_dict_size, idh.tag_dict_size);
718
719 // reset some globals
720 numDocs = 0;
721 numChunkDocs = 0;
722 numDocsInChunk = 0;
723 numFrags = 0;
724 numFragsInChunk = 0;
725 chunkStartFragNum = 0;
726
727 strcpy (collectFilename, filename);
728
729
730 // create the inverted file
731 mg_ullong totalIBits = 0;
732 FILE *invfFile = create_file (filename, INVF_SUFFIX, "wb",
733 MAGIC_INVF, MG_ABORT);
734 totalIBits += sizeof (unsigned long) * 8; // magic number
735 totalIBits += 8 * 200; // 200 byte gap -- why??????
736 fclose (invfFile);
737
738 // init the inverted file state cache
739 invfState.Open (filename);
740 InitInvfState (filename, idh, invfState, totalIBits, wordLevelIndex);
741
742 return COMPALLOK;
743}
744
745static void CloseTextTag (IP2TagInfo &tInfo, const UCArray &/*tagName*/) {
746 if (!tInfo.inTag) return;
747
748 // add this tag to the inverted list
749 BitPtr &tagBitPtr = bitPtrs.GetTagBitPtr (tInfo.tagNum);
750 unsigned long endFrag = numFrags;
751 int b = 1 << tagBitPtr.lgB;
752
753 /*
754 cout << (tInfo.tagNum+idh.word_dict_size) << " \"<" << tagName << ">\" "
755 << tInfo.startFrag << " " << endFrag << "\n";
756 */
757
758 mems_bitio_buffer buffer ((u_char *) ivfMemBuf, tagBitPtr.here);
759 buffer.bblock_encode (tInfo.startFrag - tagBitPtr.lastFragNum + 1,
760 b, NULL);
761 buffer.bblock_encode (endFrag - tInfo.startFrag + 1, b, NULL);
762 tagBitPtr.lastFragNum = endFrag;
763 tagBitPtr.here = buffer.position();
764 buffer.encodeDone();
765
766 // check for buffer overrun
767 bitPtrs.CheckTagBufOverrun (tInfo.tagNum);
768
769 // reset information about this tag
770 tInfo.inTag = false;
771 tInfo.startFrag = 0;
772}
773
774static void ProcessOpenTag (const TextEl &el, bool &inFrag) {
775 // close tag if open
776 IP2TagInfo &tInfo = tagMapDict[el.tagName];
777 if (tInfo.inTag) CloseTextTag (tInfo, el.tagName);
778
779 // open this tag
780 tInfo.inTag = true;
781 tInfo.startFrag = numFrags;
782
783 // check for start of next fragment
784 bool wordLevelIndex = ivfLevel.indexLevel.empty();
785 if (!wordLevelIndex && el.tagName == ivfLevel.indexLevel) {
786 ++numFrags;
787 inFrag = true;
788 }
789}
790
791static void ProcessCloseTag (const TextEl &el, bool &inFrag) {
792 // check for end of fragment
793 bool wordLevelIndex = ivfLevel.indexLevel.empty();
794 if (!wordLevelIndex && el.tagName == ivfLevel.indexLevel) {
795 inFrag = false;
796 }
797
798 IP2TagInfo &tInfo = tagMapDict[el.tagName];
799 CloseTextTag (tInfo, el.tagName);
800}
801
802static void ProcessText (const TextEl &el, bool &inFrag) {
803 // make sure this text is to be indexed
804 bool wordLevelIndex = ivfLevel.indexLevel.empty();
805 if (!wordLevelIndex && !inFrag) return;
806
807 const unsigned char *textHere = &(el.text[0]);
808 const unsigned char *textEnd = &(el.text[el.text.size() - 1]);
809 unsigned char mgWord[MAXSTEMLEN + 1];
810
811 if (!inaword_mgpp (textHere, textEnd))
812 ParseNonindexWord (textHere, textEnd);
813
814
815 // Alternately parse off words and non-words from the input
816
817 while (textHere <= textEnd) {
818 textHere = ParseIndexMGWord (textHere, textEnd, mgWord);
819 textHere = ParseNonindexWord (textHere, textEnd);
820
821 if (mgWord[0] > 0) {
822 if (wordLevelIndex) ++numFrags;
823
824 unsigned long wordNum = perf_hash (wordHashDict, mgWord);
825
826 /*
827 cout << wordNum << " \"";
828 cout.write (mgWord+1, *mgWord);
829 cout << "\" " << numFrags << "\n";
830 */
831
832 // add this word to the inverted list
833 BitPtr &wordBitPtr = bitPtrs.GetWordBitPtr (wordNum);
834 unsigned long fragNum = numFrags;
835 int b = 1 << wordBitPtr.lgB;
836
837 mems_bitio_buffer buffer ((u_char *) ivfMemBuf, wordBitPtr.here);
838
839 // note: this assumes that fragments don't carry over between
840 // chunks (which they don't because all tags are closed at the
841 // end of each document and chunks are based on document
842 // boundaries), i.e. the first fragment number must be greater
843 // than the starting fragment number of the chunk.
844 if (fragNum > wordBitPtr.lastFragNum) {
845 buffer.bblock_encode ((fragNum - wordBitPtr.lastFragNum - 1) + 1,
846 b, NULL);
847 if (!wordLevelIndex) buffer.encodeBit (1); // freq = 1
848
849 } else if (!wordLevelIndex) {
850 // add one to the frequency count for this word
851 buffer.seek (buffer.position()-1);
852 buffer.encodeBit (0); // unary encoding -- last = 1
853 buffer.encodeBit (1);
854 }
855
856 wordBitPtr.lastFragNum = fragNum;
857 wordBitPtr.here = buffer.position();
858 buffer.encodeDone();
859
860 // check for buffer overrun
861 bitPtrs.CheckWordBufOverrun (wordNum);
862 }
863 }
864}
865
866// combine the in memory inverted buffer with the disk
867// based inverted file
868static void DiskMerge (char *filename) {
869 bool wordLevelIndex = ivfLevel.indexLevel.empty();
870
871 // make sure we have something to process
872 if (numChunkDocs <= 0) return;
873
874 // open the inverted file
875 FILE *invfFile = open_file (filename, INVF_SUFFIX, "rb+",
876 MAGIC_INVF, MG_ABORT);
877 random_bitio_buffer invfOutBuf (invfFile);
878
879 // set up to decode the entries in memory
880 mems_bitio_buffer memInBuf ((u_char *) ivfMemBuf, 0);
881
882 // write out the word information
883 unsigned long wordNum;
884 int b;
885 unsigned long currFragNum;
886 unsigned long delta;
887 unsigned long currFreq;
888 for (wordNum=0; wordNum<idh.word_dict_size; ++wordNum) {
889 // go to the end of the last inverted file entry
890 InvfStateRec &wordDiskState = invfState.GetRec (wordNum);
891 invfOutBuf.SEEK_X (wordDiskState.here);
892
893 // go to the start of the inverted chunk info in memory
894 BitPtr &wordBitPtr = bitPtrs.GetWordBitPtr (wordNum);
895 memInBuf.seek(wordBitPtr.start);
896
897 // decode each entry and re-write to disk
898 currFragNum = chunkStartFragNum;
899 while (memInBuf.position() < wordBitPtr.here) {
900 // decode word entry
901 b = 1 << wordBitPtr.lgB;
902 delta = memInBuf.bblock_decode (b, NULL);
903 currFragNum += delta;
904 if (!wordLevelIndex) currFreq = memInBuf.unary_decode (NULL);
905 else currFreq = 1;
906
907 // recode on disk
908 invfOutBuf.bblock_encode (currFragNum-wordDiskState.lastFragNum,
909 wordDiskState.B, NULL);
910 if (!wordLevelIndex) invfOutBuf.gamma_encode (currFreq, NULL);
911 wordDiskState.lastFragNum = currFragNum;
912 }
913
914 wordDiskState.here = invfOutBuf.TELL_X();
915 }
916
917 // write out the tag information
918 unsigned long tagNum;
919 unsigned long currTagStart;
920 unsigned long currTagEnd;
921 for (tagNum=0; tagNum<idh.tag_dict_size; ++tagNum) {
922 // go to the end of the last inverted file entry
923 InvfStateRec &tagDiskState = invfState.GetRec (tagNum+idh.word_dict_size);
924 invfOutBuf.SEEK_X (tagDiskState.here);
925
926 // go to the start of the inverted chunk info in memory
927 BitPtr &tagBitPtr = bitPtrs.GetTagBitPtr (tagNum);
928 memInBuf.seek(tagBitPtr.start);
929
930 // decode each entry and re-write to disk
931 currTagEnd = chunkStartFragNum;
932 while (memInBuf.position() < tagBitPtr.here) {
933 // decode tag entry
934 b = 1 << tagBitPtr.lgB;
935 delta = memInBuf.bblock_decode (b, NULL) - 1;
936 currTagStart = currTagEnd + delta;
937 delta = memInBuf.bblock_decode (b, NULL) - 1;
938 currTagEnd = currTagStart + delta;
939
940 // recode on disk
941 invfOutBuf.bblock_encode (currTagStart-tagDiskState.lastFragNum+1,
942 tagDiskState.B, NULL);
943 invfOutBuf.bblock_encode (currTagEnd-currTagStart+1,
944 tagDiskState.B, NULL);
945
946 tagDiskState.lastFragNum = currTagEnd;
947 }
948
949 tagDiskState.here = invfOutBuf.TELL_X();
950 }
951
952 memInBuf.done();
953
954 invfOutBuf.encodeDone();
955 fclose (invfFile);
956}
957
958
959int process_ivf_2 (const TagInfo &/*tagInfo*/, const TextElArray &doc) {
960 bool wordLevelIndex = ivfLevel.indexLevel.empty();
961 bool inFrag = false;
962 if (wordLevelIndex) inFrag = true; // unconditional
963
964 // get next chunk information if need to. the chunk information
965 // is needed before the first document is processed
966 if (numChunkDocs >= numDocsInChunk) ReadChunk (idh, wordLevelIndex);
967
968 // process each text element
969 TextElArray::const_iterator here = doc.begin();
970 TextElArray::const_iterator end = doc.end();
971 while (here != end) {
972 // process this element
973 if ((*here).elType == OpenTagE) ProcessOpenTag (*here, inFrag);
974 else if ((*here).elType == CloseTagE) ProcessCloseTag (*here, inFrag);
975 else ProcessText (*here, inFrag);
976
977 ++here;
978 }
979
980 // close off any unclosed tags
981 TagMapDict::iterator tdHere = tagMapDict.begin();
982 TagMapDict::iterator tdEnd = tagMapDict.end();
983 while (tdHere != tdEnd) {
984 CloseTextTag ((*tdHere).second, (*tdHere).first);
985 ++tdHere;
986 }
987
988 // we've processed one more document
989 ++numDocs;
990 ++numChunkDocs;
991
992 // merge the memory based inverted file with the one on
993 // disk if this is the end of this chunk
994 if (numChunkDocs >= numDocsInChunk) DiskMerge (collectFilename);
995
996 return COMPALLOK;
997}
998
999
1000static void CondenseInvfFile (char *filename, unsigned long &bytesOutput) {
1001 FILE *inInvfFile = open_file (filename, INVF_SUFFIX, "rb",
1002 MAGIC_INVF, MG_ABORT);
1003 FILE *outInvfFile = open_file (filename, INVF_SUFFIX, "rb+",
1004 MAGIC_INVF, MG_ABORT);
1005
1006 // skip the magic number
1007 fseek (outInvfFile, sizeof (unsigned long), SEEK_SET);
1008
1009 // write the inverted file header -- use defaults for most things
1010 invf_file_header ifh;
1011 ifh.no_of_words = idh.word_dict_size;
1012 ifh.no_of_tags = idh.tag_dict_size;
1013 ifh.word_level_index = (ivfLevel.indexLevel.empty()) ? 1 : 0;
1014 ifh.Write (outInvfFile);
1015
1016 bytesOutput = ftell (outInvfFile);
1017
1018 // process each meaningful byte in the file
1019 unsigned long numEntries = ifh.no_of_words + ifh.no_of_tags;
1020 unsigned long entryNum;
1021 mg_ullong lastStart = 0;
1022 for (entryNum = 0; entryNum < numEntries; ++entryNum) {
1023 InvfStateRec &stateRec = invfState.GetRec (entryNum);
1024
1025 // overrun check
1026 if (stateRec.start < lastStart)
1027 FatalError (1, "Inverted file Buffer overrun");
1028 lastStart = stateRec.start;
1029
1030 unsigned long oldEntryStart = stateRec.start >> 3;
1031 unsigned long oldEntryStartOver = stateRec.start & 7; // should be 0
1032 unsigned long oldEntryEnd = (stateRec.here + 7) >> 3; // byte after end
1033 unsigned long oldEntryEndOver = stateRec.here & 7;
1034
1035 fseek (inInvfFile, oldEntryStart, SEEK_SET);
1036
1037 stateRec.here -= stateRec.start;
1038 stateRec.start = (mg_ullong)bytesOutput * 8 + oldEntryStartOver;
1039 stateRec.here += stateRec.start;
1040 while (oldEntryStart < oldEntryEnd) {
1041 unsigned char c = getc (inInvfFile);
1042 if (oldEntryStart == oldEntryEnd - 1) {
1043 u_char ands[8] =
1044 {0xff, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe};
1045 c &= ands[oldEntryEndOver];
1046 }
1047 putc (c, outInvfFile);
1048 ++bytesOutput;
1049 ++oldEntryStart;
1050 }
1051 }
1052
1053 fclose (inInvfFile);
1054
1055#ifdef __WIN32__
1056 if (_chsize (_fileno (outInvfFile), bytesOutput) != 0)
1057 Message ("Could not truncate invf.");
1058#else
1059 ftruncate (fileno (outInvfFile), bytesOutput);
1060#endif
1061
1062 fclose (outInvfFile);
1063}
1064
1065static void OutputInvfIdx (char *filename, unsigned long invfNumBytes) {
1066 FILE *invfIdxFile = create_file (filename, INVF_IDX_SUFFIX, "wb",
1067 MAGIC_INVI, MG_ABORT);
1068
1069 // process each meaningful byte in the file
1070 unsigned long numEntries = idh.word_dict_size + idh.tag_dict_size;
1071 unsigned long entryNum;
1072 for (entryNum = 0; entryNum < numEntries; ++entryNum) {
1073 InvfStateRec &stateRec = invfState.GetRec (entryNum);
1074
1075 // assumes that inverted entries start at beginning of each byte
1076 if (!WriteUL (invfIdxFile, (stateRec.start >> 3))) break;
1077 }
1078
1079 WriteUL (invfIdxFile, invfNumBytes);
1080
1081 fclose (invfIdxFile);
1082}
1083
1084
1085int done_ivf_2 (const TagInfo &/*tagInfo*/, char *filename) {
1086 // close most open files
1087 if (chunkFile != NULL) {
1088 chunkBuf.done();
1089 fclose (chunkFile);
1090 chunkFile = NULL;
1091 }
1092 occurConvert.Close();
1093
1094 // free allocated memory
1095 bitPtrs.Clear();
1096 if (ivfMemBuf != NULL) { delete [] ivfMemBuf; ivfMemBuf = NULL; }
1097 free_perf_hash (wordHashDict);
1098 wordHashDict = NULL;
1099 tagMapDict.erase (tagMapDict.begin(), tagMapDict.end());
1100
1101 // condense the inverted file and truncate it
1102 // this function also writes out the inverted header
1103 unsigned long invfNumBytes = 0;
1104 CondenseInvfFile (filename, invfNumBytes);
1105
1106 OutputInvfIdx (filename, invfNumBytes);
1107
1108 // close the rest of the open files
1109 invfState.Close ();
1110
1111 return COMPALLOK;
1112}
Note: See TracBrowser for help on using the repository browser.