source: trunk/gsdl/src/mgpp/text/ivf.pass2.cpp@ 2443

Last change on this file since 2443 was 2443, checked in by jrm21, 23 years ago

added some #defines for proper portability.

(eg ftruncate needs _XOPEN_SOURCE_EXTENDED)

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 31.4 KB
Line 
1/**************************************************************************
2 *
3 * ivf.pass2.cpp -- Memory efficient pass 2 inversion
4 * Copyright (C) 1999 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: ivf.pass2.cpp 2443 2001-05-17 04:52:38Z jrm21 $
21 *
22 **************************************************************************/
23
24/*
25 $Log$
26 Revision 1.2 2001/05/17 04:52:38 jrm21
27 added some #defines for proper portability.
28 (eg ftruncate needs _XOPEN_SOURCE_EXTENDED)
29
30 Revision 1.1 2000/01/14 02:26:07 sjboddie
31 Rodgers new C++ mg
32
33 */
34#define _XOPEN_SOURCE 1
35#define _XOPEN_SOURCE_EXTENDED 1
36#include <stdio.h>
37#include <unistd.h>
38
39#include "non_ansi.h"
40
41#include "sysfuncs.h"
42
43#include "mg_files.h"
44#include "invf.h"
45#include "mg.h"
46#include "build.h"
47#include "locallib.h"
48#include "UCArray.h"
49#include "bitio_m_random.h"
50#include "bitio_m_stdio.h"
51#include "bitio_m_mems.h"
52#include "bitio_gen.h"
53#include <stdio.h>
54#include "words.h"
55#include "messages.h"
56#include "netorder.h"
57#include "FIvfLevelInfo.h"
58#include "perf_hash.h"
59#include "string.h"
60
61#include "longlong.h"
62
63#ifdef __WIN32_
64#include <io.h>
65#endif
66
67#if defined(GSDL_USE_OBJECTSPACE)
68# include <ospace\std\map>
69#elif defined(GSDL_USE_STL_H)
70# include <map.h>
71#else
72# include <map>
73#endif
74
75
76#ifdef USE_LONG_LONG
77#define SEEK_X seek_LL
78#define TELL_X tell_LL
79#else
80#define SEEK_X seek
81#define TELL_X tell
82#endif
83
84#ifndef RND_BUF_SIZE
85#define RND_BUF_SIZE 8*1024
86#endif
87
88
89static unsigned long numDocs = 0;
90static unsigned long numChunkDocs = 0;
91static unsigned long numDocsInChunk = 0;
92
93static unsigned long numFrags = 0;
94static unsigned long numFragsInChunk = 0;
95static unsigned long chunkStartFragNum = 0;
96
97
98
99
100struct BitPtr {
101 unsigned long start;
102 unsigned long here;
103 unsigned long lastFragNum;
104 unsigned long lgB;
105
106 void Clear () { start = here = lastFragNum = lgB = 0; }
107 BitPtr () { Clear(); }
108};
109
110class WordBitPtrs {
111protected:
112 unsigned long numWords;
113 unsigned long numTags;
114 unsigned long size;
115 BitPtr *wordBitPtrs;
116
117 void CheckBufOverrun (unsigned long num) {
118 if (wordBitPtrs[num].here > wordBitPtrs[num+1].start) {
119 cerr << "numDocs: " << numDocs << "\n";
120 cerr << "numChunkDocs: " << numChunkDocs << "\n";
121 cerr << "numDocsInChunk: " << numDocsInChunk << "\n";
122 cerr << "numFrags: " << numFrags << "\n";
123 cerr << "numFragsInChunk: " << numFragsInChunk << "\n";
124 cerr << "chunkStartFragNum: " << chunkStartFragNum << "\n";
125 cerr << "num: " << num << "\n";
126 cerr << "[num].start: " << wordBitPtrs[num].start << "\n";
127 cerr << "[num].here: " << wordBitPtrs[num].here << "\n";
128 cerr << "[num+1].start: " << wordBitPtrs[num+1].start << "\n";
129 FatalError (1, "Bit buffer overrun");
130 }
131 }
132
133public:
134 void Clear ();
135 WordBitPtrs () { wordBitPtrs = NULL; Clear(); }
136 ~WordBitPtrs ();
137 void SetSize (unsigned long _numWords,
138 unsigned long _numTags);
139
140 void ResetPtrs () {
141 if (wordBitPtrs == NULL) return;
142 unsigned long i;
143 for (i=0; i<size; i++) wordBitPtrs[i].Clear();
144 }
145
146 BitPtr &GetWordBitPtr (unsigned long wordNum)
147 { return wordBitPtrs[wordNum]; }
148 unsigned long &GetWordStart (unsigned long wordNum)
149 { return wordBitPtrs[wordNum].start; }
150 unsigned long &GetWordHere (unsigned long wordNum)
151 { return wordBitPtrs[wordNum].here; }
152 void CheckWordBufOverrun (unsigned long wordNum)
153 { CheckBufOverrun (wordNum); }
154
155 BitPtr &GetTagBitPtr (unsigned long tagNum)
156 { return wordBitPtrs[tagNum + numWords]; }
157 unsigned long &GetTagStart (unsigned long tagNum)
158 { return wordBitPtrs[tagNum + numWords].start; }
159 unsigned long &GetTagHere (unsigned long tagNum)
160 { return wordBitPtrs[tagNum + numWords].here; }
161 void CheckTagBufOverrun (unsigned long tagNum)
162 { CheckBufOverrun (tagNum + numWords); }
163
164 BitPtr &GetEndBitPtr ()
165 { return wordBitPtrs[size-1]; }
166 unsigned long &GetEndStart ()
167 { return wordBitPtrs[size-1].start; }
168 unsigned long &GetEndHere ()
169 { return wordBitPtrs[size-1].here; }
170};
171
172
173struct IP2TagInfo {
174 bool inTag;
175 unsigned long startFrag;
176 unsigned long tagNum;
177
178 IP2TagInfo () {
179 inTag = false;
180 startFrag = 0;
181 tagNum = 0;
182 }
183};
184
185// maps tags to tag information
186typedef map<UCArray, IP2TagInfo, DictLTUCArray> TagMapDict;
187
188
189// class to handle the translation of occurrence order
190// to dictionary order for words and tags
191class OccurToDictConverter {
192protected:
193 unsigned long pos;
194 unsigned long val;
195 FILE *transFile;
196 random_bitio_buffer rbs;
197
198 unsigned long wordDictSize;
199 unsigned long tagDictSize;
200
201 void SeekStart ();
202 unsigned long TranslateNum (unsigned long num);
203
204public:
205 OccurToDictConverter ();
206 ~OccurToDictConverter ();
207
208 void Open (char *filename, unsigned long _wordDictSize,
209 unsigned long _tagDictSize);
210
211 // Close frees all allocated memory
212 void Close ();
213
214 unsigned long TranslateWord (unsigned long occurNum)
215 { return TranslateNum (occurNum); }
216 unsigned long TranslateTag (unsigned long occurNum)
217 { return TranslateNum (occurNum+wordDictSize); }
218};
219
220
221struct InvfStateRec {
222 mg_ullong start;
223 mg_ullong here;
224 unsigned long lastFragNum;
225 unsigned long B;
226
227 void Clear () {
228 start = here = 0;
229 lastFragNum = B = 0;
230 }
231 InvfStateRec () { Clear (); }
232};
233
234
235#define ISR_SIZE 1024
236
237class InvfStateCache {
238protected:
239 InvfStateRec recCache [ISR_SIZE];
240 unsigned long startNum;
241
242 FILE *stateFile;
243
244 void ClearCache () {
245 unsigned int i = 0;
246 for (i=0; i<ISR_SIZE; i++) recCache[i].Clear();
247 }
248
249public:
250 InvfStateCache ();
251 ~InvfStateCache ();
252
253 void Open (char *filename);
254 void Close ();
255
256 // previous references to state records may be
257 // invalidated calling GetRec
258 InvfStateRec &GetRec (unsigned long num);
259};
260
261
262static invf_dict_header idh;
263static WordBitPtrs bitPtrs;
264
265static FILE *chunkFile = NULL;
266static stdio_bitio_buffer chunkBuf;
267
268static unsigned long ivfMemBufSize = 0;
269static char *ivfMemBuf = NULL;
270
271// word and tag dictionaries. a map is used for the tag dictionary
272// as it should never be very big (and the perfect hash function
273// sometimes has trouble with small values).
274static perf_hash_data *wordHashDict = NULL;
275static TagMapDict tagMapDict;
276
277// information about all the different levels
278static FIvfLevel ivfLevel;
279
280static OccurToDictConverter occurConvert;
281
282// information about the state of the inverted file
283static InvfStateCache invfState;
284
285static char collectFilename[512];
286
287
288void WordBitPtrs::Clear () {
289 numWords = 0;
290 numTags = 0;
291 size=0;
292 if (wordBitPtrs != NULL) delete [] wordBitPtrs;
293 wordBitPtrs = NULL;
294}
295
296WordBitPtrs::~WordBitPtrs () {
297 if (wordBitPtrs != NULL) delete [] wordBitPtrs;
298}
299
300void WordBitPtrs::SetSize (unsigned long _numWords,
301 unsigned long _numTags){
302 Clear();
303 numWords = _numWords;
304 numTags = _numTags;
305 size = numWords + numTags + 1;
306 wordBitPtrs = new BitPtr [size];
307}
308
309
310void OccurToDictConverter::SeekStart () {
311 if (transFile == NULL) return;
312 rbs.SEEK_X (sizeof (unsigned long) * 8);
313 pos = 0;
314}
315
316unsigned long OccurToDictConverter::TranslateNum (unsigned long num) {
317 if (num < pos) SeekStart ();
318 while (pos <= num) {
319 if (pos < wordDictSize)
320 val = rbs.binary_decode (wordDictSize + 1, NULL) - 1;
321 else
322 val = rbs.binary_decode (tagDictSize + 1, NULL) - 1;
323 pos++;
324 }
325 return val;
326}
327
328OccurToDictConverter::OccurToDictConverter () {
329 pos = 0;
330 val = 0;
331 transFile = NULL;
332 wordDictSize = 0;
333 tagDictSize = 0;
334}
335
336OccurToDictConverter::~OccurToDictConverter () {
337 if (transFile != NULL) Close ();
338}
339
340void OccurToDictConverter::Open (char *filename, unsigned long _wordDictSize,
341 unsigned long _tagDictSize) {
342 if (transFile != NULL) Close ();
343
344 wordDictSize = _wordDictSize;
345 tagDictSize = _tagDictSize;
346
347 transFile = open_file (filename, INVF_CHUNK_TRANS_SUFFIX, "rb",
348 MAGIC_CHUNK_TRANS, MG_ABORT);
349 rbs.attachFile (transFile, RND_BUF_SIZE);
350 SeekStart ();
351 val = 0;
352}
353
354void OccurToDictConverter::Close () {
355 if (transFile == NULL) return;
356
357 rbs.done ();
358 fclose (transFile);
359 transFile = NULL;
360 pos = 0;
361 val = 0;
362
363 wordDictSize = 0;
364 tagDictSize = 0;
365}
366
367
368
369
370InvfStateCache::InvfStateCache () {
371 startNum = 0;
372 stateFile = NULL;
373}
374
375InvfStateCache::~InvfStateCache () {
376 if (stateFile != NULL) Close ();
377}
378
379void InvfStateCache::Open (char *filename) {
380 if (stateFile != NULL) Close();
381
382 // open the state file
383 char path[512];
384 sprintf (path, FILE_NAME_FORMAT ".%ld", get_basepath (), filename,
385 ".invf.state", (long) getpid ());
386 if (!(stateFile = fopen (path, "wb+"))) {
387 Message ("Unable to create \"%s\"", path);
388 exit (1);
389 }
390 unlink (path); // file will be deleted after it is closed
391
392 // reset the buffer
393 startNum = 0;
394 ClearCache();
395}
396
397void InvfStateCache::Close () {
398 if (stateFile == NULL) return;
399 fclose (stateFile);
400 stateFile = NULL;
401 startNum = 0;
402}
403
404InvfStateRec &InvfStateCache::GetRec (unsigned long num) {
405 // see if cached
406 if ((num >= startNum) && (num < startNum + ISR_SIZE))
407 return recCache[num-startNum];
408
409 // not cached, write out this lot of records and read in
410 fseek (stateFile, startNum*sizeof (InvfStateRec), SEEK_SET);
411 fwrite ((char *) recCache, sizeof (InvfStateRec), ISR_SIZE, stateFile);
412
413 // read in the new set of records
414 ClearCache ();
415 startNum = num - (num % ISR_SIZE);
416 fseek (stateFile, startNum*sizeof (InvfStateRec), SEEK_SET);
417 fread ((char *) recCache, sizeof (InvfStateRec), ISR_SIZE, stateFile);
418
419 return recCache[num-startNum];
420}
421
422
423
424static void ClearCharBuf (char *buf, unsigned long size) {
425 char *end = buf + size;
426 while (buf != end) *buf++ = 0;
427}
428
429static void ReadWordDict (char *filename) {
430 // read in the perfect hash function for the word dictionary
431 FILE *wordHashFile = open_file (filename, INVF_DICT_HASH_SUFFIX, "rb",
432 MAGIC_HASH, MG_ABORT);
433 if (!(wordHashDict = read_perf_hash_data (wordHashFile))) {
434 FatalError (1, "Unable to read in hash data for word dictionary");
435 }
436 fclose (wordHashFile);
437}
438
439static void ReadTagDict (char *filename, invf_dict_header &_idh) {
440 // open the file
441 FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
442 MAGIC_STEM_BUILD, MG_ABORT);
443
444 // seek to the start of the tag dictionary
445 fseek (dictFile, _idh.tag_dict_start, SEEK_SET);
446
447 unsigned long tagNum;
448 dict_el thisEl;
449 for (tagNum = 0; tagNum < _idh.tag_dict_size; tagNum++) {
450 thisEl.Read (dictFile);
451 tagMapDict[thisEl.el].tagNum = tagNum;
452 }
453
454 fclose (dictFile);
455}
456
457static void ReadLevelFile (char *filename) {
458 FILE *f;
459 f = open_file (filename, INVF_LEVEL_SUFFIX, "rb",
460 MAGIC_INVF_LEVELS, MG_ABORT);
461 ivfLevel.Read (f);
462 fclose (f);
463}
464
465void CheckIntOverflow (mg_ullong totalIBits, mg_ullong lastTotalIBits) {
466 if (totalIBits < lastTotalIBits) {
467 fprintf(stderr, "ERROR: The totalIBits counter (%d byte unsigned integer) has overflowed.\n", sizeof (mg_ullong));
468 if (sizeof (mg_ullong) < 8) {
469 fprintf(stderr, " Try compiling with GCC to enable use of 8 bytes for this counter.\n");
470 }
471 fprintf(stderr, " Build aborted.\n");
472 exit(1);
473 }
474}
475
476// assumes the inverted file state file has been opened
477static void InitInvfState (char *filename,
478 invf_dict_header &_idh,
479 InvfStateCache &_invfState,
480 mg_ullong &totalIBits,
481 bool wordLevelIndex) {
482 // read in the dictionary, setting inverted state information
483 FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
484 MAGIC_STEM_BUILD, MG_ABORT);
485
486 // seek to the start of the word dictionary
487 fseek (dictFile, _idh.word_dict_start, SEEK_SET);
488
489 // add the word entries
490 word_dict_el wordEl;
491 wordEl.SetNumLevels (_idh.num_levels);
492 unsigned long dictWordNum, p;
493 mg_ullong lastTotalIBits;
494 unsigned long N = _idh.num_frags;
495 for (dictWordNum=0; dictWordNum<_idh.word_dict_size; dictWordNum++) {
496 // lastTotalIBits is used to detect integer overflow
497 lastTotalIBits = totalIBits;
498
499 // read the next word and associated information
500 wordEl.Read (dictFile, _idh.num_levels);
501
502 // update the state record
503 p = wordEl.frag_occur;
504 InvfStateRec &wisr = _invfState.GetRec (dictWordNum);
505 wisr.start = totalIBits;
506 wisr.here = totalIBits;
507 wisr.B = BIO_Bblock_Init (N, p);
508
509 // add the length of the fragment numbers
510 totalIBits += BIO_Bblock_Bound_b (N, p, wisr.B);
511
512 // if needed, add the length of the fragment frequency information
513 if (!wordLevelIndex)
514 totalIBits += BIO_Gamma_Bound (wordEl.freq, wordEl.frag_occur);
515
516 // align next byte
517#ifdef USE_LONG_LONG
518 totalIBits = (totalIBits + 7ull) & 0xfffffffffffffff8ull;
519#else
520 totalIBits = (totalIBits + 7ul) & 0xfffffff8ul;
521#endif
522
523 CheckIntOverflow (totalIBits, lastTotalIBits);
524 }
525
526
527 // seek to the start of the tag dictionary
528 fseek (dictFile, _idh.tag_dict_start, SEEK_SET);
529
530 // add the tag entries
531 dict_el tagEl;
532 unsigned long dictTagNum;
533 N = _idh.num_frags;
534 for (dictTagNum=0; dictTagNum<_idh.tag_dict_size; dictTagNum++) {
535 // lastTotalIBits is used to detect integer overflow
536 lastTotalIBits = totalIBits;
537
538 // read the next tag and associated information
539 tagEl.Read (dictFile);
540
541 // update the state record
542 p = tagEl.frag_occur*2;
543 InvfStateRec &tisr = _invfState.GetRec (dictTagNum + _idh.word_dict_size);
544 tisr.start = totalIBits;
545 tisr.here = totalIBits;
546 tisr.B = BIO_Bblock_Init (N+p, p);
547
548 // add the length of the fragment numbers (two numbers for each
549 // tag, one for start and one for end)
550 totalIBits += BIO_Bblock_Bound_b (N+p, p, tisr.B);
551
552 // align next byte
553#ifdef USE_LONG_LONG
554 totalIBits = (totalIBits + 7ull) & 0xfffffffffffffff8ull;
555#else
556 totalIBits = (totalIBits + 7ul) & 0xfffffff8ul;
557#endif
558
559 CheckIntOverflow (totalIBits, lastTotalIBits);
560 }
561
562 fclose (dictFile);
563}
564
565/*
566// assumes the chunk tag information has been placed in .first
567static void PrintChunkInfo (unsigned long chunkMem,
568 unsigned long numChunkWords,
569 unsigned long numChunkTags) {
570 static unsigned long chunksRead = 0;
571 chunksRead++;
572 cout << "Chunk Number: " << chunksRead << "\n";
573 cout << "numChunkDocs " << numDocsInChunk << "\n";
574 cout << "numChunkFrags " << numFragsInChunk << "\n";
575 cout << "mem " << chunkMem << "\n";
576 cout << "numWords " << numChunkWords << "\n";
577 cout << "numTags " << numChunkTags << "\n\n";
578
579 TagMapDict::iterator tagMapHere = tagMapDict.begin();
580 TagMapDict::iterator tagMapEnd = tagMapDict.end();
581 while (tagMapHere != tagMapEnd) {
582 unsigned long tagMapNum = (*tagMapHere).second.tagNum;
583 cout << (*tagMapHere).first << " " << tagMapNum << " "
584 << bitPtrs.GetTagBitPtr(tagMapNum).here << "\n";
585 tagMapHere++;
586 }
587}
588*/
589
590void ReadChunk (invf_dict_header &_idh, bool wordLevelIndex) {
591 // reset globals
592 numChunkDocs = 0;
593 chunkStartFragNum = numFrags;
594
595 // read in information about this chunk
596 numDocsInChunk = chunkBuf.gamma_decode (NULL) - 1;
597 if (numDocsInChunk == 0)
598 FatalError (1, "The number of docs in the current chunk is 0");
599
600 numFragsInChunk = chunkBuf.gamma_decode (NULL) - 1;
601 unsigned long chunkMem = chunkBuf.gamma_decode (NULL) - 1;
602
603 if (chunkMem > ivfMemBufSize)
604 FatalError (1, "Chunk memory size is greater than maximum");
605
606 unsigned long numChunkWords = chunkBuf.gamma_decode (NULL) - 1;
607 unsigned long numChunkTags = chunkBuf.gamma_decode (NULL) - 1;
608
609
610 // reset stuff
611 ClearCharBuf (ivfMemBuf, ivfMemBufSize);
612 bitPtrs.ResetPtrs();
613
614 // read in the entries in occurrence order storing the
615 // "chunkWordCount" in "start" and the "chunkFragCount"
616 // in "here"
617 unsigned long numOccur;
618 unsigned long wordNum;
619 for (numOccur=0; numOccur<numChunkWords; numOccur++) {
620 wordNum = occurConvert.TranslateWord (numOccur);
621 BitPtr &wordPtr = bitPtrs.GetWordBitPtr (wordNum);
622 wordPtr.start = chunkBuf.gamma_decode (NULL) - 1;
623 if (wordPtr.start >= 2)
624 wordPtr.here = chunkBuf.gamma_decode (NULL);
625 else wordPtr.here = wordPtr.start;
626 }
627 unsigned long tagNum;
628 for (numOccur=0; numOccur<numChunkTags; numOccur++) {
629 tagNum = occurConvert.TranslateTag (numOccur);
630 BitPtr &tagPtr = bitPtrs.GetTagBitPtr (tagNum);
631 // only chunkFragCount is encoded for tags
632 tagPtr.start = chunkBuf.gamma_decode (NULL) - 1;
633 tagPtr.here = tagPtr.start;
634 }
635
636 /* PrintChunkInfo (chunkMem, numChunkWords, numChunkTags);*/
637
638 // create the bit ptrs in dictionary order
639 unsigned long totalIBits = 0; // only dealing with memory
640 unsigned long chunkWordCount, chunkFragCount;
641 for (wordNum=0; wordNum<_idh.word_dict_size; wordNum++) {
642 BitPtr &wordPtr = bitPtrs.GetWordBitPtr (wordNum);
643 chunkWordCount = wordPtr.start;
644 chunkFragCount = wordPtr.here;
645 wordPtr.start = totalIBits;
646 wordPtr.here = totalIBits;
647 wordPtr.lastFragNum = chunkStartFragNum;
648 wordPtr.lgB = 0;
649 if (chunkWordCount > 0) {
650 wordPtr.lgB = floorlog_2 (BIO_Bblock_Init_W (numFragsInChunk,
651 chunkFragCount));
652 totalIBits += BIO_Bblock_Bound (numFragsInChunk, chunkFragCount);
653 // use unary encoding for memory buffer encoding of fragment freq
654 if (!wordLevelIndex) {
655 totalIBits += chunkWordCount;
656 }
657 }
658 }
659 for (tagNum=0; tagNum<_idh.tag_dict_size; tagNum++) {
660 BitPtr &tagPtr = bitPtrs.GetTagBitPtr (tagNum);
661 chunkFragCount = tagPtr.here;
662 tagPtr.start = totalIBits;
663 tagPtr.here = totalIBits;
664 tagPtr.lastFragNum = chunkStartFragNum;
665 tagPtr.lgB = 0;
666 if (chunkFragCount > 0) {
667 unsigned long pTag = chunkFragCount*2;
668 tagPtr.lgB = floorlog_2 (BIO_Bblock_Init_W (numFragsInChunk+pTag,
669 pTag));
670 unsigned long bLen = BIO_Bblock_Bound (numFragsInChunk+pTag,
671 pTag);
672// cout << tagNum + _idh.word_dict_size << " ";
673// cout << "numFrags: " << numFragsInChunk
674// << " chunkFragCount: " << chunkFragCount
675// << " B: " << 1 << tagPtr.lgB
676// << " blen: " << blen << "\n";
677 totalIBits += bLen;
678 }
679 }
680 bitPtrs.GetEndStart() = totalIBits;
681 bitPtrs.GetEndHere() = totalIBits;
682
683 if ((totalIBits + 7ul) >> 3ul > chunkMem) {
684 cerr << "totalIBits: " << totalIBits << "\n";
685 cerr << "bytes: " << ((totalIBits + 7ul) >> 3ul) << "\n";
686 cerr << "chunkMem: " << chunkMem << "\n";
687 FatalError (1, "Pointers exceed buffer size");
688 }
689}
690
691
692
693
694int init_ivf_2 (const TagInfo &/*tagInfo*/, char *filename) {
695 // read in compressed dictionary header
696 FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
697 MAGIC_STEM_BUILD, MG_ABORT);
698 idh.Read (dictFile);
699 fclose (dictFile);
700
701 // set the size of the bit ptrs
702 bitPtrs.SetSize (idh.word_dict_size, idh.tag_dict_size);
703
704 // open the chunk file and read in the maximum memory needed
705 // for the inverted memory buffer
706 chunkFile = open_file (filename, INVF_CHUNK_SUFFIX, "rb",
707 MAGIC_CHUNK, MG_ABORT);
708 ReadUL (chunkFile, ivfMemBufSize);
709 chunkBuf.attachFile (chunkFile);
710
711 // allocate memory for the inverted buffer
712 ivfMemBuf = new char [ivfMemBufSize];
713 ClearCharBuf (ivfMemBuf, ivfMemBufSize);
714
715 // read in the word dictionary
716 ReadWordDict (filename);
717
718 // read in the tag dictionary
719 ReadTagDict (filename, idh);
720
721 // read in the level information
722 ReadLevelFile (filename);
723 bool wordLevelIndex = ivfLevel.indexLevel.empty();
724
725 // set up the translation table file
726 occurConvert.Open (filename, idh.word_dict_size, idh.tag_dict_size);
727
728 // reset some globals
729 numDocs = 0;
730 numChunkDocs = 0;
731 numDocsInChunk = 0;
732 numFrags = 0;
733 numFragsInChunk = 0;
734 chunkStartFragNum = 0;
735
736 strcpy (collectFilename, filename);
737
738
739 // create the inverted file
740 mg_ullong totalIBits = 0;
741 FILE *invfFile = create_file (filename, INVF_SUFFIX, "wb",
742 MAGIC_INVF, MG_ABORT);
743 totalIBits += sizeof (unsigned long) * 8; // magic number
744 totalIBits += 8 * 200; // 200 byte gap -- why??????
745 fclose (invfFile);
746
747 // init the inverted file state cache
748 invfState.Open (filename);
749 InitInvfState (filename, idh, invfState, totalIBits, wordLevelIndex);
750
751 return COMPALLOK;
752}
753
754static void CloseTextTag (IP2TagInfo &tInfo, const UCArray &/*tagName*/) {
755 if (!tInfo.inTag) return;
756
757 // add this tag to the inverted list
758 BitPtr &tagBitPtr = bitPtrs.GetTagBitPtr (tInfo.tagNum);
759 unsigned long endFrag = numFrags;
760 int b = 1 << tagBitPtr.lgB;
761
762 /*
763 cout << (tInfo.tagNum+idh.word_dict_size) << " \"<" << tagName << ">\" "
764 << tInfo.startFrag << " " << endFrag << "\n";
765 */
766
767 mems_bitio_buffer buffer ((u_char *) ivfMemBuf, tagBitPtr.here);
768 buffer.bblock_encode (tInfo.startFrag - tagBitPtr.lastFragNum + 1,
769 b, NULL);
770 buffer.bblock_encode (endFrag - tInfo.startFrag + 1, b, NULL);
771 tagBitPtr.lastFragNum = endFrag;
772 tagBitPtr.here = buffer.position();
773 buffer.encodeDone();
774
775 // check for buffer overrun
776 bitPtrs.CheckTagBufOverrun (tInfo.tagNum);
777
778 // reset information about this tag
779 tInfo.inTag = false;
780 tInfo.startFrag = 0;
781}
782
783static void ProcessOpenTag (const TextEl &el, bool &inFrag) {
784 // close tag if open
785 IP2TagInfo &tInfo = tagMapDict[el.tagName];
786 if (tInfo.inTag) CloseTextTag (tInfo, el.tagName);
787
788 // open this tag
789 tInfo.inTag = true;
790 tInfo.startFrag = numFrags;
791
792 // check for start of next fragment
793 bool wordLevelIndex = ivfLevel.indexLevel.empty();
794 if (!wordLevelIndex && el.tagName == ivfLevel.indexLevel) {
795 numFrags++;
796 inFrag = true;
797 }
798}
799
800static void ProcessCloseTag (const TextEl &el, bool &inFrag) {
801 // check for end of fragment
802 bool wordLevelIndex = ivfLevel.indexLevel.empty();
803 if (!wordLevelIndex && el.tagName == ivfLevel.indexLevel) {
804 inFrag = false;
805 }
806
807 IP2TagInfo &tInfo = tagMapDict[el.tagName];
808 CloseTextTag (tInfo, el.tagName);
809}
810
811static void ProcessText (const TextEl &el, bool &inFrag) {
812 // make sure this text is to be indexed
813 bool wordLevelIndex = ivfLevel.indexLevel.empty();
814 if (!wordLevelIndex && !inFrag) return;
815
816 const unsigned char *textHere = el.text.begin();
817 const unsigned char *textEnd = el.text.end() - 1;
818 unsigned char mgWord[MAXSTEMLEN + 1];
819
820 if (!inaword (textHere, textEnd))
821 ParseNonindexWord (textHere, textEnd);
822
823
824 // Alternately parse off words and non-words from the input
825
826 while (textHere <= textEnd) {
827 textHere = ParseIndexMGWord (textHere, textEnd, mgWord);
828 textHere = ParseNonindexWord (textHere, textEnd);
829
830 if (mgWord[0] > 0) {
831 if (wordLevelIndex) numFrags++;
832
833 unsigned long wordNum = perf_hash (wordHashDict, mgWord);
834
835 /*
836 cout << wordNum << " \"";
837 cout.write (mgWord+1, *mgWord);
838 cout << "\" " << numFrags << "\n";
839 */
840
841 // add this word to the inverted list
842 BitPtr &wordBitPtr = bitPtrs.GetWordBitPtr (wordNum);
843 unsigned long fragNum = numFrags;
844 int b = 1 << wordBitPtr.lgB;
845
846 mems_bitio_buffer buffer ((u_char *) ivfMemBuf, wordBitPtr.here);
847
848 // note: this assumes that fragments don't carry over between
849 // chunks (which they don't because all tags are closed at the
850 // end of each document and chunks are based on document
851 // boundaries), i.e. the first fragment number must be greater
852 // than the starting fragment number of the chunk.
853 if (fragNum > wordBitPtr.lastFragNum) {
854 buffer.bblock_encode ((fragNum - wordBitPtr.lastFragNum - 1) + 1,
855 b, NULL);
856 if (!wordLevelIndex) buffer.encodeBit (1); // freq = 1
857
858 } else if (!wordLevelIndex) {
859 // add one to the frequency count for this word
860 buffer.seek (buffer.position()-1);
861 buffer.encodeBit (0); // unary encoding -- last = 1
862 buffer.encodeBit (1);
863 }
864
865 wordBitPtr.lastFragNum = fragNum;
866 wordBitPtr.here = buffer.position();
867 buffer.encodeDone();
868
869 // check for buffer overrun
870 bitPtrs.CheckWordBufOverrun (wordNum);
871 }
872 }
873}
874
875// combine the in memory inverted buffer with the disk
876// based inverted file
877static void DiskMerge (char *filename) {
878 bool wordLevelIndex = ivfLevel.indexLevel.empty();
879
880 // make sure we have something to process
881 if (numChunkDocs <= 0) return;
882
883 // open the inverted file
884 FILE *invfFile = open_file (filename, INVF_SUFFIX, "rb+",
885 MAGIC_INVF, MG_ABORT);
886 random_bitio_buffer invfOutBuf (invfFile);
887
888 // set up to decode the entries in memory
889 mems_bitio_buffer memInBuf ((u_char *) ivfMemBuf, 0);
890
891 // write out the word information
892 unsigned long wordNum;
893 int b;
894 unsigned long currFragNum;
895 unsigned long delta;
896 unsigned long currFreq;
897 for (wordNum=0; wordNum<idh.word_dict_size; wordNum++) {
898 // go to the end of the last inverted file entry
899 InvfStateRec &wordDiskState = invfState.GetRec (wordNum);
900 invfOutBuf.SEEK_X (wordDiskState.here);
901
902 // go to the start of the inverted chunk info in memory
903 BitPtr &wordBitPtr = bitPtrs.GetWordBitPtr (wordNum);
904 memInBuf.seek(wordBitPtr.start);
905
906 // decode each entry and re-write to disk
907 currFragNum = chunkStartFragNum;
908 while (memInBuf.position() < wordBitPtr.here) {
909 // decode word entry
910 b = 1 << wordBitPtr.lgB;
911 delta = memInBuf.bblock_decode (b, NULL);
912 currFragNum += delta;
913 if (!wordLevelIndex) currFreq = memInBuf.unary_decode (NULL);
914 else currFreq = 1;
915
916 // recode on disk
917 invfOutBuf.bblock_encode (currFragNum-wordDiskState.lastFragNum,
918 wordDiskState.B, NULL);
919 if (!wordLevelIndex) invfOutBuf.gamma_encode (currFreq, NULL);
920 wordDiskState.lastFragNum = currFragNum;
921 }
922
923 wordDiskState.here = invfOutBuf.TELL_X();
924 }
925
926 // write out the tag information
927 unsigned long tagNum;
928 unsigned long currTagStart;
929 unsigned long currTagEnd;
930 for (tagNum=0; tagNum<idh.tag_dict_size; tagNum++) {
931 // go to the end of the last inverted file entry
932 InvfStateRec &tagDiskState = invfState.GetRec (tagNum+idh.word_dict_size);
933 invfOutBuf.SEEK_X (tagDiskState.here);
934
935 // go to the start of the inverted chunk info in memory
936 BitPtr &tagBitPtr = bitPtrs.GetTagBitPtr (tagNum);
937 memInBuf.seek(tagBitPtr.start);
938
939 // decode each entry and re-write to disk
940 currTagEnd = chunkStartFragNum;
941 while (memInBuf.position() < tagBitPtr.here) {
942 // decode tag entry
943 b = 1 << tagBitPtr.lgB;
944 delta = memInBuf.bblock_decode (b, NULL) - 1;
945 currTagStart = currTagEnd + delta;
946 delta = memInBuf.bblock_decode (b, NULL) - 1;
947 currTagEnd = currTagStart + delta;
948
949 // recode on disk
950 invfOutBuf.bblock_encode (currTagStart-tagDiskState.lastFragNum+1,
951 tagDiskState.B, NULL);
952 invfOutBuf.bblock_encode (currTagEnd-currTagStart+1,
953 tagDiskState.B, NULL);
954
955 tagDiskState.lastFragNum = currTagEnd;
956 }
957
958 tagDiskState.here = invfOutBuf.TELL_X();
959 }
960
961 memInBuf.done();
962
963 invfOutBuf.encodeDone();
964 fclose (invfFile);
965}
966
967
968int process_ivf_2 (const TagInfo &/*tagInfo*/, const TextElArray &doc) {
969 bool wordLevelIndex = ivfLevel.indexLevel.empty();
970 bool inFrag = false;
971 if (wordLevelIndex) inFrag = true; // unconditional
972
973 // get next chunk information if need to. the chunk information
974 // is needed before the first document is processed
975 if (numChunkDocs >= numDocsInChunk) ReadChunk (idh, wordLevelIndex);
976
977 // process each text element
978 TextElArray::const_iterator here = doc.begin();
979 TextElArray::const_iterator end = doc.end();
980 while (here != end) {
981 // process this element
982 if ((*here).elType == OpenTagE) ProcessOpenTag (*here, inFrag);
983 else if ((*here).elType == CloseTagE) ProcessCloseTag (*here, inFrag);
984 else ProcessText (*here, inFrag);
985
986 here++;
987 }
988
989 // close off any unclosed tags
990 TagMapDict::iterator tdHere = tagMapDict.begin();
991 TagMapDict::iterator tdEnd = tagMapDict.end();
992 while (tdHere != tdEnd) {
993 CloseTextTag ((*tdHere).second, (*tdHere).first);
994 tdHere++;
995 }
996
997 // we've processed one more document
998 numDocs++;
999 numChunkDocs++;
1000
1001 // merge the memory based inverted file with the one on
1002 // disk if this is the end of this chunk
1003 if (numChunkDocs >= numDocsInChunk) DiskMerge (collectFilename);
1004
1005 return COMPALLOK;
1006}
1007
1008
1009static void CondenseInvfFile (char *filename, unsigned long &bytesOutput) {
1010 FILE *inInvfFile = open_file (filename, INVF_SUFFIX, "rb",
1011 MAGIC_INVF, MG_ABORT);
1012 FILE *outInvfFile = open_file (filename, INVF_SUFFIX, "rb+",
1013 MAGIC_INVF, MG_ABORT);
1014
1015 // skip the magic number
1016 fseek (outInvfFile, sizeof (unsigned long), SEEK_SET);
1017
1018 // write the inverted file header -- use defaults for most things
1019 invf_file_header ifh;
1020 ifh.no_of_words = idh.word_dict_size;
1021 ifh.no_of_tags = idh.tag_dict_size;
1022 ifh.word_level_index = (ivfLevel.indexLevel.empty()) ? 1 : 0;
1023 ifh.Write (outInvfFile);
1024
1025 bytesOutput = ftell (outInvfFile);
1026
1027 // process each meaningful byte in the file
1028 unsigned long numEntries = ifh.no_of_words + ifh.no_of_tags;
1029 unsigned long entryNum;
1030 mg_ullong lastStart = 0;
1031 for (entryNum = 0; entryNum < numEntries; entryNum++) {
1032 InvfStateRec &stateRec = invfState.GetRec (entryNum);
1033
1034 // overrun check
1035 if (stateRec.start < lastStart)
1036 FatalError (1, "Inverted file Buffer overrun");
1037 lastStart = stateRec.start;
1038
1039 unsigned long oldEntryStart = stateRec.start >> 3;
1040 unsigned long oldEntryStartOver = stateRec.start & 7; // should be 0
1041 unsigned long oldEntryEnd = (stateRec.here + 7) >> 3; // byte after end
1042 unsigned long oldEntryEndOver = stateRec.here & 7;
1043
1044 fseek (inInvfFile, oldEntryStart, SEEK_SET);
1045
1046 stateRec.here -= stateRec.start;
1047 stateRec.start = bytesOutput * 8 + oldEntryStartOver;
1048 stateRec.here += stateRec.start;
1049 while (oldEntryStart < oldEntryEnd) {
1050 unsigned char c = getc (inInvfFile);
1051 if (oldEntryStart == oldEntryEnd - 1) {
1052 u_char ands[8] =
1053 {0xff, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe};
1054 c &= ands[oldEntryEndOver];
1055 }
1056 putc (c, outInvfFile);
1057 bytesOutput++;
1058 oldEntryStart++;
1059 }
1060 }
1061
1062 fclose (inInvfFile);
1063
1064#ifdef __WIN32__
1065 if (!(_chsize (_fileno (outInvfFile), bytesOutput)))
1066 Message ("Could not truncate invf.");
1067#else
1068 ftruncate (fileno (outInvfFile), bytesOutput);
1069#endif
1070
1071 fclose (outInvfFile);
1072}
1073
1074static void OutputInvfIdx (char *filename, unsigned long invfNumBytes) {
1075 FILE *invfIdxFile = create_file (filename, INVF_IDX_SUFFIX, "wb",
1076 MAGIC_INVI, MG_ABORT);
1077
1078 // process each meaningful byte in the file
1079 unsigned long numEntries = idh.word_dict_size + idh.tag_dict_size;
1080 unsigned long entryNum;
1081 for (entryNum = 0; entryNum < numEntries; entryNum++) {
1082 InvfStateRec &stateRec = invfState.GetRec (entryNum);
1083
1084 // assumes that inverted entries start at beginning of each byte
1085 if (!WriteUL (invfIdxFile, (stateRec.start >> 3))) break;
1086 }
1087
1088 WriteUL (invfIdxFile, invfNumBytes);
1089
1090 fclose (invfIdxFile);
1091}
1092
1093
1094int done_ivf_2 (const TagInfo &/*tagInfo*/, char *filename) {
1095 // close most open files
1096 if (chunkFile != NULL) {
1097 chunkBuf.done();
1098 fclose (chunkFile);
1099 chunkFile = NULL;
1100 }
1101 occurConvert.Close();
1102
1103 // free allocated memory
1104 bitPtrs.Clear();
1105 if (ivfMemBuf != NULL) { delete [] ivfMemBuf; ivfMemBuf = NULL; }
1106 free_perf_hash (wordHashDict);
1107 wordHashDict = NULL;
1108 tagMapDict.erase (tagMapDict.begin(), tagMapDict.end());
1109
1110 // condense the inverted file and truncate it
1111 // this function also writes out the inverted header
1112 unsigned long invfNumBytes = 0;
1113 CondenseInvfFile (filename, invfNumBytes);
1114
1115 OutputInvfIdx (filename, invfNumBytes);
1116
1117 // close the rest of the open files
1118 invfState.Close ();
1119
1120 return COMPALLOK;
1121}
Note: See TracBrowser for help on using the repository browser.