source: trunk/gsdl/src/mgpp/text/ivf.pass2.cpp@ 2448

Last change on this file since 2448 was 2448, checked in by kjm18, 23 years ago

fixed unsigned long overflow error in Condense Invf function
bytesOutput cast to mg_ullong.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 31.6 KB
Line 
1/**************************************************************************
2 *
3 * ivf.pass2.cpp -- Memory efficient pass 2 inversion
4 * Copyright (C) 1999 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: ivf.pass2.cpp 2448 2001-05-18 03:20:10Z kjm18 $
21 *
22 **************************************************************************/
23
24/*
25 $Log$
26 Revision 1.3 2001/05/18 03:20:10 kjm18
27 fixed unsigned long overflow error in Condense Invf function
28 bytesOutput cast to mg_ullong.
29
30 Revision 1.2 2001/05/17 04:52:38 jrm21
31 added some #defines for proper portability.
32 (eg ftruncate needs _XOPEN_SOURCE_EXTENDED)
33
34 Revision 1.1 2000/01/14 02:26:07 sjboddie
35 Rodgers new C++ mg
36
37 */
38#define _XOPEN_SOURCE 1
39#define _XOPEN_SOURCE_EXTENDED 1
40#include <stdio.h>
41#include <unistd.h>
42
43#include "non_ansi.h"
44
45#include "sysfuncs.h"
46
47#include "mg_files.h"
48#include "invf.h"
49#include "mg.h"
50#include "build.h"
51#include "locallib.h"
52#include "UCArray.h"
53#include "bitio_m_random.h"
54#include "bitio_m_stdio.h"
55#include "bitio_m_mems.h"
56#include "bitio_gen.h"
57#include <stdio.h>
58#include "words.h"
59#include "messages.h"
60#include "netorder.h"
61#include "FIvfLevelInfo.h"
62#include "perf_hash.h"
63#include "string.h"
64
65#include "longlong.h"
66
67#ifdef __WIN32_
68#include <io.h>
69#endif
70
71#if defined(GSDL_USE_OBJECTSPACE)
72# include <ospace\std\map>
73#elif defined(GSDL_USE_STL_H)
74# include <map.h>
75#else
76# include <map>
77#endif
78
79
80#ifdef USE_LONG_LONG
81#define SEEK_X seek_LL
82#define TELL_X tell_LL
83#else
84#define SEEK_X seek
85#define TELL_X tell
86#endif
87
88#ifndef RND_BUF_SIZE
89#define RND_BUF_SIZE 8*1024
90#endif
91
92
93static unsigned long numDocs = 0;
94static unsigned long numChunkDocs = 0;
95static unsigned long numDocsInChunk = 0;
96
97static unsigned long numFrags = 0;
98static unsigned long numFragsInChunk = 0;
99static unsigned long chunkStartFragNum = 0;
100
101
102
103
104struct BitPtr {
105 unsigned long start;
106 unsigned long here;
107 unsigned long lastFragNum;
108 unsigned long lgB;
109
110 void Clear () { start = here = lastFragNum = lgB = 0; }
111 BitPtr () { Clear(); }
112};
113
114class WordBitPtrs {
115protected:
116 unsigned long numWords;
117 unsigned long numTags;
118 unsigned long size;
119 BitPtr *wordBitPtrs;
120
121 void CheckBufOverrun (unsigned long num) {
122 if (wordBitPtrs[num].here > wordBitPtrs[num+1].start) {
123 cerr << "numDocs: " << numDocs << "\n";
124 cerr << "numChunkDocs: " << numChunkDocs << "\n";
125 cerr << "numDocsInChunk: " << numDocsInChunk << "\n";
126 cerr << "numFrags: " << numFrags << "\n";
127 cerr << "numFragsInChunk: " << numFragsInChunk << "\n";
128 cerr << "chunkStartFragNum: " << chunkStartFragNum << "\n";
129 cerr << "num: " << num << "\n";
130 cerr << "[num].start: " << wordBitPtrs[num].start << "\n";
131 cerr << "[num].here: " << wordBitPtrs[num].here << "\n";
132 cerr << "[num+1].start: " << wordBitPtrs[num+1].start << "\n";
133 FatalError (1, "Bit buffer overrun");
134 }
135 }
136
137public:
138 void Clear ();
139 WordBitPtrs () { wordBitPtrs = NULL; Clear(); }
140 ~WordBitPtrs ();
141 void SetSize (unsigned long _numWords,
142 unsigned long _numTags);
143
144 void ResetPtrs () {
145 if (wordBitPtrs == NULL) return;
146 unsigned long i;
147 for (i=0; i<size; i++) wordBitPtrs[i].Clear();
148 }
149
150 BitPtr &GetWordBitPtr (unsigned long wordNum)
151 { return wordBitPtrs[wordNum]; }
152 unsigned long &GetWordStart (unsigned long wordNum)
153 { return wordBitPtrs[wordNum].start; }
154 unsigned long &GetWordHere (unsigned long wordNum)
155 { return wordBitPtrs[wordNum].here; }
156 void CheckWordBufOverrun (unsigned long wordNum)
157 { CheckBufOverrun (wordNum); }
158
159 BitPtr &GetTagBitPtr (unsigned long tagNum)
160 { return wordBitPtrs[tagNum + numWords]; }
161 unsigned long &GetTagStart (unsigned long tagNum)
162 { return wordBitPtrs[tagNum + numWords].start; }
163 unsigned long &GetTagHere (unsigned long tagNum)
164 { return wordBitPtrs[tagNum + numWords].here; }
165 void CheckTagBufOverrun (unsigned long tagNum)
166 { CheckBufOverrun (tagNum + numWords); }
167
168 BitPtr &GetEndBitPtr ()
169 { return wordBitPtrs[size-1]; }
170 unsigned long &GetEndStart ()
171 { return wordBitPtrs[size-1].start; }
172 unsigned long &GetEndHere ()
173 { return wordBitPtrs[size-1].here; }
174};
175
176
177struct IP2TagInfo {
178 bool inTag;
179 unsigned long startFrag;
180 unsigned long tagNum;
181
182 IP2TagInfo () {
183 inTag = false;
184 startFrag = 0;
185 tagNum = 0;
186 }
187};
188
189// maps tags to tag information
190typedef map<UCArray, IP2TagInfo, DictLTUCArray> TagMapDict;
191
192
193// class to handle the translation of occurrence order
194// to dictionary order for words and tags
195class OccurToDictConverter {
196protected:
197 unsigned long pos;
198 unsigned long val;
199 FILE *transFile;
200 random_bitio_buffer rbs;
201
202 unsigned long wordDictSize;
203 unsigned long tagDictSize;
204
205 void SeekStart ();
206 unsigned long TranslateNum (unsigned long num);
207
208public:
209 OccurToDictConverter ();
210 ~OccurToDictConverter ();
211
212 void Open (char *filename, unsigned long _wordDictSize,
213 unsigned long _tagDictSize);
214
215 // Close frees all allocated memory
216 void Close ();
217
218 unsigned long TranslateWord (unsigned long occurNum)
219 { return TranslateNum (occurNum); }
220 unsigned long TranslateTag (unsigned long occurNum)
221 { return TranslateNum (occurNum+wordDictSize); }
222};
223
224
225struct InvfStateRec {
226 mg_ullong start;
227 mg_ullong here;
228 unsigned long lastFragNum;
229 unsigned long B;
230
231 void Clear () {
232 start = here = 0;
233 lastFragNum = B = 0;
234 }
235 InvfStateRec () { Clear (); }
236};
237
238
239#define ISR_SIZE 1024
240
241class InvfStateCache {
242protected:
243 InvfStateRec recCache [ISR_SIZE];
244 unsigned long startNum;
245
246 FILE *stateFile;
247
248 void ClearCache () {
249 unsigned int i = 0;
250 for (i=0; i<ISR_SIZE; i++) recCache[i].Clear();
251 }
252
253public:
254 InvfStateCache ();
255 ~InvfStateCache ();
256
257 void Open (char *filename);
258 void Close ();
259
260 // previous references to state records may be
261 // invalidated calling GetRec
262 InvfStateRec &GetRec (unsigned long num);
263};
264
265
266static invf_dict_header idh;
267static WordBitPtrs bitPtrs;
268
269static FILE *chunkFile = NULL;
270static stdio_bitio_buffer chunkBuf;
271
272static unsigned long ivfMemBufSize = 0;
273static char *ivfMemBuf = NULL;
274
275// word and tag dictionaries. a map is used for the tag dictionary
276// as it should never be very big (and the perfect hash function
277// sometimes has trouble with small values).
278static perf_hash_data *wordHashDict = NULL;
279static TagMapDict tagMapDict;
280
281// information about all the different levels
282static FIvfLevel ivfLevel;
283
284static OccurToDictConverter occurConvert;
285
286// information about the state of the inverted file
287static InvfStateCache invfState;
288
289static char collectFilename[512];
290
291
292void WordBitPtrs::Clear () {
293 numWords = 0;
294 numTags = 0;
295 size=0;
296 if (wordBitPtrs != NULL) delete [] wordBitPtrs;
297 wordBitPtrs = NULL;
298}
299
300WordBitPtrs::~WordBitPtrs () {
301 if (wordBitPtrs != NULL) delete [] wordBitPtrs;
302}
303
304void WordBitPtrs::SetSize (unsigned long _numWords,
305 unsigned long _numTags){
306 Clear();
307 numWords = _numWords;
308 numTags = _numTags;
309 size = numWords + numTags + 1;
310 wordBitPtrs = new BitPtr [size];
311}
312
313
314void OccurToDictConverter::SeekStart () {
315 if (transFile == NULL) return;
316 rbs.SEEK_X (sizeof (unsigned long) * 8);
317 pos = 0;
318}
319
320unsigned long OccurToDictConverter::TranslateNum (unsigned long num) {
321 if (num < pos) SeekStart ();
322 while (pos <= num) {
323 if (pos < wordDictSize)
324 val = rbs.binary_decode (wordDictSize + 1, NULL) - 1;
325 else
326 val = rbs.binary_decode (tagDictSize + 1, NULL) - 1;
327 pos++;
328 }
329 return val;
330}
331
332OccurToDictConverter::OccurToDictConverter () {
333 pos = 0;
334 val = 0;
335 transFile = NULL;
336 wordDictSize = 0;
337 tagDictSize = 0;
338}
339
340OccurToDictConverter::~OccurToDictConverter () {
341 if (transFile != NULL) Close ();
342}
343
344void OccurToDictConverter::Open (char *filename, unsigned long _wordDictSize,
345 unsigned long _tagDictSize) {
346 if (transFile != NULL) Close ();
347
348 wordDictSize = _wordDictSize;
349 tagDictSize = _tagDictSize;
350
351 transFile = open_file (filename, INVF_CHUNK_TRANS_SUFFIX, "rb",
352 MAGIC_CHUNK_TRANS, MG_ABORT);
353 rbs.attachFile (transFile, RND_BUF_SIZE);
354 SeekStart ();
355 val = 0;
356}
357
358void OccurToDictConverter::Close () {
359 if (transFile == NULL) return;
360
361 rbs.done ();
362 fclose (transFile);
363 transFile = NULL;
364 pos = 0;
365 val = 0;
366
367 wordDictSize = 0;
368 tagDictSize = 0;
369}
370
371
372
373
374InvfStateCache::InvfStateCache () {
375 startNum = 0;
376 stateFile = NULL;
377}
378
379InvfStateCache::~InvfStateCache () {
380 if (stateFile != NULL) Close ();
381}
382
383void InvfStateCache::Open (char *filename) {
384 if (stateFile != NULL) Close();
385
386 // open the state file
387 char path[512];
388 sprintf (path, FILE_NAME_FORMAT ".%ld", get_basepath (), filename,
389 ".invf.state", (long) getpid ());
390 if (!(stateFile = fopen (path, "wb+"))) {
391 Message ("Unable to create \"%s\"", path);
392 exit (1);
393 }
394 unlink (path); // file will be deleted after it is closed
395
396 // reset the buffer
397 startNum = 0;
398 ClearCache();
399}
400
401void InvfStateCache::Close () {
402 if (stateFile == NULL) return;
403 fclose (stateFile);
404 stateFile = NULL;
405 startNum = 0;
406}
407
408InvfStateRec &InvfStateCache::GetRec (unsigned long num) {
409 // see if cached
410 if ((num >= startNum) && (num < startNum + ISR_SIZE))
411 return recCache[num-startNum];
412
413 // not cached, write out this lot of records and read in
414 fseek (stateFile, startNum*sizeof (InvfStateRec), SEEK_SET);
415 fwrite ((char *) recCache, sizeof (InvfStateRec), ISR_SIZE, stateFile);
416
417 // read in the new set of records
418 ClearCache ();
419 startNum = num - (num % ISR_SIZE);
420 fseek (stateFile, startNum*sizeof (InvfStateRec), SEEK_SET);
421 fread ((char *) recCache, sizeof (InvfStateRec), ISR_SIZE, stateFile);
422
423 return recCache[num-startNum];
424}
425
426
427
428static void ClearCharBuf (char *buf, unsigned long size) {
429 char *end = buf + size;
430 while (buf != end) *buf++ = 0;
431}
432
433static void ReadWordDict (char *filename) {
434 // read in the perfect hash function for the word dictionary
435 FILE *wordHashFile = open_file (filename, INVF_DICT_HASH_SUFFIX, "rb",
436 MAGIC_HASH, MG_ABORT);
437 if (!(wordHashDict = read_perf_hash_data (wordHashFile))) {
438 FatalError (1, "Unable to read in hash data for word dictionary");
439 }
440 fclose (wordHashFile);
441}
442
443static void ReadTagDict (char *filename, invf_dict_header &_idh) {
444 // open the file
445 FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
446 MAGIC_STEM_BUILD, MG_ABORT);
447
448 // seek to the start of the tag dictionary
449 fseek (dictFile, _idh.tag_dict_start, SEEK_SET);
450
451 unsigned long tagNum;
452 dict_el thisEl;
453 for (tagNum = 0; tagNum < _idh.tag_dict_size; tagNum++) {
454 thisEl.Read (dictFile);
455 tagMapDict[thisEl.el].tagNum = tagNum;
456 }
457
458 fclose (dictFile);
459}
460
461static void ReadLevelFile (char *filename) {
462 FILE *f;
463 f = open_file (filename, INVF_LEVEL_SUFFIX, "rb",
464 MAGIC_INVF_LEVELS, MG_ABORT);
465 ivfLevel.Read (f);
466 fclose (f);
467}
468
469void CheckIntOverflow (mg_ullong totalIBits, mg_ullong lastTotalIBits) {
470 if (totalIBits < lastTotalIBits) {
471 fprintf(stderr, "ERROR: The totalIBits counter (%d byte unsigned integer) has overflowed.\n", sizeof (mg_ullong));
472 if (sizeof (mg_ullong) < 8) {
473 fprintf(stderr, " Try compiling with GCC to enable use of 8 bytes for this counter.\n");
474 }
475 fprintf(stderr, " Build aborted.\n");
476 exit(1);
477 }
478}
479
480// assumes the inverted file state file has been opened
481static void InitInvfState (char *filename,
482 invf_dict_header &_idh,
483 InvfStateCache &_invfState,
484 mg_ullong &totalIBits,
485 bool wordLevelIndex) {
486 // read in the dictionary, setting inverted state information
487 FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
488 MAGIC_STEM_BUILD, MG_ABORT);
489
490 // seek to the start of the word dictionary
491 fseek (dictFile, _idh.word_dict_start, SEEK_SET);
492
493 // add the word entries
494 word_dict_el wordEl;
495 wordEl.SetNumLevels (_idh.num_levels);
496 unsigned long dictWordNum, p;
497 mg_ullong lastTotalIBits;
498 unsigned long N = _idh.num_frags;
499 for (dictWordNum=0; dictWordNum<_idh.word_dict_size; dictWordNum++) {
500 // lastTotalIBits is used to detect integer overflow
501 lastTotalIBits = totalIBits;
502
503 // read the next word and associated information
504 wordEl.Read (dictFile, _idh.num_levels);
505
506 // update the state record
507 p = wordEl.frag_occur;
508 InvfStateRec &wisr = _invfState.GetRec (dictWordNum);
509 wisr.start = totalIBits;
510 wisr.here = totalIBits;
511 wisr.B = BIO_Bblock_Init (N, p);
512
513 // add the length of the fragment numbers
514 totalIBits += BIO_Bblock_Bound_b (N, p, wisr.B);
515
516 // if needed, add the length of the fragment frequency information
517 if (!wordLevelIndex)
518 totalIBits += BIO_Gamma_Bound (wordEl.freq, wordEl.frag_occur);
519
520 // align next byte
521#ifdef USE_LONG_LONG
522 totalIBits = (totalIBits + 7ull) & 0xfffffffffffffff8ull;
523#else
524 totalIBits = (totalIBits + 7ul) & 0xfffffff8ul;
525#endif
526
527 CheckIntOverflow (totalIBits, lastTotalIBits);
528 }
529
530
531 // seek to the start of the tag dictionary
532 fseek (dictFile, _idh.tag_dict_start, SEEK_SET);
533
534 // add the tag entries
535 dict_el tagEl;
536 unsigned long dictTagNum;
537 N = _idh.num_frags;
538 for (dictTagNum=0; dictTagNum<_idh.tag_dict_size; dictTagNum++) {
539 // lastTotalIBits is used to detect integer overflow
540 lastTotalIBits = totalIBits;
541
542 // read the next tag and associated information
543 tagEl.Read (dictFile);
544
545 // update the state record
546 p = tagEl.frag_occur*2;
547 InvfStateRec &tisr = _invfState.GetRec (dictTagNum + _idh.word_dict_size);
548 tisr.start = totalIBits;
549 tisr.here = totalIBits;
550 tisr.B = BIO_Bblock_Init (N+p, p);
551
552 // add the length of the fragment numbers (two numbers for each
553 // tag, one for start and one for end)
554 totalIBits += BIO_Bblock_Bound_b (N+p, p, tisr.B);
555
556 // align next byte
557#ifdef USE_LONG_LONG
558 totalIBits = (totalIBits + 7ull) & 0xfffffffffffffff8ull;
559#else
560 totalIBits = (totalIBits + 7ul) & 0xfffffff8ul;
561#endif
562
563 CheckIntOverflow (totalIBits, lastTotalIBits);
564 }
565
566 fclose (dictFile);
567}
568
569/*
570// assumes the chunk tag information has been placed in .first
571static void PrintChunkInfo (unsigned long chunkMem,
572 unsigned long numChunkWords,
573 unsigned long numChunkTags) {
574 static unsigned long chunksRead = 0;
575 chunksRead++;
576 cout << "Chunk Number: " << chunksRead << "\n";
577 cout << "numChunkDocs " << numDocsInChunk << "\n";
578 cout << "numChunkFrags " << numFragsInChunk << "\n";
579 cout << "mem " << chunkMem << "\n";
580 cout << "numWords " << numChunkWords << "\n";
581 cout << "numTags " << numChunkTags << "\n\n";
582
583 TagMapDict::iterator tagMapHere = tagMapDict.begin();
584 TagMapDict::iterator tagMapEnd = tagMapDict.end();
585 while (tagMapHere != tagMapEnd) {
586 unsigned long tagMapNum = (*tagMapHere).second.tagNum;
587 cout << (*tagMapHere).first << " " << tagMapNum << " "
588 << bitPtrs.GetTagBitPtr(tagMapNum).here << "\n";
589 tagMapHere++;
590 }
591}
592*/
593
594void ReadChunk (invf_dict_header &_idh, bool wordLevelIndex) {
595 // reset globals
596 numChunkDocs = 0;
597 chunkStartFragNum = numFrags;
598
599 // read in information about this chunk
600 numDocsInChunk = chunkBuf.gamma_decode (NULL) - 1;
601 if (numDocsInChunk == 0)
602 FatalError (1, "The number of docs in the current chunk is 0");
603
604 numFragsInChunk = chunkBuf.gamma_decode (NULL) - 1;
605 unsigned long chunkMem = chunkBuf.gamma_decode (NULL) - 1;
606
607 if (chunkMem > ivfMemBufSize)
608 FatalError (1, "Chunk memory size is greater than maximum");
609
610 unsigned long numChunkWords = chunkBuf.gamma_decode (NULL) - 1;
611 unsigned long numChunkTags = chunkBuf.gamma_decode (NULL) - 1;
612
613
614 // reset stuff
615 ClearCharBuf (ivfMemBuf, ivfMemBufSize);
616 bitPtrs.ResetPtrs();
617
618 // read in the entries in occurrence order storing the
619 // "chunkWordCount" in "start" and the "chunkFragCount"
620 // in "here"
621 unsigned long numOccur;
622 unsigned long wordNum;
623 for (numOccur=0; numOccur<numChunkWords; numOccur++) {
624 wordNum = occurConvert.TranslateWord (numOccur);
625 BitPtr &wordPtr = bitPtrs.GetWordBitPtr (wordNum);
626 wordPtr.start = chunkBuf.gamma_decode (NULL) - 1;
627 if (wordPtr.start >= 2)
628 wordPtr.here = chunkBuf.gamma_decode (NULL);
629 else wordPtr.here = wordPtr.start;
630 }
631 unsigned long tagNum;
632 for (numOccur=0; numOccur<numChunkTags; numOccur++) {
633 tagNum = occurConvert.TranslateTag (numOccur);
634 BitPtr &tagPtr = bitPtrs.GetTagBitPtr (tagNum);
635 // only chunkFragCount is encoded for tags
636 tagPtr.start = chunkBuf.gamma_decode (NULL) - 1;
637 tagPtr.here = tagPtr.start;
638 }
639
640 /* PrintChunkInfo (chunkMem, numChunkWords, numChunkTags);*/
641
642 // create the bit ptrs in dictionary order
643 unsigned long totalIBits = 0; // only dealing with memory
644 unsigned long chunkWordCount, chunkFragCount;
645 for (wordNum=0; wordNum<_idh.word_dict_size; wordNum++) {
646 BitPtr &wordPtr = bitPtrs.GetWordBitPtr (wordNum);
647 chunkWordCount = wordPtr.start;
648 chunkFragCount = wordPtr.here;
649 wordPtr.start = totalIBits;
650 wordPtr.here = totalIBits;
651 wordPtr.lastFragNum = chunkStartFragNum;
652 wordPtr.lgB = 0;
653 if (chunkWordCount > 0) {
654 wordPtr.lgB = floorlog_2 (BIO_Bblock_Init_W (numFragsInChunk,
655 chunkFragCount));
656 totalIBits += BIO_Bblock_Bound (numFragsInChunk, chunkFragCount);
657 // use unary encoding for memory buffer encoding of fragment freq
658 if (!wordLevelIndex) {
659 totalIBits += chunkWordCount;
660 }
661 }
662 }
663 for (tagNum=0; tagNum<_idh.tag_dict_size; tagNum++) {
664 BitPtr &tagPtr = bitPtrs.GetTagBitPtr (tagNum);
665 chunkFragCount = tagPtr.here;
666 tagPtr.start = totalIBits;
667 tagPtr.here = totalIBits;
668 tagPtr.lastFragNum = chunkStartFragNum;
669 tagPtr.lgB = 0;
670 if (chunkFragCount > 0) {
671 unsigned long pTag = chunkFragCount*2;
672 tagPtr.lgB = floorlog_2 (BIO_Bblock_Init_W (numFragsInChunk+pTag,
673 pTag));
674 unsigned long bLen = BIO_Bblock_Bound (numFragsInChunk+pTag,
675 pTag);
676// cout << tagNum + _idh.word_dict_size << " ";
677// cout << "numFrags: " << numFragsInChunk
678// << " chunkFragCount: " << chunkFragCount
679// << " B: " << 1 << tagPtr.lgB
680// << " blen: " << blen << "\n";
681 totalIBits += bLen;
682 }
683 }
684 bitPtrs.GetEndStart() = totalIBits;
685 bitPtrs.GetEndHere() = totalIBits;
686
687 if ((totalIBits + 7ul) >> 3ul > chunkMem) {
688 cerr << "totalIBits: " << totalIBits << "\n";
689 cerr << "bytes: " << ((totalIBits + 7ul) >> 3ul) << "\n";
690 cerr << "chunkMem: " << chunkMem << "\n";
691 FatalError (1, "Pointers exceed buffer size");
692 }
693}
694
695
696
697
698int init_ivf_2 (const TagInfo &/*tagInfo*/, char *filename) {
699 // read in compressed dictionary header
700 FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
701 MAGIC_STEM_BUILD, MG_ABORT);
702 idh.Read (dictFile);
703 fclose (dictFile);
704
705 // set the size of the bit ptrs
706 bitPtrs.SetSize (idh.word_dict_size, idh.tag_dict_size);
707
708 // open the chunk file and read in the maximum memory needed
709 // for the inverted memory buffer
710 chunkFile = open_file (filename, INVF_CHUNK_SUFFIX, "rb",
711 MAGIC_CHUNK, MG_ABORT);
712 ReadUL (chunkFile, ivfMemBufSize);
713 chunkBuf.attachFile (chunkFile);
714
715 // allocate memory for the inverted buffer
716 ivfMemBuf = new char [ivfMemBufSize];
717 ClearCharBuf (ivfMemBuf, ivfMemBufSize);
718
719 // read in the word dictionary
720 ReadWordDict (filename);
721
722 // read in the tag dictionary
723 ReadTagDict (filename, idh);
724
725 // read in the level information
726 ReadLevelFile (filename);
727 bool wordLevelIndex = ivfLevel.indexLevel.empty();
728
729 // set up the translation table file
730 occurConvert.Open (filename, idh.word_dict_size, idh.tag_dict_size);
731
732 // reset some globals
733 numDocs = 0;
734 numChunkDocs = 0;
735 numDocsInChunk = 0;
736 numFrags = 0;
737 numFragsInChunk = 0;
738 chunkStartFragNum = 0;
739
740 strcpy (collectFilename, filename);
741
742
743 // create the inverted file
744 mg_ullong totalIBits = 0;
745 FILE *invfFile = create_file (filename, INVF_SUFFIX, "wb",
746 MAGIC_INVF, MG_ABORT);
747 totalIBits += sizeof (unsigned long) * 8; // magic number
748 totalIBits += 8 * 200; // 200 byte gap -- why??????
749 fclose (invfFile);
750
751 // init the inverted file state cache
752 invfState.Open (filename);
753 InitInvfState (filename, idh, invfState, totalIBits, wordLevelIndex);
754
755 return COMPALLOK;
756}
757
758static void CloseTextTag (IP2TagInfo &tInfo, const UCArray &/*tagName*/) {
759 if (!tInfo.inTag) return;
760
761 // add this tag to the inverted list
762 BitPtr &tagBitPtr = bitPtrs.GetTagBitPtr (tInfo.tagNum);
763 unsigned long endFrag = numFrags;
764 int b = 1 << tagBitPtr.lgB;
765
766 /*
767 cout << (tInfo.tagNum+idh.word_dict_size) << " \"<" << tagName << ">\" "
768 << tInfo.startFrag << " " << endFrag << "\n";
769 */
770
771 mems_bitio_buffer buffer ((u_char *) ivfMemBuf, tagBitPtr.here);
772 buffer.bblock_encode (tInfo.startFrag - tagBitPtr.lastFragNum + 1,
773 b, NULL);
774 buffer.bblock_encode (endFrag - tInfo.startFrag + 1, b, NULL);
775 tagBitPtr.lastFragNum = endFrag;
776 tagBitPtr.here = buffer.position();
777 buffer.encodeDone();
778
779 // check for buffer overrun
780 bitPtrs.CheckTagBufOverrun (tInfo.tagNum);
781
782 // reset information about this tag
783 tInfo.inTag = false;
784 tInfo.startFrag = 0;
785}
786
787static void ProcessOpenTag (const TextEl &el, bool &inFrag) {
788 // close tag if open
789 IP2TagInfo &tInfo = tagMapDict[el.tagName];
790 if (tInfo.inTag) CloseTextTag (tInfo, el.tagName);
791
792 // open this tag
793 tInfo.inTag = true;
794 tInfo.startFrag = numFrags;
795
796 // check for start of next fragment
797 bool wordLevelIndex = ivfLevel.indexLevel.empty();
798 if (!wordLevelIndex && el.tagName == ivfLevel.indexLevel) {
799 numFrags++;
800 inFrag = true;
801 }
802}
803
804static void ProcessCloseTag (const TextEl &el, bool &inFrag) {
805 // check for end of fragment
806 bool wordLevelIndex = ivfLevel.indexLevel.empty();
807 if (!wordLevelIndex && el.tagName == ivfLevel.indexLevel) {
808 inFrag = false;
809 }
810
811 IP2TagInfo &tInfo = tagMapDict[el.tagName];
812 CloseTextTag (tInfo, el.tagName);
813}
814
815static void ProcessText (const TextEl &el, bool &inFrag) {
816 // make sure this text is to be indexed
817 bool wordLevelIndex = ivfLevel.indexLevel.empty();
818 if (!wordLevelIndex && !inFrag) return;
819
820 const unsigned char *textHere = el.text.begin();
821 const unsigned char *textEnd = el.text.end() - 1;
822 unsigned char mgWord[MAXSTEMLEN + 1];
823
824 if (!inaword (textHere, textEnd))
825 ParseNonindexWord (textHere, textEnd);
826
827
828 // Alternately parse off words and non-words from the input
829
830 while (textHere <= textEnd) {
831 textHere = ParseIndexMGWord (textHere, textEnd, mgWord);
832 textHere = ParseNonindexWord (textHere, textEnd);
833
834 if (mgWord[0] > 0) {
835 if (wordLevelIndex) numFrags++;
836
837 unsigned long wordNum = perf_hash (wordHashDict, mgWord);
838
839 /*
840 cout << wordNum << " \"";
841 cout.write (mgWord+1, *mgWord);
842 cout << "\" " << numFrags << "\n";
843 */
844
845 // add this word to the inverted list
846 BitPtr &wordBitPtr = bitPtrs.GetWordBitPtr (wordNum);
847 unsigned long fragNum = numFrags;
848 int b = 1 << wordBitPtr.lgB;
849
850 mems_bitio_buffer buffer ((u_char *) ivfMemBuf, wordBitPtr.here);
851
852 // note: this assumes that fragments don't carry over between
853 // chunks (which they don't because all tags are closed at the
854 // end of each document and chunks are based on document
855 // boundaries), i.e. the first fragment number must be greater
856 // than the starting fragment number of the chunk.
857 if (fragNum > wordBitPtr.lastFragNum) {
858 buffer.bblock_encode ((fragNum - wordBitPtr.lastFragNum - 1) + 1,
859 b, NULL);
860 if (!wordLevelIndex) buffer.encodeBit (1); // freq = 1
861
862 } else if (!wordLevelIndex) {
863 // add one to the frequency count for this word
864 buffer.seek (buffer.position()-1);
865 buffer.encodeBit (0); // unary encoding -- last = 1
866 buffer.encodeBit (1);
867 }
868
869 wordBitPtr.lastFragNum = fragNum;
870 wordBitPtr.here = buffer.position();
871 buffer.encodeDone();
872
873 // check for buffer overrun
874 bitPtrs.CheckWordBufOverrun (wordNum);
875 }
876 }
877}
878
879// combine the in memory inverted buffer with the disk
880// based inverted file
881static void DiskMerge (char *filename) {
882 bool wordLevelIndex = ivfLevel.indexLevel.empty();
883
884 // make sure we have something to process
885 if (numChunkDocs <= 0) return;
886
887 // open the inverted file
888 FILE *invfFile = open_file (filename, INVF_SUFFIX, "rb+",
889 MAGIC_INVF, MG_ABORT);
890 random_bitio_buffer invfOutBuf (invfFile);
891
892 // set up to decode the entries in memory
893 mems_bitio_buffer memInBuf ((u_char *) ivfMemBuf, 0);
894
895 // write out the word information
896 unsigned long wordNum;
897 int b;
898 unsigned long currFragNum;
899 unsigned long delta;
900 unsigned long currFreq;
901 for (wordNum=0; wordNum<idh.word_dict_size; wordNum++) {
902 // go to the end of the last inverted file entry
903 InvfStateRec &wordDiskState = invfState.GetRec (wordNum);
904 invfOutBuf.SEEK_X (wordDiskState.here);
905
906 // go to the start of the inverted chunk info in memory
907 BitPtr &wordBitPtr = bitPtrs.GetWordBitPtr (wordNum);
908 memInBuf.seek(wordBitPtr.start);
909
910 // decode each entry and re-write to disk
911 currFragNum = chunkStartFragNum;
912 while (memInBuf.position() < wordBitPtr.here) {
913 // decode word entry
914 b = 1 << wordBitPtr.lgB;
915 delta = memInBuf.bblock_decode (b, NULL);
916 currFragNum += delta;
917 if (!wordLevelIndex) currFreq = memInBuf.unary_decode (NULL);
918 else currFreq = 1;
919
920 // recode on disk
921 invfOutBuf.bblock_encode (currFragNum-wordDiskState.lastFragNum,
922 wordDiskState.B, NULL);
923 if (!wordLevelIndex) invfOutBuf.gamma_encode (currFreq, NULL);
924 wordDiskState.lastFragNum = currFragNum;
925 }
926
927 wordDiskState.here = invfOutBuf.TELL_X();
928 }
929
930 // write out the tag information
931 unsigned long tagNum;
932 unsigned long currTagStart;
933 unsigned long currTagEnd;
934 for (tagNum=0; tagNum<idh.tag_dict_size; tagNum++) {
935 // go to the end of the last inverted file entry
936 InvfStateRec &tagDiskState = invfState.GetRec (tagNum+idh.word_dict_size);
937 invfOutBuf.SEEK_X (tagDiskState.here);
938
939 // go to the start of the inverted chunk info in memory
940 BitPtr &tagBitPtr = bitPtrs.GetTagBitPtr (tagNum);
941 memInBuf.seek(tagBitPtr.start);
942
943 // decode each entry and re-write to disk
944 currTagEnd = chunkStartFragNum;
945 while (memInBuf.position() < tagBitPtr.here) {
946 // decode tag entry
947 b = 1 << tagBitPtr.lgB;
948 delta = memInBuf.bblock_decode (b, NULL) - 1;
949 currTagStart = currTagEnd + delta;
950 delta = memInBuf.bblock_decode (b, NULL) - 1;
951 currTagEnd = currTagStart + delta;
952
953 // recode on disk
954 invfOutBuf.bblock_encode (currTagStart-tagDiskState.lastFragNum+1,
955 tagDiskState.B, NULL);
956 invfOutBuf.bblock_encode (currTagEnd-currTagStart+1,
957 tagDiskState.B, NULL);
958
959 tagDiskState.lastFragNum = currTagEnd;
960 }
961
962 tagDiskState.here = invfOutBuf.TELL_X();
963 }
964
965 memInBuf.done();
966
967 invfOutBuf.encodeDone();
968 fclose (invfFile);
969}
970
971
972int process_ivf_2 (const TagInfo &/*tagInfo*/, const TextElArray &doc) {
973 bool wordLevelIndex = ivfLevel.indexLevel.empty();
974 bool inFrag = false;
975 if (wordLevelIndex) inFrag = true; // unconditional
976
977 // get next chunk information if need to. the chunk information
978 // is needed before the first document is processed
979 if (numChunkDocs >= numDocsInChunk) ReadChunk (idh, wordLevelIndex);
980
981 // process each text element
982 TextElArray::const_iterator here = doc.begin();
983 TextElArray::const_iterator end = doc.end();
984 while (here != end) {
985 // process this element
986 if ((*here).elType == OpenTagE) ProcessOpenTag (*here, inFrag);
987 else if ((*here).elType == CloseTagE) ProcessCloseTag (*here, inFrag);
988 else ProcessText (*here, inFrag);
989
990 here++;
991 }
992
993 // close off any unclosed tags
994 TagMapDict::iterator tdHere = tagMapDict.begin();
995 TagMapDict::iterator tdEnd = tagMapDict.end();
996 while (tdHere != tdEnd) {
997 CloseTextTag ((*tdHere).second, (*tdHere).first);
998 tdHere++;
999 }
1000
1001 // we've processed one more document
1002 numDocs++;
1003 numChunkDocs++;
1004
1005 // merge the memory based inverted file with the one on
1006 // disk if this is the end of this chunk
1007 if (numChunkDocs >= numDocsInChunk) DiskMerge (collectFilename);
1008
1009 return COMPALLOK;
1010}
1011
1012
1013static void CondenseInvfFile (char *filename, unsigned long &bytesOutput) {
1014 FILE *inInvfFile = open_file (filename, INVF_SUFFIX, "rb",
1015 MAGIC_INVF, MG_ABORT);
1016 FILE *outInvfFile = open_file (filename, INVF_SUFFIX, "rb+",
1017 MAGIC_INVF, MG_ABORT);
1018
1019 // skip the magic number
1020 fseek (outInvfFile, sizeof (unsigned long), SEEK_SET);
1021
1022 // write the inverted file header -- use defaults for most things
1023 invf_file_header ifh;
1024 ifh.no_of_words = idh.word_dict_size;
1025 ifh.no_of_tags = idh.tag_dict_size;
1026 ifh.word_level_index = (ivfLevel.indexLevel.empty()) ? 1 : 0;
1027 ifh.Write (outInvfFile);
1028
1029 bytesOutput = ftell (outInvfFile);
1030
1031 // process each meaningful byte in the file
1032 unsigned long numEntries = ifh.no_of_words + ifh.no_of_tags;
1033 unsigned long entryNum;
1034 mg_ullong lastStart = 0;
1035 for (entryNum = 0; entryNum < numEntries; entryNum++) {
1036 InvfStateRec &stateRec = invfState.GetRec (entryNum);
1037
1038 // overrun check
1039 if (stateRec.start < lastStart)
1040 FatalError (1, "Inverted file Buffer overrun");
1041 lastStart = stateRec.start;
1042
1043 unsigned long oldEntryStart = stateRec.start >> 3;
1044 unsigned long oldEntryStartOver = stateRec.start & 7; // should be 0
1045 unsigned long oldEntryEnd = (stateRec.here + 7) >> 3; // byte after end
1046 unsigned long oldEntryEndOver = stateRec.here & 7;
1047
1048 fseek (inInvfFile, oldEntryStart, SEEK_SET);
1049
1050 stateRec.here -= stateRec.start;
1051 stateRec.start = (mg_ullong)bytesOutput * 8 + oldEntryStartOver;
1052 stateRec.here += stateRec.start;
1053 while (oldEntryStart < oldEntryEnd) {
1054 unsigned char c = getc (inInvfFile);
1055 if (oldEntryStart == oldEntryEnd - 1) {
1056 u_char ands[8] =
1057 {0xff, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe};
1058 c &= ands[oldEntryEndOver];
1059 }
1060 putc (c, outInvfFile);
1061 bytesOutput++;
1062 oldEntryStart++;
1063 }
1064 }
1065
1066 fclose (inInvfFile);
1067
1068#ifdef __WIN32__
1069 if (!(_chsize (_fileno (outInvfFile), bytesOutput)))
1070 Message ("Could not truncate invf.");
1071#else
1072 ftruncate (fileno (outInvfFile), bytesOutput);
1073#endif
1074
1075 fclose (outInvfFile);
1076}
1077
1078static void OutputInvfIdx (char *filename, unsigned long invfNumBytes) {
1079 FILE *invfIdxFile = create_file (filename, INVF_IDX_SUFFIX, "wb",
1080 MAGIC_INVI, MG_ABORT);
1081
1082 // process each meaningful byte in the file
1083 unsigned long numEntries = idh.word_dict_size + idh.tag_dict_size;
1084 unsigned long entryNum;
1085 for (entryNum = 0; entryNum < numEntries; entryNum++) {
1086 InvfStateRec &stateRec = invfState.GetRec (entryNum);
1087
1088 // assumes that inverted entries start at beginning of each byte
1089 if (!WriteUL (invfIdxFile, (stateRec.start >> 3))) break;
1090 }
1091
1092 WriteUL (invfIdxFile, invfNumBytes);
1093
1094 fclose (invfIdxFile);
1095}
1096
1097
1098int done_ivf_2 (const TagInfo &/*tagInfo*/, char *filename) {
1099 // close most open files
1100 if (chunkFile != NULL) {
1101 chunkBuf.done();
1102 fclose (chunkFile);
1103 chunkFile = NULL;
1104 }
1105 occurConvert.Close();
1106
1107 // free allocated memory
1108 bitPtrs.Clear();
1109 if (ivfMemBuf != NULL) { delete [] ivfMemBuf; ivfMemBuf = NULL; }
1110 free_perf_hash (wordHashDict);
1111 wordHashDict = NULL;
1112 tagMapDict.erase (tagMapDict.begin(), tagMapDict.end());
1113
1114 // condense the inverted file and truncate it
1115 // this function also writes out the inverted header
1116 unsigned long invfNumBytes = 0;
1117 CondenseInvfFile (filename, invfNumBytes);
1118
1119 OutputInvfIdx (filename, invfNumBytes);
1120
1121 // close the rest of the open files
1122 invfState.Close ();
1123
1124 return COMPALLOK;
1125}
Note: See TracBrowser for help on using the repository browser.