source: trunk/gsdl/src/mgpp/text/ivf.pass2.cpp@ 879

Last change on this file since 879 was 856, checked in by sjboddie, 24 years ago

Rodgers new C++ mg

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 31.2 KB
Line 
1/**************************************************************************
2 *
3 * ivf.pass2.cpp -- Memory efficient pass 2 inversion
4 * Copyright (C) 1999 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: ivf.pass2.cpp 856 2000-01-14 02:26:25Z sjboddie $
21 *
22 **************************************************************************/
23
24/*
25 $Log$
26 Revision 1.1 2000/01/14 02:26:07 sjboddie
27 Rodgers new C++ mg
28
29 */
30
31#include <stdio.h>
32#include <unistd.h>
33
34#include "non_ansi.h"
35
36#include "sysfuncs.h"
37
38#include "mg_files.h"
39#include "invf.h"
40#include "mg.h"
41#include "build.h"
42#include "locallib.h"
43#include "UCArray.h"
44#include "bitio_m_random.h"
45#include "bitio_m_stdio.h"
46#include "bitio_m_mems.h"
47#include "bitio_gen.h"
48#include <stdio.h>
49#include "words.h"
50#include "messages.h"
51#include "netorder.h"
52#include "FIvfLevelInfo.h"
53#include "perf_hash.h"
54#include "string.h"
55
56#include "longlong.h"
57
58#ifdef __WIN32_
59#include <io.h>
60#endif
61
62#if defined(GSDL_USE_OBJECTSPACE)
63# include <ospace\std\map>
64#elif defined(GSDL_USE_STL_H)
65# include <map.h>
66#else
67# include <map>
68#endif
69
70
71#ifdef USE_LONG_LONG
72#define SEEK_X seek_LL
73#define TELL_X tell_LL
74#else
75#define SEEK_X seek
76#define TELL_X tell
77#endif
78
79#ifndef RND_BUF_SIZE
80#define RND_BUF_SIZE 8*1024
81#endif
82
83
84static unsigned long numDocs = 0;
85static unsigned long numChunkDocs = 0;
86static unsigned long numDocsInChunk = 0;
87
88static unsigned long numFrags = 0;
89static unsigned long numFragsInChunk = 0;
90static unsigned long chunkStartFragNum = 0;
91
92
93
94
95struct BitPtr {
96 unsigned long start;
97 unsigned long here;
98 unsigned long lastFragNum;
99 unsigned long lgB;
100
101 void Clear () { start = here = lastFragNum = lgB = 0; }
102 BitPtr () { Clear(); }
103};
104
105class WordBitPtrs {
106protected:
107 unsigned long numWords;
108 unsigned long numTags;
109 unsigned long size;
110 BitPtr *wordBitPtrs;
111
112 void CheckBufOverrun (unsigned long num) {
113 if (wordBitPtrs[num].here > wordBitPtrs[num+1].start) {
114 cerr << "numDocs: " << numDocs << "\n";
115 cerr << "numChunkDocs: " << numChunkDocs << "\n";
116 cerr << "numDocsInChunk: " << numDocsInChunk << "\n";
117 cerr << "numFrags: " << numFrags << "\n";
118 cerr << "numFragsInChunk: " << numFragsInChunk << "\n";
119 cerr << "chunkStartFragNum: " << chunkStartFragNum << "\n";
120 cerr << "num: " << num << "\n";
121 cerr << "[num].start: " << wordBitPtrs[num].start << "\n";
122 cerr << "[num].here: " << wordBitPtrs[num].here << "\n";
123 cerr << "[num+1].start: " << wordBitPtrs[num+1].start << "\n";
124 FatalError (1, "Bit buffer overrun");
125 }
126 }
127
128public:
129 void Clear ();
130 WordBitPtrs () { wordBitPtrs = NULL; Clear(); }
131 ~WordBitPtrs ();
132 void SetSize (unsigned long _numWords,
133 unsigned long _numTags);
134
135 void ResetPtrs () {
136 if (wordBitPtrs == NULL) return;
137 unsigned long i;
138 for (i=0; i<size; i++) wordBitPtrs[i].Clear();
139 }
140
141 BitPtr &GetWordBitPtr (unsigned long wordNum)
142 { return wordBitPtrs[wordNum]; }
143 unsigned long &GetWordStart (unsigned long wordNum)
144 { return wordBitPtrs[wordNum].start; }
145 unsigned long &GetWordHere (unsigned long wordNum)
146 { return wordBitPtrs[wordNum].here; }
147 void CheckWordBufOverrun (unsigned long wordNum)
148 { CheckBufOverrun (wordNum); }
149
150 BitPtr &GetTagBitPtr (unsigned long tagNum)
151 { return wordBitPtrs[tagNum + numWords]; }
152 unsigned long &GetTagStart (unsigned long tagNum)
153 { return wordBitPtrs[tagNum + numWords].start; }
154 unsigned long &GetTagHere (unsigned long tagNum)
155 { return wordBitPtrs[tagNum + numWords].here; }
156 void CheckTagBufOverrun (unsigned long tagNum)
157 { CheckBufOverrun (tagNum + numWords); }
158
159 BitPtr &GetEndBitPtr ()
160 { return wordBitPtrs[size-1]; }
161 unsigned long &GetEndStart ()
162 { return wordBitPtrs[size-1].start; }
163 unsigned long &GetEndHere ()
164 { return wordBitPtrs[size-1].here; }
165};
166
167
168struct IP2TagInfo {
169 bool inTag;
170 unsigned long startFrag;
171 unsigned long tagNum;
172
173 IP2TagInfo () {
174 inTag = false;
175 startFrag = 0;
176 tagNum = 0;
177 }
178};
179
180// maps tags to tag information
181typedef map<UCArray, IP2TagInfo, DictLTUCArray> TagMapDict;
182
183
184// class to handle the translation of occurrence order
185// to dictionary order for words and tags
186class OccurToDictConverter {
187protected:
188 unsigned long pos;
189 unsigned long val;
190 FILE *transFile;
191 random_bitio_buffer rbs;
192
193 unsigned long wordDictSize;
194 unsigned long tagDictSize;
195
196 void SeekStart ();
197 unsigned long TranslateNum (unsigned long num);
198
199public:
200 OccurToDictConverter ();
201 ~OccurToDictConverter ();
202
203 void Open (char *filename, unsigned long _wordDictSize,
204 unsigned long _tagDictSize);
205
206 // Close frees all allocated memory
207 void Close ();
208
209 unsigned long TranslateWord (unsigned long occurNum)
210 { return TranslateNum (occurNum); }
211 unsigned long TranslateTag (unsigned long occurNum)
212 { return TranslateNum (occurNum+wordDictSize); }
213};
214
215
216struct InvfStateRec {
217 mg_ullong start;
218 mg_ullong here;
219 unsigned long lastFragNum;
220 unsigned long B;
221
222 void Clear () {
223 start = here = 0;
224 lastFragNum = B = 0;
225 }
226 InvfStateRec () { Clear (); }
227};
228
229
230#define ISR_SIZE 1024
231
232class InvfStateCache {
233protected:
234 InvfStateRec recCache [ISR_SIZE];
235 unsigned long startNum;
236
237 FILE *stateFile;
238
239 void ClearCache () {
240 unsigned int i = 0;
241 for (i=0; i<ISR_SIZE; i++) recCache[i].Clear();
242 }
243
244public:
245 InvfStateCache ();
246 ~InvfStateCache ();
247
248 void Open (char *filename);
249 void Close ();
250
251 // previous references to state records may be
252 // invalidated calling GetRec
253 InvfStateRec &GetRec (unsigned long num);
254};
255
256
257static invf_dict_header idh;
258static WordBitPtrs bitPtrs;
259
260static FILE *chunkFile = NULL;
261static stdio_bitio_buffer chunkBuf;
262
263static unsigned long ivfMemBufSize = 0;
264static char *ivfMemBuf = NULL;
265
266// word and tag dictionaries. a map is used for the tag dictionary
267// as it should never be very big (and the perfect hash function
268// sometimes has trouble with small values).
269static perf_hash_data *wordHashDict = NULL;
270static TagMapDict tagMapDict;
271
272// information about all the different levels
273static FIvfLevel ivfLevel;
274
275static OccurToDictConverter occurConvert;
276
277// information about the state of the inverted file
278static InvfStateCache invfState;
279
280static char collectFilename[512];
281
282
283void WordBitPtrs::Clear () {
284 numWords = 0;
285 numTags = 0;
286 size=0;
287 if (wordBitPtrs != NULL) delete [] wordBitPtrs;
288 wordBitPtrs = NULL;
289}
290
291WordBitPtrs::~WordBitPtrs () {
292 if (wordBitPtrs != NULL) delete [] wordBitPtrs;
293}
294
295void WordBitPtrs::SetSize (unsigned long _numWords,
296 unsigned long _numTags){
297 Clear();
298 numWords = _numWords;
299 numTags = _numTags;
300 size = numWords + numTags + 1;
301 wordBitPtrs = new BitPtr [size];
302}
303
304
305void OccurToDictConverter::SeekStart () {
306 if (transFile == NULL) return;
307 rbs.SEEK_X (sizeof (unsigned long) * 8);
308 pos = 0;
309}
310
311unsigned long OccurToDictConverter::TranslateNum (unsigned long num) {
312 if (num < pos) SeekStart ();
313 while (pos <= num) {
314 if (pos < wordDictSize)
315 val = rbs.binary_decode (wordDictSize + 1, NULL) - 1;
316 else
317 val = rbs.binary_decode (tagDictSize + 1, NULL) - 1;
318 pos++;
319 }
320 return val;
321}
322
323OccurToDictConverter::OccurToDictConverter () {
324 pos = 0;
325 val = 0;
326 transFile = NULL;
327 wordDictSize = 0;
328 tagDictSize = 0;
329}
330
331OccurToDictConverter::~OccurToDictConverter () {
332 if (transFile != NULL) Close ();
333}
334
335void OccurToDictConverter::Open (char *filename, unsigned long _wordDictSize,
336 unsigned long _tagDictSize) {
337 if (transFile != NULL) Close ();
338
339 wordDictSize = _wordDictSize;
340 tagDictSize = _tagDictSize;
341
342 transFile = open_file (filename, INVF_CHUNK_TRANS_SUFFIX, "rb",
343 MAGIC_CHUNK_TRANS, MG_ABORT);
344 rbs.attachFile (transFile, RND_BUF_SIZE);
345 SeekStart ();
346 val = 0;
347}
348
349void OccurToDictConverter::Close () {
350 if (transFile == NULL) return;
351
352 rbs.done ();
353 fclose (transFile);
354 transFile = NULL;
355 pos = 0;
356 val = 0;
357
358 wordDictSize = 0;
359 tagDictSize = 0;
360}
361
362
363
364
365InvfStateCache::InvfStateCache () {
366 startNum = 0;
367 stateFile = NULL;
368}
369
370InvfStateCache::~InvfStateCache () {
371 if (stateFile != NULL) Close ();
372}
373
374void InvfStateCache::Open (char *filename) {
375 if (stateFile != NULL) Close();
376
377 // open the state file
378 char path[512];
379 sprintf (path, FILE_NAME_FORMAT ".%ld", get_basepath (), filename,
380 ".invf.state", (long) getpid ());
381 if (!(stateFile = fopen (path, "wb+"))) {
382 Message ("Unable to create \"%s\"", path);
383 exit (1);
384 }
385 unlink (path); // file will be deleted after it is closed
386
387 // reset the buffer
388 startNum = 0;
389 ClearCache();
390}
391
392void InvfStateCache::Close () {
393 if (stateFile == NULL) return;
394 fclose (stateFile);
395 stateFile = NULL;
396 startNum = 0;
397}
398
399InvfStateRec &InvfStateCache::GetRec (unsigned long num) {
400 // see if cached
401 if ((num >= startNum) && (num < startNum + ISR_SIZE))
402 return recCache[num-startNum];
403
404 // not cached, write out this lot of records and read in
405 fseek (stateFile, startNum*sizeof (InvfStateRec), SEEK_SET);
406 fwrite ((char *) recCache, sizeof (InvfStateRec), ISR_SIZE, stateFile);
407
408 // read in the new set of records
409 ClearCache ();
410 startNum = num - (num % ISR_SIZE);
411 fseek (stateFile, startNum*sizeof (InvfStateRec), SEEK_SET);
412 fread ((char *) recCache, sizeof (InvfStateRec), ISR_SIZE, stateFile);
413
414 return recCache[num-startNum];
415}
416
417
418
419static void ClearCharBuf (char *buf, unsigned long size) {
420 char *end = buf + size;
421 while (buf != end) *buf++ = 0;
422}
423
424static void ReadWordDict (char *filename) {
425 // read in the perfect hash function for the word dictionary
426 FILE *wordHashFile = open_file (filename, INVF_DICT_HASH_SUFFIX, "rb",
427 MAGIC_HASH, MG_ABORT);
428 if (!(wordHashDict = read_perf_hash_data (wordHashFile))) {
429 FatalError (1, "Unable to read in hash data for word dictionary");
430 }
431 fclose (wordHashFile);
432}
433
434static void ReadTagDict (char *filename, invf_dict_header &_idh) {
435 // open the file
436 FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
437 MAGIC_STEM_BUILD, MG_ABORT);
438
439 // seek to the start of the tag dictionary
440 fseek (dictFile, _idh.tag_dict_start, SEEK_SET);
441
442 unsigned long tagNum;
443 dict_el thisEl;
444 for (tagNum = 0; tagNum < _idh.tag_dict_size; tagNum++) {
445 thisEl.Read (dictFile);
446 tagMapDict[thisEl.el].tagNum = tagNum;
447 }
448
449 fclose (dictFile);
450}
451
452static void ReadLevelFile (char *filename) {
453 FILE *f;
454 f = open_file (filename, INVF_LEVEL_SUFFIX, "rb",
455 MAGIC_INVF_LEVELS, MG_ABORT);
456 ivfLevel.Read (f);
457 fclose (f);
458}
459
460void CheckIntOverflow (mg_ullong totalIBits, mg_ullong lastTotalIBits) {
461 if (totalIBits < lastTotalIBits) {
462 fprintf(stderr, "ERROR: The totalIBits counter (%d byte unsigned integer) has overflowed.\n", sizeof (mg_ullong));
463 if (sizeof (mg_ullong) < 8) {
464 fprintf(stderr, " Try compiling with GCC to enable use of 8 bytes for this counter.\n");
465 }
466 fprintf(stderr, " Build aborted.\n");
467 exit(1);
468 }
469}
470
471// assumes the inverted file state file has been opened
472static void InitInvfState (char *filename,
473 invf_dict_header &_idh,
474 InvfStateCache &_invfState,
475 mg_ullong &totalIBits,
476 bool wordLevelIndex) {
477 // read in the dictionary, setting inverted state information
478 FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
479 MAGIC_STEM_BUILD, MG_ABORT);
480
481 // seek to the start of the word dictionary
482 fseek (dictFile, _idh.word_dict_start, SEEK_SET);
483
484 // add the word entries
485 word_dict_el wordEl;
486 wordEl.SetNumLevels (_idh.num_levels);
487 unsigned long dictWordNum, p;
488 mg_ullong lastTotalIBits;
489 unsigned long N = _idh.num_frags;
490 for (dictWordNum=0; dictWordNum<_idh.word_dict_size; dictWordNum++) {
491 // lastTotalIBits is used to detect integer overflow
492 lastTotalIBits = totalIBits;
493
494 // read the next word and associated information
495 wordEl.Read (dictFile, _idh.num_levels);
496
497 // update the state record
498 p = wordEl.frag_occur;
499 InvfStateRec &wisr = _invfState.GetRec (dictWordNum);
500 wisr.start = totalIBits;
501 wisr.here = totalIBits;
502 wisr.B = BIO_Bblock_Init (N, p);
503
504 // add the length of the fragment numbers
505 totalIBits += BIO_Bblock_Bound_b (N, p, wisr.B);
506
507 // if needed, add the length of the fragment frequency information
508 if (!wordLevelIndex)
509 totalIBits += BIO_Gamma_Bound (wordEl.freq, wordEl.frag_occur);
510
511 // align next byte
512#ifdef USE_LONG_LONG
513 totalIBits = (totalIBits + 7ull) & 0xfffffffffffffff8ull;
514#else
515 totalIBits = (totalIBits + 7ul) & 0xfffffff8ul;
516#endif
517
518 CheckIntOverflow (totalIBits, lastTotalIBits);
519 }
520
521
522 // seek to the start of the tag dictionary
523 fseek (dictFile, _idh.tag_dict_start, SEEK_SET);
524
525 // add the tag entries
526 dict_el tagEl;
527 unsigned long dictTagNum;
528 N = _idh.num_frags;
529 for (dictTagNum=0; dictTagNum<_idh.tag_dict_size; dictTagNum++) {
530 // lastTotalIBits is used to detect integer overflow
531 lastTotalIBits = totalIBits;
532
533 // read the next tag and associated information
534 tagEl.Read (dictFile);
535
536 // update the state record
537 p = tagEl.frag_occur*2;
538 InvfStateRec &tisr = _invfState.GetRec (dictTagNum + _idh.word_dict_size);
539 tisr.start = totalIBits;
540 tisr.here = totalIBits;
541 tisr.B = BIO_Bblock_Init (N+p, p);
542
543 // add the length of the fragment numbers (two numbers for each
544 // tag, one for start and one for end)
545 totalIBits += BIO_Bblock_Bound_b (N+p, p, tisr.B);
546
547 // align next byte
548#ifdef USE_LONG_LONG
549 totalIBits = (totalIBits + 7ull) & 0xfffffffffffffff8ull;
550#else
551 totalIBits = (totalIBits + 7ul) & 0xfffffff8ul;
552#endif
553
554 CheckIntOverflow (totalIBits, lastTotalIBits);
555 }
556
557 fclose (dictFile);
558}
559
560/*
561// assumes the chunk tag information has been placed in .first
562static void PrintChunkInfo (unsigned long chunkMem,
563 unsigned long numChunkWords,
564 unsigned long numChunkTags) {
565 static unsigned long chunksRead = 0;
566 chunksRead++;
567 cout << "Chunk Number: " << chunksRead << "\n";
568 cout << "numChunkDocs " << numDocsInChunk << "\n";
569 cout << "numChunkFrags " << numFragsInChunk << "\n";
570 cout << "mem " << chunkMem << "\n";
571 cout << "numWords " << numChunkWords << "\n";
572 cout << "numTags " << numChunkTags << "\n\n";
573
574 TagMapDict::iterator tagMapHere = tagMapDict.begin();
575 TagMapDict::iterator tagMapEnd = tagMapDict.end();
576 while (tagMapHere != tagMapEnd) {
577 unsigned long tagMapNum = (*tagMapHere).second.tagNum;
578 cout << (*tagMapHere).first << " " << tagMapNum << " "
579 << bitPtrs.GetTagBitPtr(tagMapNum).here << "\n";
580 tagMapHere++;
581 }
582}
583*/
584
585void ReadChunk (invf_dict_header &_idh, bool wordLevelIndex) {
586 // reset globals
587 numChunkDocs = 0;
588 chunkStartFragNum = numFrags;
589
590 // read in information about this chunk
591 numDocsInChunk = chunkBuf.gamma_decode (NULL) - 1;
592 if (numDocsInChunk == 0)
593 FatalError (1, "The number of docs in the current chunk is 0");
594
595 numFragsInChunk = chunkBuf.gamma_decode (NULL) - 1;
596 unsigned long chunkMem = chunkBuf.gamma_decode (NULL) - 1;
597
598 if (chunkMem > ivfMemBufSize)
599 FatalError (1, "Chunk memory size is greater than maximum");
600
601 unsigned long numChunkWords = chunkBuf.gamma_decode (NULL) - 1;
602 unsigned long numChunkTags = chunkBuf.gamma_decode (NULL) - 1;
603
604
605 // reset stuff
606 ClearCharBuf (ivfMemBuf, ivfMemBufSize);
607 bitPtrs.ResetPtrs();
608
609 // read in the entries in occurrence order storing the
610 // "chunkWordCount" in "start" and the "chunkFragCount"
611 // in "here"
612 unsigned long numOccur;
613 unsigned long wordNum;
614 for (numOccur=0; numOccur<numChunkWords; numOccur++) {
615 wordNum = occurConvert.TranslateWord (numOccur);
616 BitPtr &wordPtr = bitPtrs.GetWordBitPtr (wordNum);
617 wordPtr.start = chunkBuf.gamma_decode (NULL) - 1;
618 if (wordPtr.start >= 2)
619 wordPtr.here = chunkBuf.gamma_decode (NULL);
620 else wordPtr.here = wordPtr.start;
621 }
622 unsigned long tagNum;
623 for (numOccur=0; numOccur<numChunkTags; numOccur++) {
624 tagNum = occurConvert.TranslateTag (numOccur);
625 BitPtr &tagPtr = bitPtrs.GetTagBitPtr (tagNum);
626 // only chunkFragCount is encoded for tags
627 tagPtr.start = chunkBuf.gamma_decode (NULL) - 1;
628 tagPtr.here = tagPtr.start;
629 }
630
631 /* PrintChunkInfo (chunkMem, numChunkWords, numChunkTags);*/
632
633 // create the bit ptrs in dictionary order
634 unsigned long totalIBits = 0; // only dealing with memory
635 unsigned long chunkWordCount, chunkFragCount;
636 for (wordNum=0; wordNum<_idh.word_dict_size; wordNum++) {
637 BitPtr &wordPtr = bitPtrs.GetWordBitPtr (wordNum);
638 chunkWordCount = wordPtr.start;
639 chunkFragCount = wordPtr.here;
640 wordPtr.start = totalIBits;
641 wordPtr.here = totalIBits;
642 wordPtr.lastFragNum = chunkStartFragNum;
643 wordPtr.lgB = 0;
644 if (chunkWordCount > 0) {
645 wordPtr.lgB = floorlog_2 (BIO_Bblock_Init_W (numFragsInChunk,
646 chunkFragCount));
647 totalIBits += BIO_Bblock_Bound (numFragsInChunk, chunkFragCount);
648 // use unary encoding for memory buffer encoding of fragment freq
649 if (!wordLevelIndex) {
650 totalIBits += chunkWordCount;
651 }
652 }
653 }
654 for (tagNum=0; tagNum<_idh.tag_dict_size; tagNum++) {
655 BitPtr &tagPtr = bitPtrs.GetTagBitPtr (tagNum);
656 chunkFragCount = tagPtr.here;
657 tagPtr.start = totalIBits;
658 tagPtr.here = totalIBits;
659 tagPtr.lastFragNum = chunkStartFragNum;
660 tagPtr.lgB = 0;
661 if (chunkFragCount > 0) {
662 unsigned long pTag = chunkFragCount*2;
663 tagPtr.lgB = floorlog_2 (BIO_Bblock_Init_W (numFragsInChunk+pTag,
664 pTag));
665 unsigned long bLen = BIO_Bblock_Bound (numFragsInChunk+pTag,
666 pTag);
667// cout << tagNum + _idh.word_dict_size << " ";
668// cout << "numFrags: " << numFragsInChunk
669// << " chunkFragCount: " << chunkFragCount
670// << " B: " << 1 << tagPtr.lgB
671// << " blen: " << blen << "\n";
672 totalIBits += bLen;
673 }
674 }
675 bitPtrs.GetEndStart() = totalIBits;
676 bitPtrs.GetEndHere() = totalIBits;
677
678 if ((totalIBits + 7ul) >> 3ul > chunkMem) {
679 cerr << "totalIBits: " << totalIBits << "\n";
680 cerr << "bytes: " << ((totalIBits + 7ul) >> 3ul) << "\n";
681 cerr << "chunkMem: " << chunkMem << "\n";
682 FatalError (1, "Pointers exceed buffer size");
683 }
684}
685
686
687
688
689int init_ivf_2 (const TagInfo &/*tagInfo*/, char *filename) {
690 // read in compressed dictionary header
691 FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
692 MAGIC_STEM_BUILD, MG_ABORT);
693 idh.Read (dictFile);
694 fclose (dictFile);
695
696 // set the size of the bit ptrs
697 bitPtrs.SetSize (idh.word_dict_size, idh.tag_dict_size);
698
699 // open the chunk file and read in the maximum memory needed
700 // for the inverted memory buffer
701 chunkFile = open_file (filename, INVF_CHUNK_SUFFIX, "rb",
702 MAGIC_CHUNK, MG_ABORT);
703 ReadUL (chunkFile, ivfMemBufSize);
704 chunkBuf.attachFile (chunkFile);
705
706 // allocate memory for the inverted buffer
707 ivfMemBuf = new char [ivfMemBufSize];
708 ClearCharBuf (ivfMemBuf, ivfMemBufSize);
709
710 // read in the word dictionary
711 ReadWordDict (filename);
712
713 // read in the tag dictionary
714 ReadTagDict (filename, idh);
715
716 // read in the level information
717 ReadLevelFile (filename);
718 bool wordLevelIndex = ivfLevel.indexLevel.empty();
719
720 // set up the translation table file
721 occurConvert.Open (filename, idh.word_dict_size, idh.tag_dict_size);
722
723 // reset some globals
724 numDocs = 0;
725 numChunkDocs = 0;
726 numDocsInChunk = 0;
727 numFrags = 0;
728 numFragsInChunk = 0;
729 chunkStartFragNum = 0;
730
731 strcpy (collectFilename, filename);
732
733
734 // create the inverted file
735 mg_ullong totalIBits = 0;
736 FILE *invfFile = create_file (filename, INVF_SUFFIX, "wb",
737 MAGIC_INVF, MG_ABORT);
738 totalIBits += sizeof (unsigned long) * 8; // magic number
739 totalIBits += 8 * 200; // 200 byte gap -- why??????
740 fclose (invfFile);
741
742 // init the inverted file state cache
743 invfState.Open (filename);
744 InitInvfState (filename, idh, invfState, totalIBits, wordLevelIndex);
745
746 return COMPALLOK;
747}
748
749static void CloseTextTag (IP2TagInfo &tInfo, const UCArray &/*tagName*/) {
750 if (!tInfo.inTag) return;
751
752 // add this tag to the inverted list
753 BitPtr &tagBitPtr = bitPtrs.GetTagBitPtr (tInfo.tagNum);
754 unsigned long endFrag = numFrags;
755 int b = 1 << tagBitPtr.lgB;
756
757 /*
758 cout << (tInfo.tagNum+idh.word_dict_size) << " \"<" << tagName << ">\" "
759 << tInfo.startFrag << " " << endFrag << "\n";
760 */
761
762 mems_bitio_buffer buffer ((u_char *) ivfMemBuf, tagBitPtr.here);
763 buffer.bblock_encode (tInfo.startFrag - tagBitPtr.lastFragNum + 1,
764 b, NULL);
765 buffer.bblock_encode (endFrag - tInfo.startFrag + 1, b, NULL);
766 tagBitPtr.lastFragNum = endFrag;
767 tagBitPtr.here = buffer.position();
768 buffer.encodeDone();
769
770 // check for buffer overrun
771 bitPtrs.CheckTagBufOverrun (tInfo.tagNum);
772
773 // reset information about this tag
774 tInfo.inTag = false;
775 tInfo.startFrag = 0;
776}
777
778static void ProcessOpenTag (const TextEl &el, bool &inFrag) {
779 // close tag if open
780 IP2TagInfo &tInfo = tagMapDict[el.tagName];
781 if (tInfo.inTag) CloseTextTag (tInfo, el.tagName);
782
783 // open this tag
784 tInfo.inTag = true;
785 tInfo.startFrag = numFrags;
786
787 // check for start of next fragment
788 bool wordLevelIndex = ivfLevel.indexLevel.empty();
789 if (!wordLevelIndex && el.tagName == ivfLevel.indexLevel) {
790 numFrags++;
791 inFrag = true;
792 }
793}
794
795static void ProcessCloseTag (const TextEl &el, bool &inFrag) {
796 // check for end of fragment
797 bool wordLevelIndex = ivfLevel.indexLevel.empty();
798 if (!wordLevelIndex && el.tagName == ivfLevel.indexLevel) {
799 inFrag = false;
800 }
801
802 IP2TagInfo &tInfo = tagMapDict[el.tagName];
803 CloseTextTag (tInfo, el.tagName);
804}
805
806static void ProcessText (const TextEl &el, bool &inFrag) {
807 // make sure this text is to be indexed
808 bool wordLevelIndex = ivfLevel.indexLevel.empty();
809 if (!wordLevelIndex && !inFrag) return;
810
811 const unsigned char *textHere = el.text.begin();
812 const unsigned char *textEnd = el.text.end() - 1;
813 unsigned char mgWord[MAXSTEMLEN + 1];
814
815 if (!inaword (textHere, textEnd))
816 ParseNonindexWord (textHere, textEnd);
817
818
819 // Alternately parse off words and non-words from the input
820
821 while (textHere <= textEnd) {
822 textHere = ParseIndexMGWord (textHere, textEnd, mgWord);
823 textHere = ParseNonindexWord (textHere, textEnd);
824
825 if (mgWord[0] > 0) {
826 if (wordLevelIndex) numFrags++;
827
828 unsigned long wordNum = perf_hash (wordHashDict, mgWord);
829
830 /*
831 cout << wordNum << " \"";
832 cout.write (mgWord+1, *mgWord);
833 cout << "\" " << numFrags << "\n";
834 */
835
836 // add this word to the inverted list
837 BitPtr &wordBitPtr = bitPtrs.GetWordBitPtr (wordNum);
838 unsigned long fragNum = numFrags;
839 int b = 1 << wordBitPtr.lgB;
840
841 mems_bitio_buffer buffer ((u_char *) ivfMemBuf, wordBitPtr.here);
842
843 // note: this assumes that fragments don't carry over between
844 // chunks (which they don't because all tags are closed at the
845 // end of each document and chunks are based on document
846 // boundaries), i.e. the first fragment number must be greater
847 // than the starting fragment number of the chunk.
848 if (fragNum > wordBitPtr.lastFragNum) {
849 buffer.bblock_encode ((fragNum - wordBitPtr.lastFragNum - 1) + 1,
850 b, NULL);
851 if (!wordLevelIndex) buffer.encodeBit (1); // freq = 1
852
853 } else if (!wordLevelIndex) {
854 // add one to the frequency count for this word
855 buffer.seek (buffer.position()-1);
856 buffer.encodeBit (0); // unary encoding -- last = 1
857 buffer.encodeBit (1);
858 }
859
860 wordBitPtr.lastFragNum = fragNum;
861 wordBitPtr.here = buffer.position();
862 buffer.encodeDone();
863
864 // check for buffer overrun
865 bitPtrs.CheckWordBufOverrun (wordNum);
866 }
867 }
868}
869
870// combine the in memory inverted buffer with the disk
871// based inverted file
872static void DiskMerge (char *filename) {
873 bool wordLevelIndex = ivfLevel.indexLevel.empty();
874
875 // make sure we have something to process
876 if (numChunkDocs <= 0) return;
877
878 // open the inverted file
879 FILE *invfFile = open_file (filename, INVF_SUFFIX, "rb+",
880 MAGIC_INVF, MG_ABORT);
881 random_bitio_buffer invfOutBuf (invfFile);
882
883 // set up to decode the entries in memory
884 mems_bitio_buffer memInBuf ((u_char *) ivfMemBuf, 0);
885
886 // write out the word information
887 unsigned long wordNum;
888 int b;
889 unsigned long currFragNum;
890 unsigned long delta;
891 unsigned long currFreq;
892 for (wordNum=0; wordNum<idh.word_dict_size; wordNum++) {
893 // go to the end of the last inverted file entry
894 InvfStateRec &wordDiskState = invfState.GetRec (wordNum);
895 invfOutBuf.SEEK_X (wordDiskState.here);
896
897 // go to the start of the inverted chunk info in memory
898 BitPtr &wordBitPtr = bitPtrs.GetWordBitPtr (wordNum);
899 memInBuf.seek(wordBitPtr.start);
900
901 // decode each entry and re-write to disk
902 currFragNum = chunkStartFragNum;
903 while (memInBuf.position() < wordBitPtr.here) {
904 // decode word entry
905 b = 1 << wordBitPtr.lgB;
906 delta = memInBuf.bblock_decode (b, NULL);
907 currFragNum += delta;
908 if (!wordLevelIndex) currFreq = memInBuf.unary_decode (NULL);
909 else currFreq = 1;
910
911 // recode on disk
912 invfOutBuf.bblock_encode (currFragNum-wordDiskState.lastFragNum,
913 wordDiskState.B, NULL);
914 if (!wordLevelIndex) invfOutBuf.gamma_encode (currFreq, NULL);
915 wordDiskState.lastFragNum = currFragNum;
916 }
917
918 wordDiskState.here = invfOutBuf.TELL_X();
919 }
920
921 // write out the tag information
922 unsigned long tagNum;
923 unsigned long currTagStart;
924 unsigned long currTagEnd;
925 for (tagNum=0; tagNum<idh.tag_dict_size; tagNum++) {
926 // go to the end of the last inverted file entry
927 InvfStateRec &tagDiskState = invfState.GetRec (tagNum+idh.word_dict_size);
928 invfOutBuf.SEEK_X (tagDiskState.here);
929
930 // go to the start of the inverted chunk info in memory
931 BitPtr &tagBitPtr = bitPtrs.GetTagBitPtr (tagNum);
932 memInBuf.seek(tagBitPtr.start);
933
934 // decode each entry and re-write to disk
935 currTagEnd = chunkStartFragNum;
936 while (memInBuf.position() < tagBitPtr.here) {
937 // decode tag entry
938 b = 1 << tagBitPtr.lgB;
939 delta = memInBuf.bblock_decode (b, NULL) - 1;
940 currTagStart = currTagEnd + delta;
941 delta = memInBuf.bblock_decode (b, NULL) - 1;
942 currTagEnd = currTagStart + delta;
943
944 // recode on disk
945 invfOutBuf.bblock_encode (currTagStart-tagDiskState.lastFragNum+1,
946 tagDiskState.B, NULL);
947 invfOutBuf.bblock_encode (currTagEnd-currTagStart+1,
948 tagDiskState.B, NULL);
949
950 tagDiskState.lastFragNum = currTagEnd;
951 }
952
953 tagDiskState.here = invfOutBuf.TELL_X();
954 }
955
956 memInBuf.done();
957
958 invfOutBuf.encodeDone();
959 fclose (invfFile);
960}
961
962
963int process_ivf_2 (const TagInfo &/*tagInfo*/, const TextElArray &doc) {
964 bool wordLevelIndex = ivfLevel.indexLevel.empty();
965 bool inFrag = false;
966 if (wordLevelIndex) inFrag = true; // unconditional
967
968 // get next chunk information if need to. the chunk information
969 // is needed before the first document is processed
970 if (numChunkDocs >= numDocsInChunk) ReadChunk (idh, wordLevelIndex);
971
972 // process each text element
973 TextElArray::const_iterator here = doc.begin();
974 TextElArray::const_iterator end = doc.end();
975 while (here != end) {
976 // process this element
977 if ((*here).elType == OpenTagE) ProcessOpenTag (*here, inFrag);
978 else if ((*here).elType == CloseTagE) ProcessCloseTag (*here, inFrag);
979 else ProcessText (*here, inFrag);
980
981 here++;
982 }
983
984 // close off any unclosed tags
985 TagMapDict::iterator tdHere = tagMapDict.begin();
986 TagMapDict::iterator tdEnd = tagMapDict.end();
987 while (tdHere != tdEnd) {
988 CloseTextTag ((*tdHere).second, (*tdHere).first);
989 tdHere++;
990 }
991
992 // we've processed one more document
993 numDocs++;
994 numChunkDocs++;
995
996 // merge the memory based inverted file with the one on
997 // disk if this is the end of this chunk
998 if (numChunkDocs >= numDocsInChunk) DiskMerge (collectFilename);
999
1000 return COMPALLOK;
1001}
1002
1003
1004static void CondenseInvfFile (char *filename, unsigned long &bytesOutput) {
1005 FILE *inInvfFile = open_file (filename, INVF_SUFFIX, "rb",
1006 MAGIC_INVF, MG_ABORT);
1007 FILE *outInvfFile = open_file (filename, INVF_SUFFIX, "rb+",
1008 MAGIC_INVF, MG_ABORT);
1009
1010 // skip the magic number
1011 fseek (outInvfFile, sizeof (unsigned long), SEEK_SET);
1012
1013 // write the inverted file header -- use defaults for most things
1014 invf_file_header ifh;
1015 ifh.no_of_words = idh.word_dict_size;
1016 ifh.no_of_tags = idh.tag_dict_size;
1017 ifh.word_level_index = (ivfLevel.indexLevel.empty()) ? 1 : 0;
1018 ifh.Write (outInvfFile);
1019
1020 bytesOutput = ftell (outInvfFile);
1021
1022 // process each meaningful byte in the file
1023 unsigned long numEntries = ifh.no_of_words + ifh.no_of_tags;
1024 unsigned long entryNum;
1025 mg_ullong lastStart = 0;
1026 for (entryNum = 0; entryNum < numEntries; entryNum++) {
1027 InvfStateRec &stateRec = invfState.GetRec (entryNum);
1028
1029 // overrun check
1030 if (stateRec.start < lastStart)
1031 FatalError (1, "Inverted file Buffer overrun");
1032 lastStart = stateRec.start;
1033
1034 unsigned long oldEntryStart = stateRec.start >> 3;
1035 unsigned long oldEntryStartOver = stateRec.start & 7; // should be 0
1036 unsigned long oldEntryEnd = (stateRec.here + 7) >> 3; // byte after end
1037 unsigned long oldEntryEndOver = stateRec.here & 7;
1038
1039 fseek (inInvfFile, oldEntryStart, SEEK_SET);
1040
1041 stateRec.here -= stateRec.start;
1042 stateRec.start = bytesOutput * 8 + oldEntryStartOver;
1043 stateRec.here += stateRec.start;
1044 while (oldEntryStart < oldEntryEnd) {
1045 unsigned char c = getc (inInvfFile);
1046 if (oldEntryStart == oldEntryEnd - 1) {
1047 u_char ands[8] =
1048 {0xff, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe};
1049 c &= ands[oldEntryEndOver];
1050 }
1051 putc (c, outInvfFile);
1052 bytesOutput++;
1053 oldEntryStart++;
1054 }
1055 }
1056
1057 fclose (inInvfFile);
1058
1059#ifdef __WIN32__
1060 if (!(_chsize (_fileno (outInvfFile), bytesOutput)))
1061 Message ("Could not truncate invf.");
1062#else
1063 ftruncate (fileno (outInvfFile), bytesOutput);
1064#endif
1065
1066 fclose (outInvfFile);
1067}
1068
1069static void OutputInvfIdx (char *filename, unsigned long invfNumBytes) {
1070 FILE *invfIdxFile = create_file (filename, INVF_IDX_SUFFIX, "wb",
1071 MAGIC_INVI, MG_ABORT);
1072
1073 // process each meaningful byte in the file
1074 unsigned long numEntries = idh.word_dict_size + idh.tag_dict_size;
1075 unsigned long entryNum;
1076 for (entryNum = 0; entryNum < numEntries; entryNum++) {
1077 InvfStateRec &stateRec = invfState.GetRec (entryNum);
1078
1079 // assumes that inverted entries start at beginning of each byte
1080 if (!WriteUL (invfIdxFile, (stateRec.start >> 3))) break;
1081 }
1082
1083 WriteUL (invfIdxFile, invfNumBytes);
1084
1085 fclose (invfIdxFile);
1086}
1087
1088
1089int done_ivf_2 (const TagInfo &/*tagInfo*/, char *filename) {
1090 // close most open files
1091 if (chunkFile != NULL) {
1092 chunkBuf.done();
1093 fclose (chunkFile);
1094 chunkFile = NULL;
1095 }
1096 occurConvert.Close();
1097
1098 // free allocated memory
1099 bitPtrs.Clear();
1100 if (ivfMemBuf != NULL) { delete [] ivfMemBuf; ivfMemBuf = NULL; }
1101 free_perf_hash (wordHashDict);
1102 wordHashDict = NULL;
1103 tagMapDict.erase (tagMapDict.begin(), tagMapDict.end());
1104
1105 // condense the inverted file and truncate it
1106 // this function also writes out the inverted header
1107 unsigned long invfNumBytes = 0;
1108 CondenseInvfFile (filename, invfNumBytes);
1109
1110 OutputInvfIdx (filename, invfNumBytes);
1111
1112 // close the rest of the open files
1113 invfState.Close ();
1114
1115 return COMPALLOK;
1116}
Note: See TracBrowser for help on using the repository browser.