source: trunk/indexers/mgpp/text/ivf.pass1.cpp@ 3365

Last change on this file since 3365 was 3365, checked in by kjdon, 22 years ago

Initial revision

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 21.9 KB
Line 
1/**************************************************************************
2 *
3 * ivf.pass1.cpp -- Memory efficient pass 1 inversion
4 * Copyright (C) 1999 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22#include "UCArray.h"
23#include "sysfuncs.h"
24#include "mg_files.h"
25#include "invf.h"
26#include "mg.h"
27#include "build.h"
28#include "locallib.h"
29#include "bitio_m_stdio.h"
30#include "bitio_gen.h"
31#include <stdio.h>
32#include "words.h"
33#include "messages.h"
34#include "netorder.h"
35#include "FIvfLevelInfo.h"
36
37#include "longlong.h"
38
39#if defined(GSDL_USE_OBJECTSPACE)
40# include <ospace\std\map>
41#elif defined(GSDL_USE_STL_H)
42# include <map.h>
43#else
44# include <map>
45#endif
46
47// a fragment corresponds to the index level (word-level is the
48// minimum index level)
49
50
// Running tally kept per word for one level tag.
struct LevelWorker {
  // value of the level's numEntries counter when this word was last
  // counted for the level
  unsigned long lastLevelDocNum;
  // how many times the word has been counted for the level
  unsigned long count;

  // both fields start at zero
  LevelWorker () : lastLevelDocNum (0), count (0) {}
};
60
61
62// note: the word is stored in the map
63struct IvfWordInfo {
64 unsigned long wordCount; // word frequency
65 unsigned long fragCount; // number of fragments that contain the word
66 unsigned long lastFragNum; // last fragment to contain the word
67 unsigned long chunkWordCount; // word frequency within this chunk
68 unsigned long chunkFragCount; // number of fragments within this chunk that
69 // contain the word
70
71 LevelWorker *levels; // level info for this word
72
73 IvfWordInfo ();
74 ~IvfWordInfo ();
75 void Clear (); // will delete levels
76};
77
78typedef map<UCArray, IvfWordInfo, DictLTUCArray> IvfWordInfoMap;
79typedef vector<IvfWordInfoMap::iterator> IvfWordInfoItArray;
80
// Statistics gathered for a single tag.
// tags don't require as much information as words
struct IvfTagInfo {
  unsigned long tagCount;        // tag frequency
  unsigned long fragCount;       // number of fragments that contain the tag
  unsigned long lastFragNum;     // last fragment to contain the tag
  unsigned long chunkFragCount;  // number of fragments within this chunk
                                 // that contain the tag

  IvfTagInfo ();
  void Clear ();                 // zeroes every counter
};
92
93typedef map<UCArray, IvfTagInfo, DictLTUCArray> IvfTagInfoMap;
94typedef vector<IvfTagInfoMap::iterator> IvfTagInfoItArray;
95
96
// Tuning constants for deciding when to measure the memory needed by the
// current chunk (see init_ivf_1 and process_ivf_1).

// the first check point is (invf_buffer_size * INIT_CHECK_FRAC) / CHECK_DIV
#define INIT_CHECK_FRAC 0.10
// scales how far the check point advances when the chunk is not yet full
#define CHECK_FRAC 0.75
// a chunk is written once it needs this fraction of invf_buffer_size
#define CHECK_CLOSE 0.999
// divisor applied to the initial check point
#define CHECK_DIV 1.5
101
102
103static FILE *ic; // the invf chunk file
104static stdio_bitio_buffer icb;
105
106IvfWordInfoMap ivfWordInfo;
107IvfWordInfoItArray ivfWordInfoOccurOrder;
108IvfTagInfoMap ivfTagInfo;
109IvfTagInfoItArray ivfTagInfoOccurOrder;
110
111static unsigned long chunksWritten;
112static unsigned long maxMemNeeded;
113
114static unsigned long numDocs;
115static unsigned long numChunkDocs;
116
117static unsigned long numFrags;
118static unsigned long numChunkFrags;
119
120static unsigned long numWords;
121
122// the number of document numbers in the inverted file
123static unsigned long numChunkEntries;
124
125// next entry in the inverted file to check memory
126// requirements for the current chunk
127static unsigned long entryCheckPoint;
128
129// information about all the different levels
130static FIvfLevel ivfLevel;
131
132
133
134IvfWordInfo::IvfWordInfo () {
135 levels = NULL;
136 Clear();
137}
138
139IvfWordInfo::~IvfWordInfo () {
140 if (levels != NULL) delete [] levels;
141}
142
143void IvfWordInfo::Clear () {
144 wordCount = 0;
145 chunkWordCount = 0;
146 lastFragNum = 0;
147 fragCount = 0;
148 chunkFragCount = 0;
149
150 if (levels != NULL) {
151 delete [] levels;
152 levels = NULL;
153 }
154}
155
156
157IvfTagInfo::IvfTagInfo () {
158 Clear();
159}
160
161void IvfTagInfo::Clear () {
162 tagCount = 0;
163 lastFragNum = 0;
164 fragCount = 0;
165 chunkFragCount = 0;
166}
167
168
169
170int init_ivf_1 (const TagInfo &tagInfo, char *file_name) {
171 // set up the chunk file
172 if (!(ic = create_file (file_name, INVF_CHUNK_SUFFIX, "wb",
173 MAGIC_CHUNK, MG_MESSAGE)))
174 return COMPERROR;
175 fwrite (" ", sizeof (u_long), 1, ic); // Space for the maxmem
176 icb.attachFile (ic);
177 icb.encodeStart();
178
179 // reset global variables
180 ivfWordInfo.erase (ivfWordInfo.begin(), ivfWordInfo.end());
181 ivfWordInfoOccurOrder.erase (ivfWordInfoOccurOrder.begin(), ivfWordInfoOccurOrder.end());
182 ivfTagInfo.erase (ivfTagInfo.begin(), ivfTagInfo.end());
183 ivfTagInfoOccurOrder.erase (ivfTagInfoOccurOrder.begin(), ivfTagInfoOccurOrder.end());
184
185 chunksWritten = 0;
186 maxMemNeeded = 0;
187
188 numDocs = 0;
189 numChunkDocs = 0;
190
191 numFrags = 0;
192 numChunkFrags = 0;
193
194 numWords = 0;
195
196 numChunkEntries = 0;
197 entryCheckPoint = (unsigned long) ((invf_buffer_size * INIT_CHECK_FRAC) / CHECK_DIV);
198
199 // init the level information
200 ivfLevel.Clear();
201 ivfLevel.docTag = tagInfo.docTag;
202 ivfLevel.indexLevel = tagInfo.indexLevel;
203 IvfLevelInfo blankLevel;
204 UCArraySet::const_iterator levelHere = tagInfo.levelTags.begin();
205 UCArraySet::const_iterator levelEnd = tagInfo.levelTags.end();
206 while (levelHere != levelEnd) {
207 blankLevel.levelTag = *levelHere;
208 ivfLevel.levelInfo[*levelHere] = blankLevel;
209 levelHere++;
210 }
211
212 return COMPALLOK;
213}
214
215static void ProcessOpenTag (const TagInfo &tagInfo, const TextEl &el,
216 bool &inFrag) {
217 bool wordLevelIndex = tagInfo.indexLevel.empty();
218
219 // check for start of next fragment
220 if (!wordLevelIndex && el.tagName == tagInfo.indexLevel) {
221 numFrags++;
222 numChunkFrags++;
223 inFrag = true;
224 }
225
226 // update tag stats
227 IvfTagInfo &i = ivfTagInfo[el.tagName];
228 if (i.tagCount == 0) {
229 // new tag, add to list of iterators
230 IvfTagInfoMap::iterator iIt = ivfTagInfo.find (el.tagName);
231 ivfTagInfoOccurOrder.push_back (iIt);
232 }
233 i.tagCount++;
234
235 // all open tags count as new tags
236 numChunkEntries++;
237 i.fragCount++;
238 i.chunkFragCount++;
239 i.lastFragNum = numFrags;
240
241 // update level information
242 IvfLevelInfoMap::iterator levelIt = ivfLevel.levelInfo.find (el.tagName);
243 if (levelIt != ivfLevel.levelInfo.end()) {
244 // is a level tag
245 (*levelIt).second.numEntries++;
246 (*levelIt).second.workInLevel = true;
247 }
248}
249
250static void ProcessCloseTag (const TagInfo &tagInfo, const TextEl &el,
251 bool &inFrag) {
252 bool wordLevelIndex = tagInfo.indexLevel.empty();
253
254 // check for end of fragment
255 if (!wordLevelIndex && el.tagName == tagInfo.indexLevel) {
256 inFrag = false;
257 }
258
259 // update level information
260 IvfLevelInfoMap::iterator levelIt = ivfLevel.levelInfo.find (el.tagName);
261 if (levelIt != ivfLevel.levelInfo.end()) {
262 // is a level tag
263 (*levelIt).second.workInLevel = false;
264 }
265}
266
267static void ProcessText (const TagInfo &tagInfo, const TextEl &el,
268 bool &inFrag) {
269 bool wordLevelIndex = tagInfo.indexLevel.empty();
270
271 // make sure this text is to be indexed
272 if (!wordLevelIndex && !inFrag) return;
273
274 const unsigned char *textHere = &(el.text[0]);
275 const unsigned char *textEnd = &(el.text[el.text.size() - 1]);
276 UCArray word;
277
278 if (!inaword (textHere, textEnd))
279 ParseNonindexWord (textHere, textEnd);
280
281
282 // Alternately parse off words and non-words from the input
283 // Each token is then inserted into the set if it does
284 // not exist or has it's frequency count incremented if it does.
285
286 while (textHere <= textEnd) {
287 textHere = ParseIndexWord (textHere, textEnd, word);
288 textHere = ParseNonindexWord (textHere, textEnd);
289
290 if (!word.empty()) {
291 numWords++;
292
293 if (wordLevelIndex) {
294 numFrags++;
295 numChunkFrags++;
296 }
297
298 // update word stats
299
300 IvfWordInfo &i = ivfWordInfo[word];
301 if (i.wordCount == 0) {
302 // new word
303 // add to list of iterators
304 IvfWordInfoMap::iterator iIt = ivfWordInfo.find (word);
305 ivfWordInfoOccurOrder.push_back (iIt);
306
307 // add level information array
308 if (ivfLevel.levelInfo.size() > 0)
309 i.levels = new LevelWorker [ivfLevel.levelInfo.size()];
310 }
311
312 i.wordCount++;
313 i.chunkWordCount++;
314 if (numFrags > i.lastFragNum) {
315 numChunkEntries++;
316 i.fragCount++;
317 i.chunkFragCount++;
318 i.lastFragNum = numFrags;
319 }
320
321 // update level information for this word
322 if (i.levels != NULL) {
323 IvfLevelInfoMap::iterator levelHere = ivfLevel.levelInfo.begin();
324 IvfLevelInfoMap::iterator levelEnd = ivfLevel.levelInfo.end();
325 LevelWorker *levelWorkerPtr = i.levels;
326 while (levelHere != levelEnd) {
327
328 // check to make sure the level encompases this fragment
329 if (!(*levelHere).second.workInLevel) {
330 cerr << "Level tag <" << (*levelHere).first
331 << "> does not encompass all fragments\n";
332 exit (1);
333 }
334
335 if ((*levelHere).second.numEntries > (*levelWorkerPtr).lastLevelDocNum) {
336 (*levelWorkerPtr).lastLevelDocNum = (*levelHere).second.numEntries;
337 (*levelWorkerPtr).count ++;
338 }
339
340 levelHere++;
341 levelWorkerPtr++;
342 }
343 }
344 }
345 }
346}
347
348
349static unsigned long MemoryRequired (bool wordLevelIndex) {
350 register unsigned long total = 0;
351
352 // add memory required for word entries
353 IvfWordInfoMap::const_iterator wordHere = ivfWordInfo.begin();
354 IvfWordInfoMap::const_iterator wordEnd = ivfWordInfo.end();
355 while (wordHere != wordEnd) {
356 register const IvfWordInfo &info = (*wordHere).second;
357 if (info.chunkFragCount > 0) {
358 total += BIO_Bblock_Bound (numChunkFrags, info.chunkFragCount);
359 if (!wordLevelIndex) {
360 total += info.chunkWordCount;
361 }
362 }
363
364 wordHere++;
365 }
366
367 // add memory required for tag entries
368 IvfTagInfoMap::const_iterator tagHere = ivfTagInfo.begin();
369 IvfTagInfoMap::const_iterator tagEnd = ivfTagInfo.end();
370 while (tagHere != tagEnd) {
371 register const IvfTagInfo &info = (*tagHere).second;
372 if (info.chunkFragCount > 0) {
373 // two d entries for each frag entry
374 unsigned long pTag = info.chunkFragCount*2;
375 total += BIO_Bblock_Bound (numChunkFrags+pTag, pTag);
376 }
377
378 tagHere++;
379 }
380
381 total = (total + 7) >> 3;
382 return total;
383}
384
385
386/*
387static void PrintChunkInfo (unsigned long mem) {
388 cout << "Chunk Number: " << chunksWritten << "\n";
389 cout << "numChunkDocs " << numChunkDocs << "\n";
390 cout << "numChunkFrags " << numChunkFrags << "\n";
391 cout << "mem " << mem << "\n";
392 cout << "numWords " << ivfWordInfo.size() << "\n";
393 cout << "numTags " << ivfTagInfo.size() << "\n\n";
394
395 // output debug tag information in dictionary order
396 IvfTagInfoMap::iterator tagMapHere = ivfTagInfo.begin();
397 IvfTagInfoMap::iterator tagMapEnd = ivfTagInfo.end();
398 unsigned long tagNum = 0;
399 while (tagMapHere != tagMapEnd) {
400 cout << (*tagMapHere).first << " " << tagNum
401 << " " << (*tagMapHere).second.chunkFragCount << "\n";
402 tagNum++;
403 tagMapHere++;
404 }
405}
406*/
407
408static void OutputChunkInfo (unsigned long mem, bool /*wordLevelIndex*/) {
409 chunksWritten++;
410
411 // sanity check
412 if (ivfWordInfo.size() != ivfWordInfoOccurOrder.size()) {
413 Message ("ERROR: Word information size mismatch: %u vs %u\n",
414 (unsigned int)ivfWordInfo.size(),
415 (unsigned int)ivfWordInfoOccurOrder.size());
416 exit (1);
417 }
418 if (ivfTagInfo.size() != ivfTagInfoOccurOrder.size()) {
419 Message ("ERROR: Tag information size mismatch: %u vs %u\n",
420 (unsigned int)ivfTagInfo.size(),
421 (unsigned int)ivfTagInfoOccurOrder.size());
422 exit (1);
423 }
424
425 icb.gamma_encode (numChunkDocs + 1, NULL);
426 icb.gamma_encode (numChunkFrags + 1, NULL);
427 icb.gamma_encode (mem + 1, NULL);
428 icb.gamma_encode (ivfWordInfo.size() + 1, NULL);
429 icb.gamma_encode (ivfTagInfo.size() + 1, NULL);
430
431 /* PrintChunkInfo (mem);*/
432
433 // output word information in occurance order
434 IvfWordInfoItArray::iterator wordHere = ivfWordInfoOccurOrder.begin();
435 IvfWordInfoItArray::iterator wordEnd = ivfWordInfoOccurOrder.end();
436 while (wordHere != wordEnd) {
437 register IvfWordInfo &ivfWordInfo = (*(*wordHere)).second;
438
439 icb.gamma_encode (ivfWordInfo.chunkWordCount + 1, NULL);
440 if (ivfWordInfo.chunkWordCount >= 2) {
441 icb.gamma_encode (ivfWordInfo.chunkFragCount, NULL);
442 }
443
444 ivfWordInfo.lastFragNum = 0;
445 ivfWordInfo.chunkWordCount = 0;
446 ivfWordInfo.chunkFragCount = 0;
447
448 wordHere++;
449 }
450
451 // output tag information in occurance order
452 IvfTagInfoItArray::iterator tagHere = ivfTagInfoOccurOrder.begin();
453 IvfTagInfoItArray::iterator tagEnd = ivfTagInfoOccurOrder.end();
454 while (tagHere != tagEnd) {
455 register IvfTagInfo &ivfTagInfo = (*(*tagHere)).second;
456
457 icb.gamma_encode (ivfTagInfo.chunkFragCount + 1, NULL);
458
459 ivfTagInfo.lastFragNum = 0;
460 ivfTagInfo.chunkFragCount = 0;
461
462 tagHere++;
463 }
464
465 numChunkDocs = 0;
466 numChunkFrags = 0;
467 numChunkEntries = 0;
468}
469
470
471int process_ivf_1 (const TagInfo &tagInfo, const TextElArray &doc) {
472 bool wordLevelIndex = tagInfo.indexLevel.empty();
473 bool inFrag = false;
474 if (wordLevelIndex) inFrag = true; // unconditional
475
476 numDocs++;
477 numChunkDocs++;
478
479 // process each text element in this document
480 TextElArray::const_iterator here = doc.begin();
481 TextElArray::const_iterator end = doc.end();
482 while (here != end) {
483 if ((*here).elType == OpenTagE)
484 ProcessOpenTag (tagInfo, *here, inFrag);
485 else if ((*here).elType == CloseTagE)
486 ProcessCloseTag (tagInfo, *here, inFrag);
487 else if ((*here).elType == TextE)
488 ProcessText (tagInfo, *here, inFrag);
489
490 here++;
491 }
492
493 // check the amount of memory needed for this chunk
494 if (numChunkEntries >= entryCheckPoint) {
495 unsigned long mem = MemoryRequired (wordLevelIndex);
496 if (mem >= invf_buffer_size * CHECK_CLOSE) {
497 if (mem > maxMemNeeded) maxMemNeeded = mem;
498 OutputChunkInfo (mem, wordLevelIndex);
499 entryCheckPoint = (unsigned long)
500 ((invf_buffer_size * INIT_CHECK_FRAC) / CHECK_DIV);
501
502 } else {
503 entryCheckPoint = (unsigned long)
504 (entryCheckPoint * ((CHECK_FRAC * (invf_buffer_size - mem)) / mem) +
505 entryCheckPoint);
506 if (entryCheckPoint <= numChunkEntries)
507 entryCheckPoint = numChunkEntries + 1;
508 }
509 }
510
511 return COMPALLOK;
512}
513
514
515static void CalcInvfDictSize (unsigned long &totalBytes,
516 unsigned long &indexStringBytes) {
517 totalBytes = 0; // The sum of the length of all words, including
518 // the length byte
519 indexStringBytes = 0; // The amount of space required to store the
520 // words in the diction, this takes into account
521 // the prefixes
522 const UCArray *lastWord = NULL;
523
524 // calculate size of word information
525 IvfWordInfoMap::iterator wordHere = ivfWordInfo.begin();
526 IvfWordInfoMap::iterator wordEnd = ivfWordInfo.end();
527 while (wordHere != wordEnd) {
528 unsigned long wordSize = (*wordHere).first.size();
529 totalBytes += wordSize + 1;
530 indexStringBytes += wordSize + 2;
531 if (lastWord != NULL)
532 indexStringBytes -= PrefixLen (*lastWord, (*wordHere).first);
533 lastWord = &((*wordHere).first);
534
535 wordHere++;
536 }
537
538 // calculate size of tag information
539 lastWord = NULL;
540 IvfTagInfoMap::iterator tagHere = ivfTagInfo.begin();
541 IvfTagInfoMap::iterator tagEnd = ivfTagInfo.end();
542 while (tagHere != tagEnd) {
543 unsigned long tagSize = (*tagHere).first.size();
544 totalBytes += tagSize + 1;
545 indexStringBytes += tagSize + 2;
546 if (lastWord != NULL)
547 indexStringBytes -= PrefixLen (*lastWord, (*tagHere).first);
548 lastWord = &((*tagHere).first);
549
550 tagHere++;
551 }
552}
553
554
555// OutputInvfDict ():
556// writes out the stemmed dictionary file
557// in the following format
558// lookback value (int)
559// totalbytes value (int)
560// indexstringbytes (int)
561// for each word
562// wordlen (4 bits)
563// prefix match (4 bits)
564// word (wordlen bytes)
565// word frequency (int)
566// word count (int)
567//
568// Accesses outside variables:
569//
570// Return value...:
571static void OutputInvfDict (char *filename) {
572 // create the dictionary header
573 invf_dict_header idh;
574 idh.word_dict_size = ivfWordInfo.size();
575 idh.tag_dict_size = ivfTagInfo.size();
576 idh.num_docs = numDocs;
577 idh.num_frags = numFrags;
578 idh.num_words = numWords;
579 idh.num_levels = ivfLevel.levelInfo.size();
580 CalcInvfDictSize (idh.total_bytes, idh.index_string_bytes);
581
582 // create the inverted dictionary file
583 FILE *sp;
584 if (!(sp = create_file (filename, INVF_DICT_SUFFIX, "wb", MAGIC_STEM_BUILD,
585 MG_MESSAGE)))
586 return;
587
588 // write out the dictionary header
589 if (!idh.Write (sp)) { fclose (sp); return; }
590
591 // remember where the word dictionary starts
592 idh.word_dict_start = ftell (sp);
593
594 // output the word dictionary
595 const UCArray *lastWord = NULL;
596 IvfWordInfoMap::iterator wordHere = ivfWordInfo.begin();
597 IvfWordInfoMap::iterator wordEnd = ivfWordInfo.end();
598 while (wordHere != wordEnd) {
599 // get the prefix and suffix lengths
600 const UCArray &thisWord = (*wordHere).first;
601 WritePreSufStr (sp, lastWord, thisWord);
602
603 // output the number of fragments the word appeared in and the
604 // number of times the word appeared
605 WriteUL (sp, (*wordHere).second.fragCount);
606 WriteUL (sp, (*wordHere).second.wordCount);
607
608 // output frequency information for each level
609 // note that we are expecting every word to have
610 // level information
611 LevelWorker *lwHere = (*wordHere).second.levels;
612 LevelWorker *lwEnd = lwHere + idh.num_levels;
613 while (lwHere != lwEnd) {
614 WriteUL (sp, (*lwHere).count);
615 lwHere++;
616 }
617
618 lastWord = &thisWord;
619 wordHere++;
620 }
621
622 // remember where the tag dictionary starts
623 idh.tag_dict_start = ftell (sp);
624
625 // output the tag dictionary
626 const UCArray *lastTag = NULL;
627 IvfTagInfoMap::iterator tagHere = ivfTagInfo.begin();
628 IvfTagInfoMap::iterator tagEnd = ivfTagInfo.end();
629 while (tagHere != tagEnd) {
630 // get the prefix and suffix lengths
631 const UCArray &thisTag = (*tagHere).first;
632 WritePreSufStr (sp, lastTag, thisTag);
633
634 // output the number of fragments the tag appeared in and the
635 // number of times the tag appeared
636 WriteUL (sp, (*tagHere).second.fragCount);
637 WriteUL (sp, (*tagHere).second.tagCount);
638
639 lastTag = &thisTag;
640 tagHere++;
641 }
642
643 // write out the updated header
644 fseek (sp, sizeof (u_long), SEEK_SET);
645 if (!idh.Write (sp)) { fclose (sp); return; }
646
647 fclose (sp);
648}
649
650static void OutputLevelFile (char *filename) {
651 // create the level file
652 FILE *f;
653 if (!(f = create_file (filename, INVF_LEVEL_SUFFIX, "wb", MAGIC_INVF_LEVELS,
654 MG_MESSAGE)))
655 return;
656
657 // write out the information
658 ivfLevel.Write (f);
659
660 // close the file
661 fclose (f);
662}
663
664static void OutputTransFile (char *filename) {
665 // save the word number (in lastFragNum :-/)
666 int i = 0;
667 IvfWordInfoMap::iterator wordHere = ivfWordInfo.begin();
668 IvfWordInfoMap::iterator wordEnd = ivfWordInfo.end();
669 while (wordHere != wordEnd) {
670 (*wordHere).second.lastFragNum = i;
671 i++; wordHere++;
672 }
673
674 // save the tag number (in lastFragNum :-/)
675 i = 0;
676 IvfTagInfoMap::iterator tagHere = ivfTagInfo.begin();
677 IvfTagInfoMap::iterator tagEnd = ivfTagInfo.end();
678 while (tagHere != tagEnd) {
679 (*tagHere).second.lastFragNum = i;
680 i++; tagHere++;
681 }
682
683 // create the translation file
684 FILE *f;
685 if (!(f = create_file (filename, INVF_CHUNK_TRANS_SUFFIX, "wb",
686 MAGIC_CHUNK_TRANS, MG_MESSAGE)))
687 return;
688
689 stdio_bitio_buffer buffer(f);
690 buffer.encodeStart();
691
692 // write out the word translation table
693 unsigned long wordDictSize = ivfWordInfoOccurOrder.size();
694 IvfWordInfoItArray::iterator wordItHere = ivfWordInfoOccurOrder.begin();
695 IvfWordInfoItArray::iterator wordItEnd = ivfWordInfoOccurOrder.end();
696 unsigned long oN = 0;
697 while (wordItHere != wordItEnd) {
698 register IvfWordInfo &ivfWordInfo = (*(*wordItHere)).second;
699 buffer.binary_encode (ivfWordInfo.lastFragNum + 1, wordDictSize + 1, NULL);
700 oN++;
701 wordItHere++;
702 }
703
704 // write out the tag translation table
705 unsigned long tagDictSize = ivfTagInfoOccurOrder.size();
706 IvfTagInfoItArray::iterator tagItHere = ivfTagInfoOccurOrder.begin();
707 IvfTagInfoItArray::iterator tagItEnd = ivfTagInfoOccurOrder.end();
708 while (tagItHere != tagItEnd) {
709 register IvfTagInfo &ivfTagInfo = (*(*tagItHere)).second;
710 buffer.binary_encode (ivfTagInfo.lastFragNum + 1, tagDictSize + 1, NULL);
711 oN++;
712 tagItHere++;
713 }
714
715 // finish encoding and close the file
716 buffer.encodeDone();
717 fclose (f);
718}
719
720#ifndef SILENT
721static void PrintStats () {
722 Message ("Inverted buffer size: %8u bytes\n", invf_buffer_size);
723 Message ("Max memory needed for 1 chunk: %8u bytes\n", maxMemNeeded);
724
725 Message ("Number of chunks written: %8u\n", chunksWritten);
726 Message ("Number of documents: %8u\n", numDocs);
727 Message ("Number of fragments: %8u\n", numFrags);
728 Message ("Number of words: %8u\n", numWords);
729
730 Message ("Size of word dictionary: %8u\n", ivfWordInfo.size());
731 Message ("Size of tag dictionary: %8u\n", ivfTagInfo.size());
732}
733#endif
734
735int done_ivf_1 (const TagInfo &tagInfo, char * filename) {
736 bool wordLevelIndex = tagInfo.indexLevel.empty();
737
738 char *temp_str = msg_prefix;
739 msg_prefix = "ivf.pass1";
740
741 // output the last chunk
742 if (numChunkDocs > 0) {
743 unsigned long mem = MemoryRequired (wordLevelIndex);
744 OutputChunkInfo (mem, wordLevelIndex);
745 if (mem > maxMemNeeded) maxMemNeeded = mem;
746 }
747
748 // write out and encoded 1 to say there are no more chunks
749 icb.gamma_encode (1, NULL);
750 icb.encodeDone ();
751
752 // write out the maximum memory required and close the file
753 fseek (ic, sizeof (long), 0);
754 WriteUL (ic, maxMemNeeded);
755 fclose (ic);
756
757 // output the inverted dictionary
758 OutputInvfDict (filename);
759
760 // write out the translation file
761 OutputTransFile (filename);
762
763 // output the level information
764 OutputLevelFile (filename);
765
766 // output statistics
767#ifndef SILENT
768 PrintStats ();
769#endif
770
771 msg_prefix = temp_str;
772
773 return COMPALLOK;
774}
775
Note: See TracBrowser for help on using the repository browser.