source: trunk/gsdl/src/mgpp/text/ivf.pass1.cpp@ 879

Last change on this file since 879 was 856, checked in by sjboddie, 24 years ago

Rodgers new C++ mg

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 22.0 KB
Line 
1/**************************************************************************
2 *
3 * ivf.pass1.cpp -- Memory efficient pass 1 inversion
4 * Copyright (C) 1999 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: ivf.pass1.cpp 856 2000-01-14 02:26:25Z sjboddie $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25
26#include "mg_files.h"
27#include "invf.h"
28#include "mg.h"
29#include "build.h"
30#include "locallib.h"
31#include "UCArray.h"
32#include "bitio_m_stdio.h"
33#include "bitio_gen.h"
34#include <stdio.h>
35#include "words.h"
36#include "messages.h"
37#include "netorder.h"
38#include "FIvfLevelInfo.h"
39
40#include "longlong.h"
41
42#if defined(GSDL_USE_OBJECTSPACE)
43# include <ospace\std\map>
44#elif defined(GSDL_USE_STL_H)
45# include <map.h>
46#else
47# include <map>
48#endif
49
50
51/*
52 $Log$
53 Revision 1.1 2000/01/14 02:26:07 sjboddie
54 Rodgers new C++ mg
55
56 *
57 */
58
59
60
61// a fragment corresponds to the index level (word-level is the
62// minimum index level)
63
64
// Per-word working state for one level: used to count the number of
// distinct level entries in which the word occurs.
struct LevelWorker {
  unsigned long lastLevelDocNum; // entry number of the level when the word
                                 // was last counted
  unsigned long count;           // number of level entries counted so far

  // both fields start at zero
  LevelWorker () : lastLevelDocNum (0), count (0) {}
};
74
75
76// note: the word is stored in the map
77struct IvfWordInfo {
78 unsigned long wordCount; // word frequency
79 unsigned long fragCount; // number of fragments that contain the word
80 unsigned long lastFragNum; // last fragment to contain the word
81 unsigned long chunkWordCount; // word frequency within this chunk
82 unsigned long chunkFragCount; // number of fragments within this chunk that
83 // contain the word
84
85 LevelWorker *levels; // level info for this word
86
87 IvfWordInfo ();
88 ~IvfWordInfo ();
89 void Clear (); // will delete levels
90};
91
92typedef map<UCArray, IvfWordInfo, DictLTUCArray> IvfWordInfoMap;
93typedef vector<IvfWordInfoMap::iterator> IvfWordInfoItArray;
94
95// tags don't require as much information
96struct IvfTagInfo {
97 unsigned long tagCount; // tag frequency
98 unsigned long fragCount; // number of fragments that contain the tag
99 unsigned long lastFragNum; // last fragment to contain the tag
100 unsigned long chunkFragCount; // number of fragments within this chunk that
101 // contain the tag
102
103 IvfTagInfo ();
104 void Clear ();
105};
106
107typedef map<UCArray, IvfTagInfo, DictLTUCArray> IvfTagInfoMap;
108typedef vector<IvfTagInfoMap::iterator> IvfTagInfoItArray;
109
110
// Tuning constants for deciding when the current chunk is full.
//
// The first memory check of a chunk happens after
// (invf_buffer_size * INIT_CHECK_FRAC) / CHECK_DIV entries.
#define INIT_CHECK_FRAC 0.10
// scales how far the next check point is moved when the chunk is not
// yet full
#define CHECK_FRAC      0.75
// a chunk is written once its memory estimate reaches
// invf_buffer_size * CHECK_CLOSE
#define CHECK_CLOSE     0.999
// divisor applied to the initial check point (see INIT_CHECK_FRAC)
#define CHECK_DIV       1.5
115
116
117static FILE *ic; // the invf chunk file
118static stdio_bitio_buffer icb;
119
120IvfWordInfoMap ivfWordInfo;
121IvfWordInfoItArray ivfWordInfoOccurOrder;
122IvfTagInfoMap ivfTagInfo;
123IvfTagInfoItArray ivfTagInfoOccurOrder;
124
125static unsigned long chunksWritten;
126static unsigned long maxMemNeeded;
127
128static unsigned long numDocs;
129static unsigned long numChunkDocs;
130
131static unsigned long numFrags;
132static unsigned long numChunkFrags;
133
134static unsigned long numWords;
135
136// the number of document numbers in the inverted file
137static unsigned long numChunkEntries;
138
139// next entry in the inverted file to check memory
140// requirements for the current chunk
141static unsigned long entryCheckPoint;
142
143// information about all the different levels
144static FIvfLevel ivfLevel;
145
146
147
148IvfWordInfo::IvfWordInfo () {
149 levels = NULL;
150 Clear();
151}
152
153IvfWordInfo::~IvfWordInfo () {
154 if (levels != NULL) delete [] levels;
155}
156
157void IvfWordInfo::Clear () {
158 wordCount = 0;
159 chunkWordCount = 0;
160 lastFragNum = 0;
161 fragCount = 0;
162 chunkFragCount = 0;
163
164 if (levels != NULL) {
165 delete [] levels;
166 levels = NULL;
167 }
168}
169
170
171IvfTagInfo::IvfTagInfo () {
172 Clear();
173}
174
175void IvfTagInfo::Clear () {
176 tagCount = 0;
177 lastFragNum = 0;
178 fragCount = 0;
179 chunkFragCount = 0;
180}
181
182
183
184int init_ivf_1 (const TagInfo &tagInfo, char *file_name) {
185 // set up the chunk file
186 if (!(ic = create_file (file_name, INVF_CHUNK_SUFFIX, "wb",
187 MAGIC_CHUNK, MG_MESSAGE)))
188 return COMPERROR;
189 fwrite (" ", sizeof (u_long), 1, ic); // Space for the maxmem
190 icb.attachFile (ic);
191 icb.encodeStart();
192
193 // reset global variables
194 ivfWordInfo.erase (ivfWordInfo.begin(), ivfWordInfo.end());
195 ivfWordInfoOccurOrder.erase (ivfWordInfoOccurOrder.begin(), ivfWordInfoOccurOrder.end());
196 ivfTagInfo.erase (ivfTagInfo.begin(), ivfTagInfo.end());
197 ivfTagInfoOccurOrder.erase (ivfTagInfoOccurOrder.begin(), ivfTagInfoOccurOrder.end());
198
199 chunksWritten = 0;
200 maxMemNeeded = 0;
201
202 numDocs = 0;
203 numChunkDocs = 0;
204
205 numFrags = 0;
206 numChunkFrags = 0;
207
208 numWords = 0;
209
210 numChunkEntries = 0;
211 entryCheckPoint = (unsigned long) ((invf_buffer_size * INIT_CHECK_FRAC) / CHECK_DIV);
212
213 // init the level information
214 ivfLevel.Clear();
215 ivfLevel.docTag = tagInfo.docTag;
216 ivfLevel.indexLevel = tagInfo.indexLevel;
217 IvfLevelInfo blankLevel;
218 UCArraySet::const_iterator levelHere = tagInfo.levelTags.begin();
219 UCArraySet::const_iterator levelEnd = tagInfo.levelTags.end();
220 while (levelHere != levelEnd) {
221 blankLevel.levelTag = *levelHere;
222 ivfLevel.levelInfo[*levelHere] = blankLevel;
223 levelHere++;
224 }
225
226 return COMPALLOK;
227}
228
229static void ProcessOpenTag (const TagInfo &tagInfo, const TextEl &el,
230 bool &inFrag) {
231 bool wordLevelIndex = tagInfo.indexLevel.empty();
232
233 // check for start of next fragment
234 if (!wordLevelIndex && el.tagName == tagInfo.indexLevel) {
235 numFrags++;
236 numChunkFrags++;
237 inFrag = true;
238 }
239
240 // update tag stats
241 IvfTagInfo &i = ivfTagInfo[el.tagName];
242 if (i.tagCount == 0) {
243 // new tag, add to list of iterators
244 IvfTagInfoMap::iterator iIt = ivfTagInfo.find (el.tagName);
245 ivfTagInfoOccurOrder.push_back (iIt);
246 }
247 i.tagCount++;
248
249 // all open tags count as new tags
250 numChunkEntries++;
251 i.fragCount++;
252 i.chunkFragCount++;
253 i.lastFragNum = numFrags;
254
255 // update level information
256 IvfLevelInfoMap::iterator levelIt = ivfLevel.levelInfo.find (el.tagName);
257 if (levelIt != ivfLevel.levelInfo.end()) {
258 // is a level tag
259 (*levelIt).second.numEntries++;
260 (*levelIt).second.workInLevel = true;
261 }
262}
263
264static void ProcessCloseTag (const TagInfo &tagInfo, const TextEl &el,
265 bool &inFrag) {
266 bool wordLevelIndex = tagInfo.indexLevel.empty();
267
268 // check for end of fragment
269 if (!wordLevelIndex && el.tagName == tagInfo.indexLevel) {
270 inFrag = false;
271 }
272
273 // update level information
274 IvfLevelInfoMap::iterator levelIt = ivfLevel.levelInfo.find (el.tagName);
275 if (levelIt != ivfLevel.levelInfo.end()) {
276 // is a level tag
277 (*levelIt).second.workInLevel = false;
278 }
279}
280
281static void ProcessText (const TagInfo &tagInfo, const TextEl &el,
282 bool &inFrag) {
283 bool wordLevelIndex = tagInfo.indexLevel.empty();
284
285 // make sure this text is to be indexed
286 if (!wordLevelIndex && !inFrag) return;
287
288 const unsigned char *textHere = el.text.begin();
289 const unsigned char *textEnd = el.text.end() - 1;
290 UCArray word;
291
292 if (!inaword (textHere, textEnd))
293 ParseNonindexWord (textHere, textEnd);
294
295
296 // Alternately parse off words and non-words from the input
297 // Each token is then inserted into the set if it does
298 // not exist or has it's frequency count incremented if it does.
299
300 while (textHere <= textEnd) {
301 textHere = ParseIndexWord (textHere, textEnd, word);
302 textHere = ParseNonindexWord (textHere, textEnd);
303
304 if (!word.empty()) {
305 numWords++;
306
307 if (wordLevelIndex) {
308 numFrags++;
309 numChunkFrags++;
310 }
311
312 // update word stats
313
314 IvfWordInfo &i = ivfWordInfo[word];
315 if (i.wordCount == 0) {
316 // new word
317 // add to list of iterators
318 IvfWordInfoMap::iterator iIt = ivfWordInfo.find (word);
319 ivfWordInfoOccurOrder.push_back (iIt);
320
321 // add level information array
322 if (ivfLevel.levelInfo.size() > 0)
323 i.levels = new LevelWorker [ivfLevel.levelInfo.size()];
324 }
325
326 i.wordCount++;
327 i.chunkWordCount++;
328 if (numFrags > i.lastFragNum) {
329 numChunkEntries++;
330 i.fragCount++;
331 i.chunkFragCount++;
332 i.lastFragNum = numFrags;
333 }
334
335 // update level information for this word
336 if (i.levels != NULL) {
337 IvfLevelInfoMap::iterator levelHere = ivfLevel.levelInfo.begin();
338 IvfLevelInfoMap::iterator levelEnd = ivfLevel.levelInfo.end();
339 LevelWorker *levelWorkerPtr = i.levels;
340 while (levelHere != levelEnd) {
341
342 // check to make sure the level encompases this fragment
343 if (!(*levelHere).second.workInLevel) {
344 cerr << "Level tag <" << (*levelHere).first
345 << "> does not encompass all fragments\n";
346 exit (1);
347 }
348
349 if ((*levelHere).second.numEntries > (*levelWorkerPtr).lastLevelDocNum) {
350 (*levelWorkerPtr).lastLevelDocNum = (*levelHere).second.numEntries;
351 (*levelWorkerPtr).count ++;
352 }
353
354 levelHere++;
355 levelWorkerPtr++;
356 }
357 }
358 }
359 }
360}
361
362
363static unsigned long MemoryRequired (bool wordLevelIndex) {
364 register unsigned long total = 0;
365
366 // add memory required for word entries
367 IvfWordInfoMap::const_iterator wordHere = ivfWordInfo.begin();
368 IvfWordInfoMap::const_iterator wordEnd = ivfWordInfo.end();
369 while (wordHere != wordEnd) {
370 register const IvfWordInfo &info = (*wordHere).second;
371 if (info.chunkFragCount > 0) {
372 total += BIO_Bblock_Bound (numChunkFrags, info.chunkFragCount);
373 if (!wordLevelIndex) {
374 total += info.chunkWordCount;
375 }
376 }
377
378 wordHere++;
379 }
380
381 // add memory required for tag entries
382 IvfTagInfoMap::const_iterator tagHere = ivfTagInfo.begin();
383 IvfTagInfoMap::const_iterator tagEnd = ivfTagInfo.end();
384 while (tagHere != tagEnd) {
385 register const IvfTagInfo &info = (*tagHere).second;
386 if (info.chunkFragCount > 0) {
387 // two d entries for each frag entry
388 unsigned long pTag = info.chunkFragCount*2;
389 total += BIO_Bblock_Bound (numChunkFrags+pTag, pTag);
390 }
391
392 tagHere++;
393 }
394
395 total = (total + 7) >> 3;
396 return total;
397}
398
399
400/*
401static void PrintChunkInfo (unsigned long mem) {
402 cout << "Chunk Number: " << chunksWritten << "\n";
403 cout << "numChunkDocs " << numChunkDocs << "\n";
404 cout << "numChunkFrags " << numChunkFrags << "\n";
405 cout << "mem " << mem << "\n";
406 cout << "numWords " << ivfWordInfo.size() << "\n";
407 cout << "numTags " << ivfTagInfo.size() << "\n\n";
408
409 // output debug tag information in dictionary order
410 IvfTagInfoMap::iterator tagMapHere = ivfTagInfo.begin();
411 IvfTagInfoMap::iterator tagMapEnd = ivfTagInfo.end();
412 unsigned long tagNum = 0;
413 while (tagMapHere != tagMapEnd) {
414 cout << (*tagMapHere).first << " " << tagNum
415 << " " << (*tagMapHere).second.chunkFragCount << "\n";
416 tagNum++;
417 tagMapHere++;
418 }
419}
420*/
421
422static void OutputChunkInfo (unsigned long mem, bool /*wordLevelIndex*/) {
423 chunksWritten++;
424
425 // sanity check
426 if (ivfWordInfo.size() != ivfWordInfoOccurOrder.size()) {
427 Message ("ERROR: Word information size mismatch: %u vs %u\n",
428 (unsigned int)ivfWordInfo.size(),
429 (unsigned int)ivfWordInfoOccurOrder.size());
430 exit (1);
431 }
432 if (ivfTagInfo.size() != ivfTagInfoOccurOrder.size()) {
433 Message ("ERROR: Tag information size mismatch: %u vs %u\n",
434 (unsigned int)ivfTagInfo.size(),
435 (unsigned int)ivfTagInfoOccurOrder.size());
436 exit (1);
437 }
438
439 icb.gamma_encode (numChunkDocs + 1, NULL);
440 icb.gamma_encode (numChunkFrags + 1, NULL);
441 icb.gamma_encode (mem + 1, NULL);
442 icb.gamma_encode (ivfWordInfo.size() + 1, NULL);
443 icb.gamma_encode (ivfTagInfo.size() + 1, NULL);
444
445 /* PrintChunkInfo (mem);*/
446
447 // output word information in occurance order
448 IvfWordInfoItArray::iterator wordHere = ivfWordInfoOccurOrder.begin();
449 IvfWordInfoItArray::iterator wordEnd = ivfWordInfoOccurOrder.end();
450 while (wordHere != wordEnd) {
451 register IvfWordInfo &ivfWordInfo = (*(*wordHere)).second;
452
453 icb.gamma_encode (ivfWordInfo.chunkWordCount + 1, NULL);
454 if (ivfWordInfo.chunkWordCount >= 2) {
455 icb.gamma_encode (ivfWordInfo.chunkFragCount, NULL);
456 }
457
458 ivfWordInfo.lastFragNum = 0;
459 ivfWordInfo.chunkWordCount = 0;
460 ivfWordInfo.chunkFragCount = 0;
461
462 wordHere++;
463 }
464
465 // output tag information in occurance order
466 IvfTagInfoItArray::iterator tagHere = ivfTagInfoOccurOrder.begin();
467 IvfTagInfoItArray::iterator tagEnd = ivfTagInfoOccurOrder.end();
468 while (tagHere != tagEnd) {
469 register IvfTagInfo &ivfTagInfo = (*(*tagHere)).second;
470
471 icb.gamma_encode (ivfTagInfo.chunkFragCount + 1, NULL);
472
473 ivfTagInfo.lastFragNum = 0;
474 ivfTagInfo.chunkFragCount = 0;
475
476 tagHere++;
477 }
478
479 numChunkDocs = 0;
480 numChunkFrags = 0;
481 numChunkEntries = 0;
482}
483
484
485int process_ivf_1 (const TagInfo &tagInfo, const TextElArray &doc) {
486 bool wordLevelIndex = tagInfo.indexLevel.empty();
487 bool inFrag = false;
488 if (wordLevelIndex) inFrag = true; // unconditional
489
490 numDocs++;
491 numChunkDocs++;
492
493 // process each text element in this document
494 TextElArray::const_iterator here = doc.begin();
495 TextElArray::const_iterator end = doc.end();
496 while (here != end) {
497 if ((*here).elType == OpenTagE)
498 ProcessOpenTag (tagInfo, *here, inFrag);
499 else if ((*here).elType == CloseTagE)
500 ProcessCloseTag (tagInfo, *here, inFrag);
501 else if ((*here).elType == TextE)
502 ProcessText (tagInfo, *here, inFrag);
503
504 here++;
505 }
506
507 // check the amount of memory needed for this chunk
508 if (numChunkEntries >= entryCheckPoint) {
509 unsigned long mem = MemoryRequired (wordLevelIndex);
510 if (mem >= invf_buffer_size * CHECK_CLOSE) {
511 if (mem > maxMemNeeded) maxMemNeeded = mem;
512 OutputChunkInfo (mem, wordLevelIndex);
513 entryCheckPoint = (unsigned long)
514 ((invf_buffer_size * INIT_CHECK_FRAC) / CHECK_DIV);
515
516 } else {
517 entryCheckPoint = (unsigned long)
518 (entryCheckPoint * ((CHECK_FRAC * (invf_buffer_size - mem)) / mem) +
519 entryCheckPoint);
520 if (entryCheckPoint <= numChunkEntries)
521 entryCheckPoint = numChunkEntries + 1;
522 }
523 }
524
525 return COMPALLOK;
526}
527
528
529static void CalcInvfDictSize (unsigned long &totalBytes,
530 unsigned long &indexStringBytes) {
531 totalBytes = 0; // The sum of the length of all words, including
532 // the length byte
533 indexStringBytes = 0; // The amount of space required to store the
534 // words in the diction, this takes into account
535 // the prefixes
536 const UCArray *lastWord = NULL;
537
538 // calculate size of word information
539 IvfWordInfoMap::iterator wordHere = ivfWordInfo.begin();
540 IvfWordInfoMap::iterator wordEnd = ivfWordInfo.end();
541 while (wordHere != wordEnd) {
542 unsigned long wordSize = (*wordHere).first.size();
543 totalBytes += wordSize + 1;
544 indexStringBytes += wordSize + 2;
545 if (lastWord != NULL)
546 indexStringBytes -= PrefixLen (*lastWord, (*wordHere).first);
547 lastWord = &((*wordHere).first);
548
549 wordHere++;
550 }
551
552 // calculate size of tag information
553 lastWord = NULL;
554 IvfTagInfoMap::iterator tagHere = ivfTagInfo.begin();
555 IvfTagInfoMap::iterator tagEnd = ivfTagInfo.end();
556 while (tagHere != tagEnd) {
557 unsigned long tagSize = (*tagHere).first.size();
558 totalBytes += tagSize + 1;
559 indexStringBytes += tagSize + 2;
560 if (lastWord != NULL)
561 indexStringBytes -= PrefixLen (*lastWord, (*tagHere).first);
562 lastWord = &((*tagHere).first);
563
564 tagHere++;
565 }
566}
567
568
569// OutputInvfDict ():
570// writes out the stemmed dictionary file
571// in the following format
572// lookback value (int)
573// totalbytes value (int)
574// indexstringbytes (int)
575// for each word
576// wordlen (4 bits)
577// prefix match (4 bits)
578// word (wordlen bytes)
579// word frequency (int)
580// word count (int)
581//
582// Accesses outside variables:
583//
584// Return value...:
585static void OutputInvfDict (char *filename) {
586 // create the dictionary header
587 invf_dict_header idh;
588 idh.word_dict_size = ivfWordInfo.size();
589 idh.tag_dict_size = ivfTagInfo.size();
590 idh.num_docs = numDocs;
591 idh.num_frags = numFrags;
592 idh.num_words = numWords;
593 idh.num_levels = ivfLevel.levelInfo.size();
594 CalcInvfDictSize (idh.total_bytes, idh.index_string_bytes);
595
596 // create the inverted dictionary file
597 FILE *sp;
598 if (!(sp = create_file (filename, INVF_DICT_SUFFIX, "wb", MAGIC_STEM_BUILD,
599 MG_MESSAGE)))
600 return;
601
602 // write out the dictionary header
603 if (!idh.Write (sp)) { fclose (sp); return; }
604
605 // remember where the word dictionary starts
606 idh.word_dict_start = ftell (sp);
607
608 // output the word dictionary
609 const UCArray *lastWord = NULL;
610 IvfWordInfoMap::iterator wordHere = ivfWordInfo.begin();
611 IvfWordInfoMap::iterator wordEnd = ivfWordInfo.end();
612 while (wordHere != wordEnd) {
613 // get the prefix and suffix lengths
614 const UCArray &thisWord = (*wordHere).first;
615 WritePreSufStr (sp, lastWord, thisWord);
616
617 // output the number of fragments the word appeared in and the
618 // number of times the word appeared
619 WriteUL (sp, (*wordHere).second.fragCount);
620 WriteUL (sp, (*wordHere).second.wordCount);
621
622 // output frequency information for each level
623 // note that we are expecting every word to have
624 // level information
625 LevelWorker *lwHere = (*wordHere).second.levels;
626 LevelWorker *lwEnd = lwHere + idh.num_levels;
627 while (lwHere != lwEnd) {
628 WriteUL (sp, (*lwHere).count);
629 lwHere++;
630 }
631
632 lastWord = &thisWord;
633 wordHere++;
634 }
635
636 // remember where the tag dictionary starts
637 idh.tag_dict_start = ftell (sp);
638
639 // output the tag dictionary
640 const UCArray *lastTag = NULL;
641 IvfTagInfoMap::iterator tagHere = ivfTagInfo.begin();
642 IvfTagInfoMap::iterator tagEnd = ivfTagInfo.end();
643 while (tagHere != tagEnd) {
644 // get the prefix and suffix lengths
645 const UCArray &thisTag = (*tagHere).first;
646 WritePreSufStr (sp, lastTag, thisTag);
647
648 // output the number of fragments the tag appeared in and the
649 // number of times the tag appeared
650 WriteUL (sp, (*tagHere).second.fragCount);
651 WriteUL (sp, (*tagHere).second.tagCount);
652
653 lastTag = &thisTag;
654 tagHere++;
655 }
656
657 // write out the updated header
658 fseek (sp, sizeof (u_long), SEEK_SET);
659 if (!idh.Write (sp)) { fclose (sp); return; }
660
661 fclose (sp);
662}
663
664static void OutputLevelFile (char *filename) {
665 // create the level file
666 FILE *f;
667 if (!(f = create_file (filename, INVF_LEVEL_SUFFIX, "wb", MAGIC_INVF_LEVELS,
668 MG_MESSAGE)))
669 return;
670
671 // write out the information
672 ivfLevel.Write (f);
673
674 // close the file
675 fclose (f);
676}
677
678static void OutputTransFile (char *filename) {
679 // save the word number (in lastFragNum :-/)
680 int i = 0;
681 IvfWordInfoMap::iterator wordHere = ivfWordInfo.begin();
682 IvfWordInfoMap::iterator wordEnd = ivfWordInfo.end();
683 while (wordHere != wordEnd) {
684 (*wordHere).second.lastFragNum = i;
685 i++; wordHere++;
686 }
687
688 // save the tag number (in lastFragNum :-/)
689 i = 0;
690 IvfTagInfoMap::iterator tagHere = ivfTagInfo.begin();
691 IvfTagInfoMap::iterator tagEnd = ivfTagInfo.end();
692 while (tagHere != tagEnd) {
693 (*tagHere).second.lastFragNum = i;
694 i++; tagHere++;
695 }
696
697 // create the translation file
698 FILE *f;
699 if (!(f = create_file (filename, INVF_CHUNK_TRANS_SUFFIX, "wb",
700 MAGIC_CHUNK_TRANS, MG_MESSAGE)))
701 return;
702
703 stdio_bitio_buffer buffer(f);
704 buffer.encodeStart();
705
706 // write out the word translation table
707 unsigned long wordDictSize = ivfWordInfoOccurOrder.size();
708 IvfWordInfoItArray::iterator wordItHere = ivfWordInfoOccurOrder.begin();
709 IvfWordInfoItArray::iterator wordItEnd = ivfWordInfoOccurOrder.end();
710 unsigned long oN = 0;
711 while (wordItHere != wordItEnd) {
712 register IvfWordInfo &ivfWordInfo = (*(*wordItHere)).second;
713 buffer.binary_encode (ivfWordInfo.lastFragNum + 1, wordDictSize + 1, NULL);
714 oN++;
715 wordItHere++;
716 }
717
718 // write out the tag translation table
719 unsigned long tagDictSize = ivfTagInfoOccurOrder.size();
720 IvfTagInfoItArray::iterator tagItHere = ivfTagInfoOccurOrder.begin();
721 IvfTagInfoItArray::iterator tagItEnd = ivfTagInfoOccurOrder.end();
722 while (tagItHere != tagItEnd) {
723 register IvfTagInfo &ivfTagInfo = (*(*tagItHere)).second;
724 buffer.binary_encode (ivfTagInfo.lastFragNum + 1, tagDictSize + 1, NULL);
725 oN++;
726 tagItHere++;
727 }
728
729 // finish encoding and close the file
730 buffer.encodeDone();
731 fclose (f);
732}
733
734#ifndef SILENT
735static void PrintStats () {
736 Message ("Inverted buffer size: %8u bytes\n", invf_buffer_size);
737 Message ("Max memory needed for 1 chunk: %8u bytes\n", maxMemNeeded);
738
739 Message ("Number of chunks written: %8u\n", chunksWritten);
740 Message ("Number of documents: %8u\n", numDocs);
741 Message ("Number of fragments: %8u\n", numFrags);
742 Message ("Number of words: %8u\n", numWords);
743
744 Message ("Size of word dictionary: %8u\n", ivfWordInfo.size());
745 Message ("Size of tag dictionary: %8u\n", ivfTagInfo.size());
746}
747#endif
748
749int done_ivf_1 (const TagInfo &tagInfo, char * filename) {
750 bool wordLevelIndex = tagInfo.indexLevel.empty();
751
752 char *temp_str = msg_prefix;
753 msg_prefix = "ivf.pass1";
754
755 // output the last chunk
756 if (numChunkDocs > 0) {
757 unsigned long mem = MemoryRequired (wordLevelIndex);
758 OutputChunkInfo (mem, wordLevelIndex);
759 if (mem > maxMemNeeded) maxMemNeeded = mem;
760 }
761
762 // write out and encoded 1 to say there are no more chunks
763 icb.gamma_encode (1, NULL);
764 icb.encodeDone ();
765
766 // write out the maximum memory required and close the file
767 fseek (ic, sizeof (long), 0);
768 WriteUL (ic, maxMemNeeded);
769 fclose (ic);
770
771 // output the inverted dictionary
772 OutputInvfDict (filename);
773
774 // write out the translation file
775 OutputTransFile (filename);
776
777 // output the level information
778 OutputLevelFile (filename);
779
780 // output statistics
781#ifndef SILENT
782 PrintStats ();
783#endif
784
785 msg_prefix = temp_str;
786
787 return COMPALLOK;
788}
789
Note: See TracBrowser for help on using the repository browser.