/************************************************************************** * * ivf.pass2.cpp -- Memory efficient pass 2 inversion * Copyright (C) 1999 Rodger McNab * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * **************************************************************************/ #define _XOPEN_SOURCE 1 // This was added for Solaris, but it makes things worse on Solaris for me... // #define _XOPEN_SOURCE_EXTENDED 1 #include #if defined __WIN32__ # include # include # define getpid _getpid # define unlink _unlink #else # include # include "non_ansi.h" #endif #include "UCArray.h" #include "sysfuncs.h" #include "mg_files.h" #include "invf.h" #include "mg.h" #include "build.h" #include "locallib.h" #include "bitio_m_random.h" #include "bitio_m_stdio.h" #include "bitio_m_mems.h" #include "bitio_gen.h" #include #include "words.h" #include "messages.h" #include "netorder.h" #include "FIvfLevelInfo.h" #include "perf_hash.h" #include "string.h" #include "longlong.h" #if defined(GSDL_USE_OBJECTSPACE) # include #elif defined(GSDL_USE_STL_H) # include #else # include #endif #ifdef USE_LONG_LONG #define SEEK_X seek_LL #define TELL_X tell_LL #else #define SEEK_X seek #define TELL_X tell #endif #ifndef RND_BUF_SIZE #define RND_BUF_SIZE 8*1024 #endif static mg_u_long numDocs = 0; static mg_u_long numChunkDocs = 0; static mg_u_long numDocsInChunk = 0; static mg_u_long numFrags = 0; static mg_u_long numFragsInChunk = 0; static mg_u_long chunkStartFragNum = 0; struct BitPtr { mg_u_long start; mg_u_long here; mg_u_long lastFragNum; mg_u_long lgB; void Clear () { start = here = lastFragNum = lgB = 0; } BitPtr () { Clear(); } }; class WordBitPtrs { protected: mg_u_long numWords; mg_u_long numTags; mg_u_long size; BitPtr *wordBitPtrs; void CheckBufOverrun (mg_u_long num) { if (wordBitPtrs[num].here > wordBitPtrs[num+1].start) { cerr << "numDocs: " << numDocs << "\n"; cerr << "numChunkDocs: " << numChunkDocs << "\n"; cerr << "numDocsInChunk: " << numDocsInChunk << "\n"; cerr << "numFrags: " << numFrags << "\n"; cerr << "numFragsInChunk: " << numFragsInChunk << "\n"; cerr << "chunkStartFragNum: " << chunkStartFragNum << "\n"; cerr << "num: " << num << "\n"; cerr << "[num].start: " << wordBitPtrs[num].start << "\n"; cerr << "[num].here: " << wordBitPtrs[num].here << "\n"; cerr << "[num+1].start: " << wordBitPtrs[num+1].start << "\n"; FatalError (1, "Bit buffer overrun"); } } public: void Clear (); WordBitPtrs () { wordBitPtrs = NULL; Clear(); } ~WordBitPtrs (); void SetSize (mg_u_long _numWords, mg_u_long _numTags); void ResetPtrs () { if (wordBitPtrs == NULL) return; mg_u_long i; for (i=0; i TagMapDict; // class to handle the translation of occurrence order // to dictionary order for words and tags class OccurToDictConverter { protected: mg_u_long pos; mg_u_long val; FILE *transFile; random_bitio_buffer rbs; mg_u_long wordDictSize; mg_u_long tagDictSize; void SeekStart (); mg_u_long TranslateNum (mg_u_long num); public: OccurToDictConverter (); ~OccurToDictConverter (); void Open (char *filename, mg_u_long _wordDictSize, mg_u_long _tagDictSize); // Close frees all allocated memory void Close (); mg_u_long TranslateWord (mg_u_long occurNum) { return TranslateNum (occurNum); } mg_u_long TranslateTag (mg_u_long occurNum) { return TranslateNum (occurNum+wordDictSize); } }; struct InvfStateRec { mg_ullong start; mg_ullong here; mg_u_long lastFragNum; mg_u_long B; void Clear () { start = here = 0; lastFragNum = B = 0; } InvfStateRec () { Clear (); } }; #define ISR_SIZE 1024 class InvfStateCache { protected: InvfStateRec recCache [ISR_SIZE]; mg_u_long startNum; FILE *stateFile; void ClearCache () { unsigned int i = 0; for (i=0; i= startNum) && (num < startNum + ISR_SIZE)) return recCache[num-startNum]; // not cached, write out this lot of records and read in fseek (stateFile, startNum*sizeof (InvfStateRec), SEEK_SET); fwrite ((char *) recCache, sizeof (InvfStateRec), ISR_SIZE, stateFile); // read in the new set of records ClearCache (); startNum = num - (num % ISR_SIZE); fseek (stateFile, startNum*sizeof (InvfStateRec), SEEK_SET); size_t numbytes = fread ((char *) recCache, sizeof (InvfStateRec), ISR_SIZE, stateFile); if (numbytes != ISR_SIZE){ #ifdef DEBUG fprintf(stderr, "[mgpp/text/ivf.pass2.cpp L410] number of bytes read by fread does not match the requested amount\n"); #endif } return recCache[num-startNum]; } static void ClearCharBuf (char *buf, mg_u_long size) { char *end = buf + size; while (buf != end) *buf++ = 0; } static void ReadWordDict (char *filename) { // read in the perfect hash function for the word dictionary FILE *wordHashFile = open_file (filename, INVF_DICT_HASH_SUFFIX, "rb", MAGIC_HASH, MG_ABORT); if (!(wordHashDict = read_perf_hash_data (wordHashFile))) { FatalError (1, "Unable to read in hash data for word dictionary"); } fclose (wordHashFile); } static void ReadTagDict (char *filename, invf_dict_header &_idh) { // open the file FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb", MAGIC_STEM_BUILD, MG_ABORT); // seek to the start of the tag dictionary fseek (dictFile, _idh.tag_dict_start, SEEK_SET); mg_u_long tagNum; dict_el thisEl; for (tagNum = 0; tagNum < _idh.tag_dict_size; ++tagNum) { thisEl.Read (dictFile); tagMapDict[thisEl.el].tagNum = tagNum; } fclose (dictFile); } static void ReadLevelFile (char *filename) { FILE *f; f = open_file (filename, INVF_LEVEL_SUFFIX, "rb", MAGIC_INVF_LEVELS, MG_ABORT); ivfLevel.Read (f); fclose (f); } void CheckIntOverflow (mg_ullong totalIBits, mg_ullong lastTotalIBits) { if (totalIBits < lastTotalIBits) { fprintf(stderr, "ERROR: The totalIBits counter (%lu byte unsigned integer) has overflowed.\n", sizeof (mg_ullong)); if (sizeof (mg_ullong) < 8) { fprintf(stderr, " Try compiling with GCC to enable use of 8 bytes for this counter.\n"); } fprintf(stderr, " Build aborted.\n"); exit(1); } } // assumes the inverted file state file has been opened static void InitInvfState (char *filename, invf_dict_header &_idh, InvfStateCache &_invfState, mg_ullong &totalIBits, bool wordLevelIndex) { // read in the dictionary, setting inverted state information FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb", MAGIC_STEM_BUILD, MG_ABORT); // seek to the start of the word dictionary fseek (dictFile, _idh.word_dict_start, SEEK_SET); // add the word entries word_dict_el wordEl; wordEl.SetNumLevels (_idh.num_levels); mg_u_long dictWordNum, p; mg_ullong lastTotalIBits; mg_u_long N = _idh.num_frags; for (dictWordNum=0; dictWordNum<_idh.word_dict_size; ++dictWordNum) { // lastTotalIBits is used to detect integer overflow lastTotalIBits = totalIBits; // read the next word and associated information wordEl.Read (dictFile, _idh.num_levels); // update the state record p = wordEl.frag_occur; InvfStateRec &wisr = _invfState.GetRec (dictWordNum); wisr.start = totalIBits; wisr.here = totalIBits; wisr.B = BIO_Bblock_Init (N, p); // add the length of the fragment numbers totalIBits += BIO_Bblock_Bound_b (N, p, wisr.B); // if needed, add the length of the fragment frequency information if (!wordLevelIndex) totalIBits += BIO_Gamma_Bound (wordEl.freq, wordEl.frag_occur); // align next byte #ifdef USE_LONG_LONG totalIBits = (totalIBits + 7ull) & 0xfffffffffffffff8ull; #else totalIBits = (totalIBits + 7ul) & 0xfffffff8ul; #endif CheckIntOverflow (totalIBits, lastTotalIBits); } // seek to the start of the tag dictionary fseek (dictFile, _idh.tag_dict_start, SEEK_SET); // add the tag entries dict_el tagEl; mg_u_long dictTagNum; N = _idh.num_frags; for (dictTagNum=0; dictTagNum<_idh.tag_dict_size; ++dictTagNum) { // lastTotalIBits is used to detect integer overflow lastTotalIBits = totalIBits; // read the next tag and associated information tagEl.Read (dictFile); // update the state record p = tagEl.frag_occur*2; InvfStateRec &tisr = _invfState.GetRec (dictTagNum + _idh.word_dict_size); tisr.start = totalIBits; tisr.here = totalIBits; tisr.B = BIO_Bblock_Init (N+p, p); // add the length of the fragment numbers (two numbers for each // tag, one for start and one for end) totalIBits += BIO_Bblock_Bound_b (N+p, p, tisr.B); // align next byte #ifdef USE_LONG_LONG totalIBits = (totalIBits + 7ull) & 0xfffffffffffffff8ull; #else totalIBits = (totalIBits + 7ul) & 0xfffffff8ul; #endif CheckIntOverflow (totalIBits, lastTotalIBits); } fclose (dictFile); } /* // assumes the chunk tag information has been placed in .first static void PrintChunkInfo (mg_u_long chunkMem, mg_u_long numChunkWords, mg_u_long numChunkTags) { static mg_u_long chunksRead = 0; ++chunksRead; cout << "Chunk Number: " << chunksRead << "\n"; cout << "numChunkDocs " << numDocsInChunk << "\n"; cout << "numChunkFrags " << numFragsInChunk << "\n"; cout << "mem " << chunkMem << "\n"; cout << "numWords " << numChunkWords << "\n"; cout << "numTags " << numChunkTags << "\n\n"; TagMapDict::iterator tagMapHere = tagMapDict.begin(); TagMapDict::iterator tagMapEnd = tagMapDict.end(); while (tagMapHere != tagMapEnd) { mg_u_long tagMapNum = (*tagMapHere).second.tagNum; cout << (*tagMapHere).first << " " << tagMapNum << " " << bitPtrs.GetTagBitPtr(tagMapNum).here << "\n"; ++tagMapHere; } } */ void ReadChunk (invf_dict_header &_idh, bool wordLevelIndex) { // reset globals numChunkDocs = 0; chunkStartFragNum = numFrags; // read in information about this chunk numDocsInChunk = chunkBuf.gamma_decode (NULL) - 1; if (numDocsInChunk == 0) FatalError (1, "The number of docs in the current chunk is 0"); numFragsInChunk = chunkBuf.gamma_decode (NULL) - 1; mg_u_long chunkMem = chunkBuf.gamma_decode (NULL) - 1; if (chunkMem > ivfMemBufSize) FatalError (1, "Chunk memory size is greater than maximum"); mg_u_long numChunkWords = chunkBuf.gamma_decode (NULL) - 1; mg_u_long numChunkTags = chunkBuf.gamma_decode (NULL) - 1; // reset stuff ClearCharBuf (ivfMemBuf, ivfMemBufSize); bitPtrs.ResetPtrs(); // read in the entries in occurrence order storing the // "chunkWordCount" in "start" and the "chunkFragCount" // in "here" mg_u_long numOccur; mg_u_long wordNum; for (numOccur=0; numOccur= 2) wordPtr.here = chunkBuf.gamma_decode (NULL); else wordPtr.here = wordPtr.start; } mg_u_long tagNum; for (numOccur=0; numOccur 0) { wordPtr.lgB = floorlog_2 (BIO_Bblock_Init_W (numFragsInChunk, chunkFragCount)); totalIBits += BIO_Bblock_Bound (numFragsInChunk, chunkFragCount); // use unary encoding for memory buffer encoding of fragment freq if (!wordLevelIndex) { totalIBits += chunkWordCount; } } } for (tagNum=0; tagNum<_idh.tag_dict_size; ++tagNum) { BitPtr &tagPtr = bitPtrs.GetTagBitPtr (tagNum); chunkFragCount = tagPtr.here; tagPtr.start = totalIBits; tagPtr.here = totalIBits; tagPtr.lastFragNum = chunkStartFragNum; tagPtr.lgB = 0; if (chunkFragCount > 0) { mg_u_long pTag = chunkFragCount*2; tagPtr.lgB = floorlog_2 (BIO_Bblock_Init_W (numFragsInChunk+pTag, pTag)); mg_u_long bLen = BIO_Bblock_Bound (numFragsInChunk+pTag, pTag); // cout << tagNum + _idh.word_dict_size << " "; // cout << "numFrags: " << numFragsInChunk // << " chunkFragCount: " << chunkFragCount // << " B: " << 1 << tagPtr.lgB // << " blen: " << blen << "\n"; totalIBits += bLen; } } bitPtrs.GetEndStart() = totalIBits; bitPtrs.GetEndHere() = totalIBits; if ((totalIBits + 7ul) >> 3ul > chunkMem) { cerr << "totalIBits: " << totalIBits << "\n"; cerr << "bytes: " << ((totalIBits + 7ul) >> 3ul) << "\n"; cerr << "chunkMem: " << chunkMem << "\n"; FatalError (1, "Pointers exceed buffer size"); } } int init_ivf_2 (const TagInfo &/*tagInfo*/, char *filename) { // read in compressed dictionary header FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb", MAGIC_STEM_BUILD, MG_ABORT); idh.Read (dictFile); fclose (dictFile); // set the size of the bit ptrs bitPtrs.SetSize (idh.word_dict_size, idh.tag_dict_size); // open the chunk file and read in the maximum memory needed // for the inverted memory buffer chunkFile = open_file (filename, INVF_CHUNK_SUFFIX, "rb", MAGIC_CHUNK, MG_ABORT); ReadUL (chunkFile, ivfMemBufSize); chunkBuf.attachFile (chunkFile); // allocate memory for the inverted buffer ivfMemBuf = new char [ivfMemBufSize]; ClearCharBuf (ivfMemBuf, ivfMemBufSize); // read in the word dictionary ReadWordDict (filename); // read in the tag dictionary ReadTagDict (filename, idh); // read in the level information ReadLevelFile (filename); bool wordLevelIndex = ivfLevel.indexLevel.empty(); // set up the translation table file occurConvert.Open (filename, idh.word_dict_size, idh.tag_dict_size); // reset some globals numDocs = 0; numChunkDocs = 0; numDocsInChunk = 0; numFrags = 0; numFragsInChunk = 0; chunkStartFragNum = 0; strcpy (collectFilename, filename); // create the inverted file mg_ullong totalIBits = 0; FILE *invfFile = create_file (filename, INVF_SUFFIX, "wb", MAGIC_INVF, MG_ABORT); totalIBits += sizeof (mg_u_long) * 8; // magic number totalIBits += 8 * 200; // 200 byte gap -- why?????? fclose (invfFile); // init the inverted file state cache invfState.Open (filename); InitInvfState (filename, idh, invfState, totalIBits, wordLevelIndex); return COMPALLOK; } static void CloseTextTag (IP2TagInfo &tInfo, const UCArray &/*tagName*/) { if (!tInfo.inTag) return; // add this tag to the inverted list BitPtr &tagBitPtr = bitPtrs.GetTagBitPtr (tInfo.tagNum); mg_u_long endFrag = numFrags; int b = 1 << tagBitPtr.lgB; /* cout << (tInfo.tagNum+idh.word_dict_size) << " \"<" << tagName << ">\" " << tInfo.startFrag << " " << endFrag << "\n"; */ mems_bitio_buffer buffer ((u_char *) ivfMemBuf, tagBitPtr.here); buffer.bblock_encode (tInfo.startFrag - tagBitPtr.lastFragNum + 1, b, NULL); buffer.bblock_encode (endFrag - tInfo.startFrag + 1, b, NULL); tagBitPtr.lastFragNum = endFrag; tagBitPtr.here = buffer.position(); buffer.encodeDone(); // check for buffer overrun bitPtrs.CheckTagBufOverrun (tInfo.tagNum); // reset information about this tag tInfo.inTag = false; tInfo.startFrag = 0; } static void ProcessOpenTag (const TextEl &el, bool &inFrag) { // close tag if open IP2TagInfo &tInfo = tagMapDict[el.tagName]; if (tInfo.inTag) CloseTextTag (tInfo, el.tagName); // open this tag tInfo.inTag = true; tInfo.startFrag = numFrags; // check for start of next fragment bool wordLevelIndex = ivfLevel.indexLevel.empty(); if (!wordLevelIndex && el.tagName == ivfLevel.indexLevel) { ++numFrags; inFrag = true; } } static void ProcessCloseTag (const TextEl &el, bool &inFrag) { // check for end of fragment bool wordLevelIndex = ivfLevel.indexLevel.empty(); if (!wordLevelIndex && el.tagName == ivfLevel.indexLevel) { inFrag = false; } IP2TagInfo &tInfo = tagMapDict[el.tagName]; CloseTextTag (tInfo, el.tagName); } static void ProcessText (const TextEl &el, bool &inFrag) { // make sure this text is to be indexed bool wordLevelIndex = ivfLevel.indexLevel.empty(); if (!wordLevelIndex && !inFrag) return; const unsigned char *textHere = &(el.text[0]); const unsigned char *textEnd = &(el.text[el.text.size() - 1]); unsigned char mgWord[MAXSTEMLEN + 1]; if (!inaword_mgpp (textHere, textEnd)) ParseNonindexWord (textHere, textEnd); // Alternately parse off words and non-words from the input while (textHere <= textEnd) { textHere = ParseIndexMGWord (textHere, textEnd, mgWord); textHere = ParseNonindexWord (textHere, textEnd); if (mgWord[0] > 0) { if (wordLevelIndex) ++numFrags; mg_u_long wordNum = perf_hash (wordHashDict, mgWord); /* cout << wordNum << " \""; cout.write (mgWord+1, *mgWord); cout << "\" " << numFrags << "\n"; */ // add this word to the inverted list BitPtr &wordBitPtr = bitPtrs.GetWordBitPtr (wordNum); mg_u_long fragNum = numFrags; int b = 1 << wordBitPtr.lgB; mems_bitio_buffer buffer ((u_char *) ivfMemBuf, wordBitPtr.here); // note: this assumes that fragments don't carry over between // chunks (which they don't because all tags are closed at the // end of each document and chunks are based on document // boundaries), i.e. the first fragment number must be greater // than the starting fragment number of the chunk. if (fragNum > wordBitPtr.lastFragNum) { buffer.bblock_encode ((fragNum - wordBitPtr.lastFragNum - 1) + 1, b, NULL); if (!wordLevelIndex) buffer.encodeBit (1); // freq = 1 } else if (!wordLevelIndex) { // add one to the frequency count for this word buffer.seek (buffer.position()-1); buffer.encodeBit (0); // unary encoding -- last = 1 buffer.encodeBit (1); } wordBitPtr.lastFragNum = fragNum; wordBitPtr.here = buffer.position(); buffer.encodeDone(); // check for buffer overrun bitPtrs.CheckWordBufOverrun (wordNum); } } } // combine the in memory inverted buffer with the disk // based inverted file static void DiskMerge (char *filename) { bool wordLevelIndex = ivfLevel.indexLevel.empty(); // make sure we have something to process if (numChunkDocs <= 0) return; // open the inverted file FILE *invfFile = open_file (filename, INVF_SUFFIX, "rb+", MAGIC_INVF, MG_ABORT); random_bitio_buffer invfOutBuf (invfFile); // set up to decode the entries in memory mems_bitio_buffer memInBuf ((u_char *) ivfMemBuf, 0); // write out the word information mg_u_long wordNum; int b; mg_u_long currFragNum; mg_u_long delta; mg_u_long currFreq; for (wordNum=0; wordNum= numDocsInChunk) ReadChunk (idh, wordLevelIndex); // process each text element TextElArray::const_iterator here = doc.begin(); TextElArray::const_iterator end = doc.end(); while (here != end) { // process this element if ((*here).elType == OpenTagE) ProcessOpenTag (*here, inFrag); else if ((*here).elType == CloseTagE) ProcessCloseTag (*here, inFrag); else ProcessText (*here, inFrag); ++here; } // close off any unclosed tags TagMapDict::iterator tdHere = tagMapDict.begin(); TagMapDict::iterator tdEnd = tagMapDict.end(); while (tdHere != tdEnd) { CloseTextTag ((*tdHere).second, (*tdHere).first); ++tdHere; } // we've processed one more document ++numDocs; ++numChunkDocs; // merge the memory based inverted file with the one on // disk if this is the end of this chunk if (numChunkDocs >= numDocsInChunk) DiskMerge (collectFilename); return COMPALLOK; } static void CondenseInvfFile (char *filename, mg_u_long &bytesOutput) { FILE *inInvfFile = open_file (filename, INVF_SUFFIX, "rb", MAGIC_INVF, MG_ABORT); FILE *outInvfFile = open_file (filename, INVF_SUFFIX, "rb+", MAGIC_INVF, MG_ABORT); // skip the magic number fseek (outInvfFile, sizeof (mg_u_long), SEEK_SET); // write the inverted file header -- use defaults for most things invf_file_header ifh; ifh.no_of_words = idh.word_dict_size; ifh.no_of_tags = idh.tag_dict_size; ifh.word_level_index = (ivfLevel.indexLevel.empty()) ? 1 : 0; ifh.Write (outInvfFile); bytesOutput = ftell (outInvfFile); // process each meaningful byte in the file mg_u_long numEntries = ifh.no_of_words + ifh.no_of_tags; mg_u_long entryNum; mg_ullong lastStart = 0; for (entryNum = 0; entryNum < numEntries; ++entryNum) { InvfStateRec &stateRec = invfState.GetRec (entryNum); // overrun check if (stateRec.start < lastStart) FatalError (1, "Inverted file Buffer overrun"); lastStart = stateRec.start; mg_u_long oldEntryStart = stateRec.start >> 3; mg_u_long oldEntryStartOver = stateRec.start & 7; // should be 0 mg_u_long oldEntryEnd = (stateRec.here + 7) >> 3; // byte after end mg_u_long oldEntryEndOver = stateRec.here & 7; fseek (inInvfFile, oldEntryStart, SEEK_SET); stateRec.here -= stateRec.start; stateRec.start = (mg_ullong)bytesOutput * 8 + oldEntryStartOver; stateRec.here += stateRec.start; while (oldEntryStart < oldEntryEnd) { unsigned char c = getc (inInvfFile); if (oldEntryStart == oldEntryEnd - 1) { u_char ands[8] = {0xff, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe}; c &= ands[oldEntryEndOver]; } putc (c, outInvfFile); ++bytesOutput; ++oldEntryStart; } } fclose (inInvfFile); #ifdef __WIN32__ if (_chsize (_fileno (outInvfFile), bytesOutput) != 0) Message ("Could not truncate invf."); #else if(ftruncate (fileno (outInvfFile), bytesOutput) != 0) { fprintf(stderr, "Could not truncate invf"); } #endif fclose (outInvfFile); } static void OutputInvfIdx (char *filename, mg_u_long invfNumBytes) { FILE *invfIdxFile = create_file (filename, INVF_IDX_SUFFIX, "wb", MAGIC_INVI, MG_ABORT); // process each meaningful byte in the file mg_u_long numEntries = idh.word_dict_size + idh.tag_dict_size; mg_u_long entryNum; for (entryNum = 0; entryNum < numEntries; ++entryNum) { InvfStateRec &stateRec = invfState.GetRec (entryNum); // assumes that inverted entries start at beginning of each byte if (!WriteUL (invfIdxFile, (stateRec.start >> 3))) break; } WriteUL (invfIdxFile, invfNumBytes); fclose (invfIdxFile); } int done_ivf_2 (const TagInfo &/*tagInfo*/, char *filename) { // close most open files if (chunkFile != NULL) { chunkBuf.done(); fclose (chunkFile); chunkFile = NULL; } occurConvert.Close(); // free allocated memory bitPtrs.Clear(); if (ivfMemBuf != NULL) { delete [] ivfMemBuf; ivfMemBuf = NULL; } free_perf_hash (wordHashDict); wordHashDict = NULL; tagMapDict.erase (tagMapDict.begin(), tagMapDict.end()); // condense the inverted file and truncate it // this function also writes out the inverted header mg_u_long invfNumBytes = 0; CondenseInvfFile (filename, invfNumBytes); OutputInvfIdx (filename, invfNumBytes); // close the rest of the open files invfState.Close (); return COMPALLOK; }