/************************************************************************** * * ivf.pass1.cpp -- Memory efficient pass 1 inversion * Copyright (C) 1999 Rodger McNab * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * * $Id: ivf.pass1.cpp 856 2000-01-14 02:26:25Z sjboddie $ * **************************************************************************/ #include "sysfuncs.h" #include "mg_files.h" #include "invf.h" #include "mg.h" #include "build.h" #include "locallib.h" #include "UCArray.h" #include "bitio_m_stdio.h" #include "bitio_gen.h" #include #include "words.h" #include "messages.h" #include "netorder.h" #include "FIvfLevelInfo.h" #include "longlong.h" #if defined(GSDL_USE_OBJECTSPACE) # include #elif defined(GSDL_USE_STL_H) # include #else # include #endif /* $Log$ Revision 1.1 2000/01/14 02:26:07 sjboddie Rodgers new C++ mg * */ // a fragment corresponds to the index level (word-level is the // minimum index level) // structure to determine level information struct LevelWorker { unsigned long lastLevelDocNum; unsigned long count; LevelWorker () { lastLevelDocNum = count = 0; } }; // note: the word is stored in the map struct IvfWordInfo { unsigned long wordCount; // word frequency unsigned long fragCount; // number of fragments that contain the word unsigned long lastFragNum; // last fragment to contain the word unsigned long chunkWordCount; // word frequency within this chunk unsigned long chunkFragCount; // number of fragments within this chunk that // contain the word LevelWorker *levels; // level info for this word IvfWordInfo (); ~IvfWordInfo (); void Clear (); // will delete levels }; typedef map IvfWordInfoMap; typedef vector IvfWordInfoItArray; // tags don't require as much information struct IvfTagInfo { unsigned long tagCount; // tag frequency unsigned long fragCount; // number of fragments that contain the tag unsigned long lastFragNum; // last fragment to contain the tag unsigned long chunkFragCount; // number of fragments within this chunk that // contain the tag IvfTagInfo (); void Clear (); }; typedef map IvfTagInfoMap; typedef vector IvfTagInfoItArray; #define INIT_CHECK_FRAC 0.10 #define CHECK_FRAC 0.75 #define CHECK_CLOSE 0.999 #define CHECK_DIV 1.5 static FILE *ic; // the invf chunk file static stdio_bitio_buffer icb; IvfWordInfoMap ivfWordInfo; IvfWordInfoItArray ivfWordInfoOccurOrder; IvfTagInfoMap ivfTagInfo; IvfTagInfoItArray ivfTagInfoOccurOrder; static unsigned long chunksWritten; static unsigned long maxMemNeeded; static unsigned long numDocs; static unsigned long numChunkDocs; static unsigned long numFrags; static unsigned long numChunkFrags; static unsigned long numWords; // the number of document numbers in the inverted file static unsigned long numChunkEntries; // next entry in the inverted file to check memory // requirements for the current chunk static unsigned long entryCheckPoint; // information about all the different levels static FIvfLevel ivfLevel; IvfWordInfo::IvfWordInfo () { levels = NULL; Clear(); } IvfWordInfo::~IvfWordInfo () { if (levels != NULL) delete [] levels; } void IvfWordInfo::Clear () { wordCount = 0; chunkWordCount = 0; lastFragNum = 0; fragCount = 0; chunkFragCount = 0; if (levels != NULL) { delete [] levels; levels = NULL; } } IvfTagInfo::IvfTagInfo () { Clear(); } void IvfTagInfo::Clear () { tagCount = 0; lastFragNum = 0; fragCount = 0; chunkFragCount = 0; } int init_ivf_1 (const TagInfo &tagInfo, char *file_name) { // set up the chunk file if (!(ic = create_file (file_name, INVF_CHUNK_SUFFIX, "wb", MAGIC_CHUNK, MG_MESSAGE))) return COMPERROR; fwrite (" ", sizeof (u_long), 1, ic); // Space for the maxmem icb.attachFile (ic); icb.encodeStart(); // reset global variables ivfWordInfo.erase (ivfWordInfo.begin(), ivfWordInfo.end()); ivfWordInfoOccurOrder.erase (ivfWordInfoOccurOrder.begin(), ivfWordInfoOccurOrder.end()); ivfTagInfo.erase (ivfTagInfo.begin(), ivfTagInfo.end()); ivfTagInfoOccurOrder.erase (ivfTagInfoOccurOrder.begin(), ivfTagInfoOccurOrder.end()); chunksWritten = 0; maxMemNeeded = 0; numDocs = 0; numChunkDocs = 0; numFrags = 0; numChunkFrags = 0; numWords = 0; numChunkEntries = 0; entryCheckPoint = (unsigned long) ((invf_buffer_size * INIT_CHECK_FRAC) / CHECK_DIV); // init the level information ivfLevel.Clear(); ivfLevel.docTag = tagInfo.docTag; ivfLevel.indexLevel = tagInfo.indexLevel; IvfLevelInfo blankLevel; UCArraySet::const_iterator levelHere = tagInfo.levelTags.begin(); UCArraySet::const_iterator levelEnd = tagInfo.levelTags.end(); while (levelHere != levelEnd) { blankLevel.levelTag = *levelHere; ivfLevel.levelInfo[*levelHere] = blankLevel; levelHere++; } return COMPALLOK; } static void ProcessOpenTag (const TagInfo &tagInfo, const TextEl &el, bool &inFrag) { bool wordLevelIndex = tagInfo.indexLevel.empty(); // check for start of next fragment if (!wordLevelIndex && el.tagName == tagInfo.indexLevel) { numFrags++; numChunkFrags++; inFrag = true; } // update tag stats IvfTagInfo &i = ivfTagInfo[el.tagName]; if (i.tagCount == 0) { // new tag, add to list of iterators IvfTagInfoMap::iterator iIt = ivfTagInfo.find (el.tagName); ivfTagInfoOccurOrder.push_back (iIt); } i.tagCount++; // all open tags count as new tags numChunkEntries++; i.fragCount++; i.chunkFragCount++; i.lastFragNum = numFrags; // update level information IvfLevelInfoMap::iterator levelIt = ivfLevel.levelInfo.find (el.tagName); if (levelIt != ivfLevel.levelInfo.end()) { // is a level tag (*levelIt).second.numEntries++; (*levelIt).second.workInLevel = true; } } static void ProcessCloseTag (const TagInfo &tagInfo, const TextEl &el, bool &inFrag) { bool wordLevelIndex = tagInfo.indexLevel.empty(); // check for end of fragment if (!wordLevelIndex && el.tagName == tagInfo.indexLevel) { inFrag = false; } // update level information IvfLevelInfoMap::iterator levelIt = ivfLevel.levelInfo.find (el.tagName); if (levelIt != ivfLevel.levelInfo.end()) { // is a level tag (*levelIt).second.workInLevel = false; } } static void ProcessText (const TagInfo &tagInfo, const TextEl &el, bool &inFrag) { bool wordLevelIndex = tagInfo.indexLevel.empty(); // make sure this text is to be indexed if (!wordLevelIndex && !inFrag) return; const unsigned char *textHere = el.text.begin(); const unsigned char *textEnd = el.text.end() - 1; UCArray word; if (!inaword (textHere, textEnd)) ParseNonindexWord (textHere, textEnd); // Alternately parse off words and non-words from the input // Each token is then inserted into the set if it does // not exist or has it's frequency count incremented if it does. while (textHere <= textEnd) { textHere = ParseIndexWord (textHere, textEnd, word); textHere = ParseNonindexWord (textHere, textEnd); if (!word.empty()) { numWords++; if (wordLevelIndex) { numFrags++; numChunkFrags++; } // update word stats IvfWordInfo &i = ivfWordInfo[word]; if (i.wordCount == 0) { // new word // add to list of iterators IvfWordInfoMap::iterator iIt = ivfWordInfo.find (word); ivfWordInfoOccurOrder.push_back (iIt); // add level information array if (ivfLevel.levelInfo.size() > 0) i.levels = new LevelWorker [ivfLevel.levelInfo.size()]; } i.wordCount++; i.chunkWordCount++; if (numFrags > i.lastFragNum) { numChunkEntries++; i.fragCount++; i.chunkFragCount++; i.lastFragNum = numFrags; } // update level information for this word if (i.levels != NULL) { IvfLevelInfoMap::iterator levelHere = ivfLevel.levelInfo.begin(); IvfLevelInfoMap::iterator levelEnd = ivfLevel.levelInfo.end(); LevelWorker *levelWorkerPtr = i.levels; while (levelHere != levelEnd) { // check to make sure the level encompases this fragment if (!(*levelHere).second.workInLevel) { cerr << "Level tag <" << (*levelHere).first << "> does not encompass all fragments\n"; exit (1); } if ((*levelHere).second.numEntries > (*levelWorkerPtr).lastLevelDocNum) { (*levelWorkerPtr).lastLevelDocNum = (*levelHere).second.numEntries; (*levelWorkerPtr).count ++; } levelHere++; levelWorkerPtr++; } } } } } static unsigned long MemoryRequired (bool wordLevelIndex) { register unsigned long total = 0; // add memory required for word entries IvfWordInfoMap::const_iterator wordHere = ivfWordInfo.begin(); IvfWordInfoMap::const_iterator wordEnd = ivfWordInfo.end(); while (wordHere != wordEnd) { register const IvfWordInfo &info = (*wordHere).second; if (info.chunkFragCount > 0) { total += BIO_Bblock_Bound (numChunkFrags, info.chunkFragCount); if (!wordLevelIndex) { total += info.chunkWordCount; } } wordHere++; } // add memory required for tag entries IvfTagInfoMap::const_iterator tagHere = ivfTagInfo.begin(); IvfTagInfoMap::const_iterator tagEnd = ivfTagInfo.end(); while (tagHere != tagEnd) { register const IvfTagInfo &info = (*tagHere).second; if (info.chunkFragCount > 0) { // two d entries for each frag entry unsigned long pTag = info.chunkFragCount*2; total += BIO_Bblock_Bound (numChunkFrags+pTag, pTag); } tagHere++; } total = (total + 7) >> 3; return total; } /* static void PrintChunkInfo (unsigned long mem) { cout << "Chunk Number: " << chunksWritten << "\n"; cout << "numChunkDocs " << numChunkDocs << "\n"; cout << "numChunkFrags " << numChunkFrags << "\n"; cout << "mem " << mem << "\n"; cout << "numWords " << ivfWordInfo.size() << "\n"; cout << "numTags " << ivfTagInfo.size() << "\n\n"; // output debug tag information in dictionary order IvfTagInfoMap::iterator tagMapHere = ivfTagInfo.begin(); IvfTagInfoMap::iterator tagMapEnd = ivfTagInfo.end(); unsigned long tagNum = 0; while (tagMapHere != tagMapEnd) { cout << (*tagMapHere).first << " " << tagNum << " " << (*tagMapHere).second.chunkFragCount << "\n"; tagNum++; tagMapHere++; } } */ static void OutputChunkInfo (unsigned long mem, bool /*wordLevelIndex*/) { chunksWritten++; // sanity check if (ivfWordInfo.size() != ivfWordInfoOccurOrder.size()) { Message ("ERROR: Word information size mismatch: %u vs %u\n", (unsigned int)ivfWordInfo.size(), (unsigned int)ivfWordInfoOccurOrder.size()); exit (1); } if (ivfTagInfo.size() != ivfTagInfoOccurOrder.size()) { Message ("ERROR: Tag information size mismatch: %u vs %u\n", (unsigned int)ivfTagInfo.size(), (unsigned int)ivfTagInfoOccurOrder.size()); exit (1); } icb.gamma_encode (numChunkDocs + 1, NULL); icb.gamma_encode (numChunkFrags + 1, NULL); icb.gamma_encode (mem + 1, NULL); icb.gamma_encode (ivfWordInfo.size() + 1, NULL); icb.gamma_encode (ivfTagInfo.size() + 1, NULL); /* PrintChunkInfo (mem);*/ // output word information in occurance order IvfWordInfoItArray::iterator wordHere = ivfWordInfoOccurOrder.begin(); IvfWordInfoItArray::iterator wordEnd = ivfWordInfoOccurOrder.end(); while (wordHere != wordEnd) { register IvfWordInfo &ivfWordInfo = (*(*wordHere)).second; icb.gamma_encode (ivfWordInfo.chunkWordCount + 1, NULL); if (ivfWordInfo.chunkWordCount >= 2) { icb.gamma_encode (ivfWordInfo.chunkFragCount, NULL); } ivfWordInfo.lastFragNum = 0; ivfWordInfo.chunkWordCount = 0; ivfWordInfo.chunkFragCount = 0; wordHere++; } // output tag information in occurance order IvfTagInfoItArray::iterator tagHere = ivfTagInfoOccurOrder.begin(); IvfTagInfoItArray::iterator tagEnd = ivfTagInfoOccurOrder.end(); while (tagHere != tagEnd) { register IvfTagInfo &ivfTagInfo = (*(*tagHere)).second; icb.gamma_encode (ivfTagInfo.chunkFragCount + 1, NULL); ivfTagInfo.lastFragNum = 0; ivfTagInfo.chunkFragCount = 0; tagHere++; } numChunkDocs = 0; numChunkFrags = 0; numChunkEntries = 0; } int process_ivf_1 (const TagInfo &tagInfo, const TextElArray &doc) { bool wordLevelIndex = tagInfo.indexLevel.empty(); bool inFrag = false; if (wordLevelIndex) inFrag = true; // unconditional numDocs++; numChunkDocs++; // process each text element in this document TextElArray::const_iterator here = doc.begin(); TextElArray::const_iterator end = doc.end(); while (here != end) { if ((*here).elType == OpenTagE) ProcessOpenTag (tagInfo, *here, inFrag); else if ((*here).elType == CloseTagE) ProcessCloseTag (tagInfo, *here, inFrag); else if ((*here).elType == TextE) ProcessText (tagInfo, *here, inFrag); here++; } // check the amount of memory needed for this chunk if (numChunkEntries >= entryCheckPoint) { unsigned long mem = MemoryRequired (wordLevelIndex); if (mem >= invf_buffer_size * CHECK_CLOSE) { if (mem > maxMemNeeded) maxMemNeeded = mem; OutputChunkInfo (mem, wordLevelIndex); entryCheckPoint = (unsigned long) ((invf_buffer_size * INIT_CHECK_FRAC) / CHECK_DIV); } else { entryCheckPoint = (unsigned long) (entryCheckPoint * ((CHECK_FRAC * (invf_buffer_size - mem)) / mem) + entryCheckPoint); if (entryCheckPoint <= numChunkEntries) entryCheckPoint = numChunkEntries + 1; } } return COMPALLOK; } static void CalcInvfDictSize (unsigned long &totalBytes, unsigned long &indexStringBytes) { totalBytes = 0; // The sum of the length of all words, including // the length byte indexStringBytes = 0; // The amount of space required to store the // words in the diction, this takes into account // the prefixes const UCArray *lastWord = NULL; // calculate size of word information IvfWordInfoMap::iterator wordHere = ivfWordInfo.begin(); IvfWordInfoMap::iterator wordEnd = ivfWordInfo.end(); while (wordHere != wordEnd) { unsigned long wordSize = (*wordHere).first.size(); totalBytes += wordSize + 1; indexStringBytes += wordSize + 2; if (lastWord != NULL) indexStringBytes -= PrefixLen (*lastWord, (*wordHere).first); lastWord = &((*wordHere).first); wordHere++; } // calculate size of tag information lastWord = NULL; IvfTagInfoMap::iterator tagHere = ivfTagInfo.begin(); IvfTagInfoMap::iterator tagEnd = ivfTagInfo.end(); while (tagHere != tagEnd) { unsigned long tagSize = (*tagHere).first.size(); totalBytes += tagSize + 1; indexStringBytes += tagSize + 2; if (lastWord != NULL) indexStringBytes -= PrefixLen (*lastWord, (*tagHere).first); lastWord = &((*tagHere).first); tagHere++; } } // OutputInvfDict (): // writes out the stemmed dictionary file // in the following format // lookback value (int) // totalbytes value (int) // indexstringbytes (int) // for each word // wordlen (4 bits) // prefix match (4 bits) // word (wordlen bytes) // word frequency (int) // word count (int) // // Accesses outside variables: // // Return value...: static void OutputInvfDict (char *filename) { // create the dictionary header invf_dict_header idh; idh.word_dict_size = ivfWordInfo.size(); idh.tag_dict_size = ivfTagInfo.size(); idh.num_docs = numDocs; idh.num_frags = numFrags; idh.num_words = numWords; idh.num_levels = ivfLevel.levelInfo.size(); CalcInvfDictSize (idh.total_bytes, idh.index_string_bytes); // create the inverted dictionary file FILE *sp; if (!(sp = create_file (filename, INVF_DICT_SUFFIX, "wb", MAGIC_STEM_BUILD, MG_MESSAGE))) return; // write out the dictionary header if (!idh.Write (sp)) { fclose (sp); return; } // remember where the word dictionary starts idh.word_dict_start = ftell (sp); // output the word dictionary const UCArray *lastWord = NULL; IvfWordInfoMap::iterator wordHere = ivfWordInfo.begin(); IvfWordInfoMap::iterator wordEnd = ivfWordInfo.end(); while (wordHere != wordEnd) { // get the prefix and suffix lengths const UCArray &thisWord = (*wordHere).first; WritePreSufStr (sp, lastWord, thisWord); // output the number of fragments the word appeared in and the // number of times the word appeared WriteUL (sp, (*wordHere).second.fragCount); WriteUL (sp, (*wordHere).second.wordCount); // output frequency information for each level // note that we are expecting every word to have // level information LevelWorker *lwHere = (*wordHere).second.levels; LevelWorker *lwEnd = lwHere + idh.num_levels; while (lwHere != lwEnd) { WriteUL (sp, (*lwHere).count); lwHere++; } lastWord = &thisWord; wordHere++; } // remember where the tag dictionary starts idh.tag_dict_start = ftell (sp); // output the tag dictionary const UCArray *lastTag = NULL; IvfTagInfoMap::iterator tagHere = ivfTagInfo.begin(); IvfTagInfoMap::iterator tagEnd = ivfTagInfo.end(); while (tagHere != tagEnd) { // get the prefix and suffix lengths const UCArray &thisTag = (*tagHere).first; WritePreSufStr (sp, lastTag, thisTag); // output the number of fragments the tag appeared in and the // number of times the tag appeared WriteUL (sp, (*tagHere).second.fragCount); WriteUL (sp, (*tagHere).second.tagCount); lastTag = &thisTag; tagHere++; } // write out the updated header fseek (sp, sizeof (u_long), SEEK_SET); if (!idh.Write (sp)) { fclose (sp); return; } fclose (sp); } static void OutputLevelFile (char *filename) { // create the level file FILE *f; if (!(f = create_file (filename, INVF_LEVEL_SUFFIX, "wb", MAGIC_INVF_LEVELS, MG_MESSAGE))) return; // write out the information ivfLevel.Write (f); // close the file fclose (f); } static void OutputTransFile (char *filename) { // save the word number (in lastFragNum :-/) int i = 0; IvfWordInfoMap::iterator wordHere = ivfWordInfo.begin(); IvfWordInfoMap::iterator wordEnd = ivfWordInfo.end(); while (wordHere != wordEnd) { (*wordHere).second.lastFragNum = i; i++; wordHere++; } // save the tag number (in lastFragNum :-/) i = 0; IvfTagInfoMap::iterator tagHere = ivfTagInfo.begin(); IvfTagInfoMap::iterator tagEnd = ivfTagInfo.end(); while (tagHere != tagEnd) { (*tagHere).second.lastFragNum = i; i++; tagHere++; } // create the translation file FILE *f; if (!(f = create_file (filename, INVF_CHUNK_TRANS_SUFFIX, "wb", MAGIC_CHUNK_TRANS, MG_MESSAGE))) return; stdio_bitio_buffer buffer(f); buffer.encodeStart(); // write out the word translation table unsigned long wordDictSize = ivfWordInfoOccurOrder.size(); IvfWordInfoItArray::iterator wordItHere = ivfWordInfoOccurOrder.begin(); IvfWordInfoItArray::iterator wordItEnd = ivfWordInfoOccurOrder.end(); unsigned long oN = 0; while (wordItHere != wordItEnd) { register IvfWordInfo &ivfWordInfo = (*(*wordItHere)).second; buffer.binary_encode (ivfWordInfo.lastFragNum + 1, wordDictSize + 1, NULL); oN++; wordItHere++; } // write out the tag translation table unsigned long tagDictSize = ivfTagInfoOccurOrder.size(); IvfTagInfoItArray::iterator tagItHere = ivfTagInfoOccurOrder.begin(); IvfTagInfoItArray::iterator tagItEnd = ivfTagInfoOccurOrder.end(); while (tagItHere != tagItEnd) { register IvfTagInfo &ivfTagInfo = (*(*tagItHere)).second; buffer.binary_encode (ivfTagInfo.lastFragNum + 1, tagDictSize + 1, NULL); oN++; tagItHere++; } // finish encoding and close the file buffer.encodeDone(); fclose (f); } #ifndef SILENT static void PrintStats () { Message ("Inverted buffer size: %8u bytes\n", invf_buffer_size); Message ("Max memory needed for 1 chunk: %8u bytes\n", maxMemNeeded); Message ("Number of chunks written: %8u\n", chunksWritten); Message ("Number of documents: %8u\n", numDocs); Message ("Number of fragments: %8u\n", numFrags); Message ("Number of words: %8u\n", numWords); Message ("Size of word dictionary: %8u\n", ivfWordInfo.size()); Message ("Size of tag dictionary: %8u\n", ivfTagInfo.size()); } #endif int done_ivf_1 (const TagInfo &tagInfo, char * filename) { bool wordLevelIndex = tagInfo.indexLevel.empty(); char *temp_str = msg_prefix; msg_prefix = "ivf.pass1"; // output the last chunk if (numChunkDocs > 0) { unsigned long mem = MemoryRequired (wordLevelIndex); OutputChunkInfo (mem, wordLevelIndex); if (mem > maxMemNeeded) maxMemNeeded = mem; } // write out and encoded 1 to say there are no more chunks icb.gamma_encode (1, NULL); icb.encodeDone (); // write out the maximum memory required and close the file fseek (ic, sizeof (long), 0); WriteUL (ic, maxMemNeeded); fclose (ic); // output the inverted dictionary OutputInvfDict (filename); // write out the translation file OutputTransFile (filename); // output the level information OutputLevelFile (filename); // output statistics #ifndef SILENT PrintStats (); #endif msg_prefix = temp_str; return COMPALLOK; }