/************************************************************************** * * invf.cpp -- Data structures for inverted files * Copyright (C) 1999 Rodger McNab * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * **************************************************************************/ #include "invf.h" #include "UCArray.h" invf_dict_header::invf_dict_header () { Clear(); } invf_dict_header::~invf_dict_header () { } void invf_dict_header::Clear() { lookback = 0; word_dict_start = 0; word_dict_size = 0; tag_dict_start = 0; tag_dict_size = 0; num_docs = 0; num_frags = 0; num_words = 0; total_bytes = 0; index_string_bytes = 0; num_levels = 0; } bool invf_dict_header::Read (FILE *f) { return (ReadUL (f, lookback) && ReadUL (f, word_dict_start) && ReadUL (f, word_dict_size) && ReadUL (f, tag_dict_start) && ReadUL (f, tag_dict_size) && ReadUL (f, num_docs) && ReadUL (f, num_frags) && ReadUL (f, num_words) && ReadUL (f, total_bytes) && ReadUL (f, index_string_bytes) && ReadUL (f, num_levels)); } bool invf_dict_header::Write (FILE *f) const { return (WriteUL (f, lookback) && WriteUL (f, word_dict_start) && WriteUL (f, word_dict_size) && WriteUL (f, tag_dict_start) && WriteUL (f, tag_dict_size) && WriteUL (f, num_docs) && WriteUL (f, num_frags) && WriteUL (f, num_words) && WriteUL (f, total_bytes) && WriteUL (f, index_string_bytes) && WriteUL (f, num_levels)); } void dict_el::Clear () { el.erase (el.begin(), el.end()); frag_occur = 0; freq = 0; } bool dict_el::Read (FILE *f) { return (ReadPreSufStr (f, el) && ReadUL (f, frag_occur) && ReadUL (f, freq)); } bool dict_el::Write (FILE *f, const UCArray *lastEl) const { return (WritePreSufStr (f, lastEl, el) && WriteUL (f, frag_occur) && WriteUL (f, freq)); } void word_dict_el::Clear () { dict_el::Clear(); if (levelFreqs != NULL) delete [] levelFreqs; levelFreqs = NULL; } word_dict_el::~word_dict_el () { if (levelFreqs != NULL) delete [] levelFreqs; } void word_dict_el::SetNumLevels (unsigned long numLevels) { if (levelFreqs != NULL) delete [] levelFreqs; levelFreqs = new unsigned long [numLevels]; } bool word_dict_el::Read (FILE *f, unsigned long numLevels) { if (!dict_el::Read (f)) return false; if (levelFreqs == NULL) return false; unsigned long i; for (i=0; i 0) { if (!bi.Read (f)) return false; blockIdx.push_back (bi); --arraySize; } return true; } bool WriteBlockIdx (FILE *f, const block_idx &blockIdx) { // write out the array size if (!WriteVarLenUL (f, blockIdx.size())) return false; block_idx::const_iterator here = blockIdx.begin(); block_idx::const_iterator end = blockIdx.end(); while (here != end) { if (!(*here).Write (f)) return false; ++here; } return true; } stem_idx_header::stem_idx_header () { Clear (); } void stem_idx_header::Clear () { lookback = 0; dict_size = 0; entries_per_block = 0; num_blocks = 0; max_block_size = 0; blocks_start = 0; block_idx_start = 0; stemmer_num = 0; stem_method = 0; } bool stem_idx_header::Read (FILE *f) { return (ReadUL (f, lookback) && ReadUL (f, dict_size) && ReadUL (f, entries_per_block) && ReadUL (f, num_blocks) && ReadUL (f, max_block_size) && ReadUL (f, blocks_start) && ReadUL (f, block_idx_start) && ReadUL (f, stemmer_num) && ReadUL (f, stem_method)); } bool stem_idx_header::Write (FILE *f) const { return (WriteUL (f, lookback) && WriteUL (f, dict_size) && WriteUL (f, entries_per_block) && WriteUL (f, num_blocks) && WriteUL (f, max_block_size) && WriteUL (f, blocks_start) && WriteUL (f, block_idx_start) && WriteUL (f, stemmer_num) && WriteUL (f, stem_method)); } stem_block_dict_el::stem_block_dict_el () { Clear (); } void stem_block_dict_el::Clear () { el.erase (el.begin(), el.end()); equivWords.erase (equivWords.begin(), equivWords.end()); } bool stem_block_dict_el::Read (FILE *f) { equivWords.erase (equivWords.begin(), equivWords.end()); if (!ReadPreSufStr (f, el)) return false; // read in the array size unsigned long arraySize = 0; if (!ReadVarLenUL (f, arraySize)) return false; // read in the array unsigned long wordNum; while (arraySize > 0) { if (!ReadUL (f, wordNum)) return false; equivWords.push_back (wordNum); --arraySize; } return true; } bool stem_block_dict_el::Write (FILE *f, const UCArray *lastEl) const { if (!WritePreSufStr (f, lastEl, el)) return false; // write out the array size if (!WriteVarLenUL (f, equivWords.size())) return false; vector::const_iterator here = equivWords.begin(); vector::const_iterator end = equivWords.end(); while (here != end) { if (!WriteUL (f, (*here))) return false; ++here; } return true; } invf_file_header::invf_file_header () { Clear (); } void invf_file_header::Clear () { no_of_words = 0; no_of_tags = 0; skip_mode = SKIP_MODE_NO_SKIPS; word_level_index = 0; int i; for (i=0; i<16; ++i) params[i] = 0; } bool invf_file_header::Read (FILE *f) { if (!ReadUL (f, no_of_words) || !ReadUL (f, no_of_tags) || !ReadUL (f, skip_mode) || !ReadUL (f, word_level_index)) return false; int i; for (i=0; i<16; ++i) { if (!ReadUL (f, params[i])) return false; } return true; } bool invf_file_header::Write (FILE *f) const { if (!WriteUL (f, no_of_words) || !WriteUL (f, no_of_tags) || !WriteUL (f, skip_mode) || !WriteUL (f, word_level_index)) return false; int i; for (i=0; i<16; ++i) { if (!WriteUL (f, params[i])) return false; } return true; } bool SearchElNum (const block_idx &bIdx, unsigned long entriesPerBlock, unsigned long elNum, unsigned long &blockIdxNum, unsigned long &blockStartElNum) { blockIdxNum = 0; blockStartElNum = 0; // make sure the element number isn't out of range if (elNum > bIdx.size()*entriesPerBlock) return false; blockIdxNum = elNum / entriesPerBlock; blockStartElNum = blockIdxNum * entriesPerBlock; return true; } bool SearchEl (const block_idx &bIdx, unsigned long entriesPerBlock, const UCArray &el, unsigned long &blockIdxNum, unsigned long &blockStartElNum) { blockIdxNum = 0; blockStartElNum = 0; unsigned long begin = 0; unsigned long bIdxEnd = bIdx.size(); unsigned long end = bIdxEnd; unsigned long mid; while (begin < end) { mid = (begin+end)/2; if (DictCompare (el, bIdx[mid].el) < 0) { end = mid; } else if (mid+1 < bIdxEnd && DictCompare (el, bIdx[mid+1].el) >= 0) { begin = mid+1; } else { blockIdxNum = mid; blockStartElNum = blockIdxNum * entriesPerBlock; return true; } } return false; } // use the block dictionary functions for tag entries, and word block dict // functions for word entries. bool SearchBlockDictElNum (FILE *dictFile, const block_idx &bIdx, unsigned long entriesPerBlock, unsigned long dictSize, unsigned long elNum, block_dict_el &dictEl) { UCArrayClear (dictEl.el); if (elNum >= dictSize) return false; // find the block that contains the element unsigned long blockIdxNum, curElNum; if (!SearchElNum (bIdx, entriesPerBlock, elNum, blockIdxNum, curElNum)) return false; // look for the block fseek (dictFile, bIdx[blockIdxNum].block_ptr, SEEK_SET); do { dictEl.Read (dictFile); } while (curElNum++ < elNum); return true; } bool SearchBlockDictEl (FILE *dictFile, const block_idx &bIdx, unsigned long entriesPerBlock, unsigned long dictSize, const UCArray &el, block_dict_el &dictEl, unsigned long &elNum) { UCArrayClear (dictEl.el); // find the block that contains the element unsigned long blockIdxNum; if (!SearchEl (bIdx, entriesPerBlock, el, blockIdxNum, elNum)) return false; unsigned long blockEndElNum = elNum + entriesPerBlock; if (blockEndElNum > dictSize) blockEndElNum = dictSize; // look for the block fseek (dictFile, bIdx[blockIdxNum].block_ptr, SEEK_SET); while (elNum < blockEndElNum) { dictEl.Read (dictFile); int res = DictCompare (dictEl.el, el); if (res == 0) return true; // found it else if (res > 0) break; // not here ++elNum; } return false; } bool SearchWordBlockDictElNum (FILE *dictFile, const block_idx &bIdx, unsigned long entriesPerBlock, unsigned long dictSize, unsigned long numLevels, unsigned long elNum, word_block_dict_el &dictEl) { UCArrayClear (dictEl.el); if (elNum >= dictSize) return false; // find the block that contains the element unsigned long blockIdxNum, curElNum; if (!SearchElNum (bIdx, entriesPerBlock, elNum, blockIdxNum, curElNum)) return false; // look for the block fseek (dictFile, bIdx[blockIdxNum].block_ptr, SEEK_SET); do { dictEl.Read (dictFile, numLevels); } while (curElNum++ < elNum); return true; } bool SearchWordBlockDictEl (FILE *dictFile, const block_idx &bIdx, unsigned long entriesPerBlock, unsigned long dictSize, unsigned long numLevels, const UCArray &el, word_block_dict_el &dictEl, unsigned long &elNum) { UCArrayClear (dictEl.el); // find the block that contains the element unsigned long blockIdxNum; if (!SearchEl (bIdx, entriesPerBlock, el, blockIdxNum, elNum)) return false; unsigned long blockEndElNum = elNum + entriesPerBlock; if (blockEndElNum > dictSize) blockEndElNum = dictSize; // look for the block fseek (dictFile, bIdx[blockIdxNum].block_ptr, SEEK_SET); while (elNum < blockEndElNum) { dictEl.Read (dictFile, numLevels); int res = DictCompare (dictEl.el, el); if (res == 0) return true; // found it else if (res > 0) break; // not here ++elNum; } return false; } bool SearchStemBlockDictElNum (FILE *dictFile, const block_idx &bIdx, unsigned long entriesPerBlock, unsigned long dictSize, unsigned long elNum, stem_block_dict_el &dictEl) { UCArrayClear (dictEl.el); if (elNum >= dictSize) return false; // find the block that contains the element unsigned long blockIdxNum, curElNum; if (!SearchElNum (bIdx, entriesPerBlock, elNum, blockIdxNum, curElNum)) return false; // look for the block fseek (dictFile, bIdx[blockIdxNum].block_ptr, SEEK_SET); do { dictEl.Read (dictFile); } while (curElNum++ < elNum); return true; } bool SearchStemBlockDictEl (FILE *dictFile, const block_idx &bIdx, unsigned long entriesPerBlock, unsigned long dictSize, const UCArray &el, stem_block_dict_el &dictEl, unsigned long &elNum) { UCArrayClear (dictEl.el); // find the block that contains the element unsigned long blockIdxNum; if (!SearchEl (bIdx, entriesPerBlock, el, blockIdxNum, elNum)) return false; unsigned long blockEndElNum = elNum + entriesPerBlock; if (blockEndElNum > dictSize) blockEndElNum = dictSize; // look for the block fseek (dictFile, bIdx[blockIdxNum].block_ptr, SEEK_SET); while (elNum < blockEndElNum) { dictEl.Read (dictFile); int res = DictCompare (dictEl.el, el); if (res == 0) return true; // found it else if (res > 0) break; // not here ++elNum; } return false; } // ------------------------------------------ // functions for partial term matching // ie find all words that start with xxx bool PartialMatchSearchWordBlockDictEl (FILE *dictFile, const block_idx &bIdx, unsigned long entriesPerBlock, unsigned long dictSize, unsigned long numLevels, const UCArray &el, word_block_dict_el &dictEl, vector &elNumList, bool casefold) { UCArrayClear (dictEl.el); elNumList.erase (elNumList.begin(), elNumList.end()); unsigned long elNum; // find the block that contains the element unsigned long blockIdxNum; // will this work?? if (!SearchEl (bIdx, entriesPerBlock, el, blockIdxNum, elNum)) { return false; } unsigned long blockEndElNum = elNum + entriesPerBlock; if (blockEndElNum > dictSize) blockEndElNum = dictSize; bool still_looking = true; // look for the block fseek (dictFile, bIdx[blockIdxNum].block_ptr, SEEK_SET); // test each element while (elNum < blockEndElNum) { dictEl.Read (dictFile, numLevels); if (StartsWithCasefold(dictEl.el, el)) { if (casefold || StartsWith(dictEl.el, el)) { elNumList.push_back(elNum); } still_looking=false; // we have found one now, so stop the next time we don't have a match } else if (!still_looking) { // we have found a match previously, and now this doesn't match return true; } // else keep looking ++elNum; } // if we get here, we are either still searching for the first // case, or we are collecting terms. if (still_looking) { //we haven't found a match yet, just check the next element, dictEl.Read (dictFile, numLevels); if (!StartsWithCasefold(dictEl.el, el)) { // the first element of the next block is not a match, so there are no matches return false; } else { if (casefold || StartsWith(dictEl.el, el)) { elNumList.push_back(elNum); ++elNum; } } } // just keep accumulating matches until there are no more dictEl.Read (dictFile, numLevels); while (StartsWithCasefold(dictEl.el, el)) { if (casefold || StartsWith(dictEl.el, el)) { elNumList.push_back(elNum); } ++elNum; dictEl.Read (dictFile, numLevels); } return true; } //---------------------------------------------------------------- // functions for full text browse bool NearestSearchWordBlockDictEl (FILE *dictFile, const block_idx &bIdx, unsigned long entriesPerBlock, unsigned long dictSize, unsigned long numLevels, const UCArray &el, word_block_dict_el &dictEl, unsigned long &elNum) { UCArrayClear (dictEl.el); // find the block that contains the element unsigned long blockIdxNum; if (!SearchEl (bIdx, entriesPerBlock, el, blockIdxNum, elNum)) return false; unsigned long blockEndElNum = elNum + entriesPerBlock; if (blockEndElNum > dictSize) blockEndElNum = dictSize; // look for the block fseek (dictFile, bIdx[blockIdxNum].block_ptr, SEEK_SET); while (elNum < blockEndElNum) { dictEl.Read (dictFile, numLevels); int res = DictCompare (el, dictEl.el); // look for the first word that is // greater or equal to the el if (res <= 0) { return true; // found one } ++elNum; } // it must be the last term return true; } bool SearchWordBlockDictElNumRange (FILE *dictFile, const block_idx &bIdx, unsigned long entriesPerBlock, unsigned long dictSize, unsigned long numLevels, unsigned long elNum, unsigned long numWords, UCArrayVector &terms) { word_block_dict_el dictEl; dictEl.SetNumLevels (numLevels); UCArrayClear(dictEl.el); terms.erase(terms.begin(), terms.end()); if (elNum >= dictSize) return false; // find the block that contains the element unsigned long blockIdxNum, curElNum; if (!SearchElNum (bIdx, entriesPerBlock, elNum, blockIdxNum, curElNum)) return false; unsigned long lastElNum = elNum + numWords - 1; if (lastElNum > dictSize) lastElNum = dictSize; // look for the block fseek (dictFile, bIdx[blockIdxNum].block_ptr, SEEK_SET); // get the first term do { dictEl.Read (dictFile, numLevels); } while (curElNum++ < elNum); terms.push_back(dictEl.el); while (curElNum <= lastElNum ) { dictEl.Read(dictFile, numLevels); terms.push_back(dictEl.el); ++curElNum; } return true; } // NOte: before each addition of dictEl to the array, the level freqs array // is deleted, as this was causing problems - generating a seg fault, I think if // the vector had to be reallocated or something. // setNumLevels has to be called each time before a read, now, to set up the level //freqs array. this is necessary. bool SearchWordBlockDictElNumRange (FILE *dictFile, const block_idx &bIdx, unsigned long entriesPerBlock, unsigned long dictSize, unsigned long numLevels, unsigned long elNum, unsigned long numWords, word_block_dict_el_array &terms) { word_block_dict_el dictEl; dictEl.SetNumLevels (numLevels); UCArrayClear(dictEl.el); block_dict_el elem; terms.erase(terms.begin(), terms.end()); if (elNum >= dictSize) return false; // find the block that contains the element unsigned long blockIdxNum, curElNum; if (!SearchElNum (bIdx, entriesPerBlock, elNum, blockIdxNum, curElNum)) return false; unsigned long lastElNum = elNum + numWords - 1; if (lastElNum > dictSize) lastElNum = dictSize; // look for the block fseek (dictFile, bIdx[blockIdxNum].block_ptr, SEEK_SET); // get the first term do { dictEl.Read (dictFile, numLevels); } while (curElNum++ < elNum); dictEl.levelFreqs = NULL; terms.push_back(dictEl); while (curElNum <= lastElNum ) { dictEl.SetNumLevels(numLevels); dictEl.Read(dictFile, numLevels); dictEl.levelFreqs = NULL; terms.push_back(dictEl); ++curElNum; } return true; }