/************************************************************************** * * invf.h -- Data structures for inverted files * Copyright (C) 1999 Rodger McNab * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * * **************************************************************************/ #ifndef H_INVF #define H_INVF #include #include "UCArray.h" // NOTE: This does not include the magic number // header info for .invf.dict file struct invf_dict_header { unsigned long lookback; unsigned long word_dict_start; unsigned long word_dict_size; unsigned long tag_dict_start; unsigned long tag_dict_size; unsigned long num_docs; unsigned long num_frags; unsigned long num_words; unsigned long total_bytes; unsigned long index_string_bytes; unsigned long num_levels; invf_dict_header (); virtual ~invf_dict_header (); virtual void Clear(); virtual bool Read (FILE *f); virtual bool Write (FILE *f) const; }; struct dict_el { UCArray el; // word or tag unsigned long frag_occur; unsigned long freq; virtual void Clear (); dict_el () { Clear (); } virtual ~dict_el () { } // Read assumes that the last word is in el bool Read (FILE *f); bool Write (FILE *f, const UCArray *lastEl) const; }; struct word_dict_el : public dict_el { unsigned long *levelFreqs; void Clear (); word_dict_el () { levelFreqs = NULL; Clear (); } ~word_dict_el (); void SetNumLevels (unsigned long numLevels); // SetNumLevels should be called before either // reading or writing using Read and Write // Read assumes that the last word is in el bool Read (FILE *f, unsigned long numLevels); bool Write (FILE *f, const UCArray *lastEl, unsigned long numLevels) const; }; // wblk = word block // tblk = tag block // this version of the blocked dictionary uses a fixed number // of entries per block, not a fixed block size // info for .invf.dict.blocked file // blocked dict has a heap of blocks, some for words, some for tags // and an index into each set of blocks. The index has pointers to // the first entry in each block. Can do a binary search on the index // to find out which block an elemnet is in struct block_dict_header : public invf_dict_header { // note: word_dict_start and tag_dict_start are undefined // for blocked dictionaries unsigned long entries_per_wblk; // word blocks unsigned long num_wblks; unsigned long max_wblk_size; unsigned long wblk_start; unsigned long wblk_idx_start; unsigned long entries_per_tblk; // tag blocks unsigned long num_tblks; unsigned long max_tblk_size; unsigned long tblk_start; unsigned long tblk_idx_start; block_dict_header (); void Clear (); bool Read (FILE *f); bool Write (FILE *f) const; }; struct block_dict_el { UCArray el; // word or tag unsigned long frag_occur; // # entries in invf file - if have a // word level index, this is the same as freq, otherwise, its the number // of fragments containing this word unsigned long freq; // # of times this word occurs unsigned long invf_ptr; // pointer into inverted file virtual void Clear (); block_dict_el () { Clear (); } virtual ~block_dict_el () { } // Read assumes that the last word is in el // set lastEl = NULL when no lookback is wanted (eg // for the start of a block bool Read (FILE *f); bool Write (FILE *f, const UCArray *lastEl) const; }; struct word_block_dict_el : public block_dict_el { unsigned long *levelFreqs; // freq of the word at each level void Clear (); word_block_dict_el () { levelFreqs = NULL; Clear (); } ~word_block_dict_el (); void SetNumLevels (unsigned long numLevels); // SetNumLevels should be called before either // reading or writing using Read and Write // Read assumes that the last word is in el bool Read (FILE *f, unsigned long numLevels); bool Write (FILE *f, const UCArray *lastEl, unsigned long numLevels) const; }; typedef vector word_block_dict_el_array; struct block_idx_info { UCArray el; unsigned long block_ptr; block_idx_info (); void Clear (); bool Read (FILE *f); bool Write (FILE *f) const; }; // used for an index into the word and tag blocks typedef vector block_idx; bool ReadBlockIdx (FILE *f, block_idx &blockIdx); bool WriteBlockIdx (FILE *f, const block_idx &blockIdx); struct stem_idx_header { unsigned long lookback; unsigned long dict_size; unsigned long entries_per_block; unsigned long num_blocks; unsigned long max_block_size; unsigned long blocks_start; unsigned long block_idx_start; unsigned long stemmer_num; unsigned long stem_method; stem_idx_header (); void Clear (); bool Read (FILE *f); bool Write (FILE *f) const; }; struct stem_block_dict_el { UCArray el; // word or tag vector equivWords; stem_block_dict_el (); void Clear (); // Read assumes that the last word is in el // set lastEl = NULL when no lookback is wanted (eg // for the start of a block bool Read (FILE *f); bool Write (FILE *f, const UCArray *lastEl) const; }; #define SKIP_MODE_NO_SKIPS 0 // invf file - has a list of frags for each word, but the word is not // stored in the invf file - the dictionaries store the words, along // with num entries, and a pointer into invf file struct invf_file_header { unsigned long no_of_words; unsigned long no_of_tags; unsigned long skip_mode; unsigned long word_level_index; // 1 if word level index unsigned long params[16]; invf_file_header (); void Clear (); bool Read (FILE *f); bool Write (FILE *f) const; }; // the search functions returns true if a block that could // satisfy the request is found. these functions assume that // the block index is sorted by DictCompare (or DictLTUCArray) bool SearchElNum (const block_idx &bIdx, unsigned long entriesPerBlock, unsigned long elNum, unsigned long &blockIdxNum, unsigned long &blockStartElNum); bool SearchEl (const block_idx &bIdx, unsigned long entriesPerBlock, const UCArray &el, unsigned long &blockIdxNum, unsigned long &blockStartElNum); // The next six functions use SearchElNum and SearchEl // for a particular type of dictionary (Block, WordBlock, or StemBlock) // and then look up the entry bool SearchBlockDictElNum (FILE *dictFile, const block_idx &bIdx, unsigned long entriesPerBlock, unsigned long dictSize, unsigned long elNum, block_dict_el &dictEl); bool SearchBlockDictEl (FILE *dictFile, const block_idx &bIdx, unsigned long entriesPerBlock, unsigned long dictSize, const UCArray &el, block_dict_el &dictEl, unsigned long &elNum); // assumes the numLevels has been set for dictEl bool SearchWordBlockDictElNum (FILE *dictFile, const block_idx &bIdx, unsigned long entriesPerBlock, unsigned long dictSize, unsigned long numLevels, unsigned long elNum, word_block_dict_el &dictEl); bool SearchWordBlockDictEl (FILE *dictFile, const block_idx &bIdx, unsigned long entriesPerBlock, unsigned long dictSize, unsigned long numLevels, const UCArray &el, word_block_dict_el &dictEl, unsigned long &elNum); bool SearchStemBlockDictElNum (FILE *dictFile, const block_idx &bIdx, unsigned long entriesPerBlock, unsigned long dictSize, unsigned long elNum, stem_block_dict_el &dictEl); bool SearchStemBlockDictEl (FILE *dictFile, const block_idx &bIdx, unsigned long entriesPerBlock, unsigned long dictSize, const UCArray &el, stem_block_dict_el &dictEl, unsigned long &elNum); //----------------------------------------------------- // New functions for partial matching // returns a list of word numbers from the main block index whose // words start with el. bool PartialMatchSearchWordBlockDictEl (FILE *dictFile, const block_idx &bIdx, unsigned long entriesPerBlock, unsigned long dictSize, unsigned long numLevels, const UCArray &el, word_block_dict_el &dictEl, vector &elNumList, bool casefold); //---------------------------------------------------------- // new functions for full text browse bool NearestSearchWordBlockDictEl (FILE *dictFile, const block_idx &bIdx, unsigned long entriesPerBlock, unsigned long dictSize, unsigned long numLevels, const UCArray &el, word_block_dict_el &dictEl, unsigned long &elNum); // returns a list of word_block_dict_el, with no levelfreqs bool SearchWordBlockDictElNumRange (FILE *dictFile, const block_idx &bIdx, unsigned long entriesPerBlock, unsigned long dictSize, unsigned long numLevels, unsigned long elNum, unsigned long numWords, word_block_dict_el_array &terms); // just returns a list of terms bool SearchWordBlockDictElNumRange (FILE *dictFile, const block_idx &bIdx, unsigned long entriesPerBlock, unsigned long dictSize, unsigned long numLevels, unsigned long elNum, unsigned long numWords, UCArrayVector &terms); #endif