/**************************************************************************
 *
 * Terms.cpp -- Query related functions
 * Copyright (C) 1999  Rodger McNab
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 **************************************************************************/

#include "Terms.h"
#include "words.h"
#include "stemmer.h"
#include "bitio_gen.h"
#include "bitio_m_stdio.h"

void QueryInfo::Clear () {
  UCArrayClear (docLevel);
  maxDocs = 0;
  sortByRank = true;
  exactWeights = false;
  needRankInfo = false;
  needTermFreqs = false;
}

void TermFreqData::Clear () {
  UCArrayClear (tag);
  UCArrayClear (term);
  equivTerms.erase (equivTerms.begin(), equivTerms.end());
  stemMethod = 0;
  matchDocs = 0;
  termFreq = 0;
}

ostream &operator<< (ostream &s, const TermFreqData &t) {
  s << "<" << t.tag << ">\"" << t.term << "\"stem("
    << t.stemMethod << ")equiv terms(";

  mg_u_long i;
  for (i = 0; i < t.equivTerms.size(); ++i) {
    s << t.equivTerms[i] << ", ";
  }
  s << ") " << t.matchDocs << " " << t.termFreq;
  return s;
}
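// FindWordNumbers maps a query term to the word numbers of all terms
// that should be treated as equivalent to it under the given stem
// method.  stemMethod selects which stem index to consult; in mgpp the
// conventional values appear to be 0 = exact match, 1 = case-folding,
// 2 = stemming, 3 = both, with accent folding and the
// STEM_PARTIAL_MATCH flag added later -- this reading of the values is
// an assumption based on the surrounding code and the MG
// documentation, not something this file states itself.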
void FindWordNumbers (IndexData &indexData,
                      const UCArray &term,
                      mg_u_long stemMethod,
                      vector<mg_u_long> &equivWords) {
  equivWords.erase (equivWords.begin(), equivWords.end());

  // if the stem method specified is not a valid one (i.e. there was no
  // appropriate stem index built), we set it to 0 -- unless we have
  // partial matching, in which case we are not using the stem indexes
  // anyway
  if (!(stemMethod & STEM_PARTIAL_MATCH) &&
      indexData.stemFile[stemMethod-1] == NULL) {
    cerr << "Stem index for method " << stemMethod
         << " does not exist, not stemming\n";
    stemMethod = 0;
  }

  if (stemMethod == 0 || (stemMethod & STEM_PARTIAL_MATCH)) {
    // no stem index is involved: look the term up in the word
    // dictionary as-is
    mg_u_long wordElNum = 0;
    mg_u_long numLevels = indexData.bdh.num_levels;
    word_block_dict_el wordDictEl;
    wordDictEl.SetNumLevels (numLevels);
    if (SearchWordBlockDictEl (indexData.dictFile, indexData.biWords,
                               indexData.bdh.entries_per_wblk,
                               indexData.bdh.word_dict_size,
                               numLevels, term, wordDictEl, wordElNum))
      equivWords.push_back (wordElNum);
    return;
  }

  // need to stem this word and find it in the stem index
  unsigned char mgWord[MAXSTEMLEN + 1];
  UCArray stemTerm;
  mg_u_long stemmerNum = 0;

  if (stemMethod > STEM_MAX) {
    return; // TODO: throw an error here
  }

  stemmerNum = indexData.sih[stemMethod-1].stemmer_num;

  // convert the word to an "mg word": a length byte followed by the
  // characters themselves
  mgWord[0] = term.size();
  memcpy ((char *)&mgWord[1], &(term[0]), term.size());

  // stem the word
  mgpp_stemmer (stemMethod, stemmerNum, mgWord);

  // convert the result back to a UCArray
  stemTerm.insert (stemTerm.end(), &mgWord[1], &mgWord[1] + mgWord[0]);

  // need to look up this term in the appropriate dictionary
  stem_block_dict_el stemDictEl;
  mg_u_long stemElNum;
  bool result = false;

  /* [JFG - Mar 06: Accent folding patch] */
  result = SearchStemBlockDictEl (indexData.stemFile[stemMethod-1],
                                  indexData.sii[stemMethod-1],
                                  indexData.sih[stemMethod-1].entries_per_block,
                                  indexData.sih[stemMethod-1].dict_size,
                                  stemTerm, stemDictEl, stemElNum);

  if (result) {
    equivWords = stemDictEl.equivWords;
  }
}

void ReadTermFragData (IndexData &indexData,
                       bool needFragFreqs,
                       mg_u_long termNum,
                       FragData &fragData,
                       FragRangeArray *fragLimits,
                       UCArray &termWord) {
  fragData.Clear();

  // look up the word in the dictionary
  mg_u_long numLevels = indexData.bdh.num_levels;
  word_block_dict_el wordDictEl;
  wordDictEl.SetNumLevels (numLevels);
  if (!SearchWordBlockDictElNum (indexData.dictFile,
                                 indexData.biWords,
                                 indexData.bdh.entries_per_wblk,
                                 indexData.bdh.word_dict_size,
                                 numLevels, termNum, wordDictEl))
    return; // nothing more to do

  fragData.matchDocs = wordDictEl.levelFreqs[indexData.curLevelNum];
  termWord = wordDictEl.el;

  // seek to the appropriate place in the inverted file
  fseek (indexData.invfFile, wordDictEl.invf_ptr, SEEK_SET);

  stdio_bitio_buffer buffer (indexData.invfFile);

  mg_u_long B = BIO_Bblock_Init (indexData.bdh.num_frags,
                                 wordDictEl.frag_occur);
  mg_u_long fragNum = 0;
  mg_u_long termFreq = 0;
  mg_u_long fragLimitI = 0;
  mg_u_long i;
  for (i = 0; i < wordDictEl.frag_occur; ++i) {
    // fragment numbers are stored as Bblock-coded gaps, followed by a
    // gamma-coded within-fragment frequency (implicitly 1 for a
    // word-level index)
    fragNum += buffer.bblock_decode (B, NULL);
    if (indexData.ifh.word_level_index) termFreq = 1;
    else termFreq = buffer.gamma_decode (NULL);

    // go to the right fragment range
    if (fragLimits != NULL) {
      while (fragLimitI+1 < (*fragLimits).size() &&
             fragNum > (*fragLimits)[fragLimitI+1].rangeStart) {
        ++fragLimitI;
      }
    }

    // add the entry if it is within the limits
    if ((fragLimits == NULL) ||
        (fragLimitI < (*fragLimits).size() &&
         fragNum > (*fragLimits)[fragLimitI].rangeStart &&
         fragNum <= (*fragLimits)[fragLimitI].rangeEnd)) {
      fragData.fragNums.push_back (fragNum);
      if (needFragFreqs)
        fragData.fragFreqs.push_back (termFreq);
    }
  }
  buffer.done();
}
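// CombineFragData implements the OR-combination of two fragment
// lists: a single linear merge of the two sorted fragNums arrays that
// keeps every fragment appearing in either list and sums the
// frequencies of fragments appearing in both.  A small worked example
// (made-up fragment numbers, not from any real index):
//
//   f1.fragNums = {1, 3, 5}   f1.fragFreqs = {2, 1, 4}
//   f2.fragNums = {3, 4}      f2.fragFreqs = {5, 1}
//
//   out.fragNums  = {1, 3, 4, 5}
//   out.fragFreqs = {2, 6, 1, 4}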
void CombineFragData (bool needFragFreqs,
                      const FragData &f1,
                      const FragData &f2,
                      FragData &outFragData) {
  outFragData.Clear();

  // the new number of matching documents is the maximum of the two
  // input matching document counts -- it is assumed that these are at
  // the same document level
  outFragData.matchDocs = (f1.matchDocs > f2.matchDocs) ?
    f1.matchDocs : f2.matchDocs;

  // merge the two sorted fragment lists (OR)
  mg_u_long f1I = 0, f1Size = f1.fragNums.size();
  mg_u_long f2I = 0, f2Size = f2.fragNums.size();
  while (f1I < f1Size || f2I < f2Size) {
    if (f2I < f2Size &&
        (f1I >= f1Size || f1.fragNums[f1I] > f2.fragNums[f2I])) {
      // output f2I
      outFragData.fragNums.push_back (f2.fragNums[f2I]);
      if (needFragFreqs)
        outFragData.fragFreqs.push_back (f2.fragFreqs[f2I]);
      ++f2I;

    } else if (f1I < f1Size &&
               (f2I >= f2Size || f1.fragNums[f1I] < f2.fragNums[f2I])) {
      // output f1I
      outFragData.fragNums.push_back (f1.fragNums[f1I]);
      if (needFragFreqs)
        outFragData.fragFreqs.push_back (f1.fragFreqs[f1I]);
      ++f1I;

    } else {
      // the fragment numbers must be equal: combine f1I and f2I
      outFragData.fragNums.push_back (f1.fragNums[f1I]);
      if (needFragFreqs)
        outFragData.fragFreqs.push_back (f1.fragFreqs[f1I] +
                                         f2.fragFreqs[f2I]);
      ++f1I;
      ++f2I;
    }
  }
}

void AndCombineFragData (bool needFragFreqs,
                         FragData &fragData,
                         const FragData &comFragData,
                         mg_s_long startRange,
                         mg_s_long endRange,
                         const FragRangeArray *fragLimits) {
  // sanity check on the range
  if (startRange > endRange) {
    mg_s_long temp = endRange;
    endRange = startRange;
    startRange = temp;
  }

  // the intersection can match no more documents than the smaller input
  if (comFragData.matchDocs < fragData.matchDocs)
    fragData.matchDocs = comFragData.matchDocs;

  mg_u_long fragDataI = 0;
  mg_u_long fragDataSize = fragData.fragNums.size();
  mg_u_long comFragDataI = 0;
  mg_u_long comFragDataSize = comFragData.fragNums.size();
  mg_u_long fragLimitI = 0;
  mg_u_long fragLimitSize = (fragLimits == NULL) ? 0 : (*fragLimits).size();
  mg_u_long outI = 0;

  while (fragDataI < fragDataSize &&
         comFragDataI < comFragDataSize) {
    mg_s_long fragNum = (mg_s_long)fragData.fragNums[fragDataI];
    mg_s_long comFragNum = (mg_s_long)comFragData.fragNums[comFragDataI];

    // go to the right fragment limit (for the com frag)
    if (fragLimits != NULL) {
      while (fragLimitI+1 < fragLimitSize &&
             comFragNum > (mg_s_long)(*fragLimits)[fragLimitI+1].rangeStart) {
        ++fragLimitI;
      }
    }

    if (fragNum <= comFragNum+startRange ||
        (fragLimits != NULL &&
         fragNum <= (mg_s_long)(*fragLimits)[fragLimitI].rangeStart)) {
      ++fragDataI;

    } else if (fragNum > comFragNum+endRange ||
               (fragLimits != NULL &&
                fragNum > (mg_s_long)(*fragLimits)[fragLimitI].rangeEnd)) {
      ++comFragDataI;

    } else {
      // equal and within the tag: keep this entry, with the minimum
      // of the two frequencies
      fragData.fragNums[outI] = comFragNum;
      if (needFragFreqs) {
        fragData.fragFreqs[outI] =
          (fragData.fragFreqs[fragDataI] < comFragData.fragFreqs[comFragDataI]) ?
          fragData.fragFreqs[fragDataI] : comFragData.fragFreqs[comFragDataI];
      }
      ++fragDataI;
      ++comFragDataI;
      ++outI;
    }
  }

  // erase the unused part of fragData
  fragData.fragNums.erase (fragData.fragNums.begin()+outI,
                           fragData.fragNums.end());
  if (needFragFreqs)
    fragData.fragFreqs.erase (fragData.fragFreqs.begin()+outI,
                              fragData.fragFreqs.end());
  else
    fragData.fragFreqs.erase (fragData.fragFreqs.begin(),
                              fragData.fragFreqs.end());
}
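// FragsToQueryResult turns a term's fragment list into a per-document
// result, scoring documents with the usual MG-style tf-idf weights:
//
//   wordLog = log (N / ft)    N   = number of documents at this level
//                             ft  = number of documents with the term
//   Wqt     = fqt * wordLog   fqt = the query-term weight
//   Wdt     = fdt * wordLog   fdt = the term's frequency in document d
//   rank(d) = Wqt * Wdt
//
// No document-length normalisation happens here; presumably any
// cosine-style normalisation is applied later by the query code using
// the index's document weights.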
void FragsToQueryResult (IndexData &indexData,
                         const QueryInfo &queryInfo,
                         const FragData &termData,
                         const UCArray &tag,
                         const UCArray &term,
                         mg_u_long stemMethod,
                         mg_u_long termWeight,
                         UCArrayVector &equivTerms,
                         QueryResult &result) {
  bool needRanks = (queryInfo.sortByRank || queryInfo.needRankInfo);

  result.Clear();

  // log (N / ft)
  mg_u_long N = indexData.levels.levelInfo[indexData.curLevel].numEntries;
  float wordLog = log ((double)N / (double)termData.matchDocs);

  // Wqt = fqt * log (N / ft)
  // note: terms are allowed to have a weight of zero so
  // they can be excluded from the ranking
  float Wqt = termWeight * wordLog;

  // Wdt = fdt * log (N / ft)
  float Wdt;

  mg_u_long termDataI = 0;
  mg_u_long termDataSize = termData.fragNums.size();
  mg_u_long levelDocNum = 0;

  mg_u_long termDocFreq = 0;
  mg_u_long lastLevelDocNum = 0;
  mg_u_long overallwordfreq = 0;

  while (termDataI < termDataSize) {
    if (indexData.levelConverter.FragToLevel (termData.fragNums[termDataI],
                                              levelDocNum)) {
      if (levelDocNum != lastLevelDocNum) {
        if (lastLevelDocNum > 0) {
          // add this document's information
          if (needRanks) {
            Wdt = termDocFreq * wordLog;
            result.ranks.push_back (Wqt * Wdt);
          }
          result.docs.push_back (lastLevelDocNum);
        }
        lastLevelDocNum = levelDocNum;
        termDocFreq = 0;
      }
      if (needRanks) {
        termDocFreq += termData.fragFreqs[termDataI];
        overallwordfreq += termData.fragFreqs[termDataI];
      }
    }
    ++termDataI;
  }

  if (lastLevelDocNum > 0) {
    // add the last document's information
    if (needRanks) {
      Wdt = termDocFreq * wordLog;
      result.ranks.push_back (Wqt * Wdt);
    }
    result.docs.push_back (lastLevelDocNum);
  }

  // add the term frequency information
  if (queryInfo.needTermFreqs) {
    TermFreqData termFreqData;
    termFreqData.tag = tag;
    termFreqData.term = term;
    termFreqData.stemMethod = stemMethod;
    termFreqData.equivTerms = equivTerms;
    termFreqData.matchDocs = termData.matchDocs;
    // note: overallwordfreq is only accumulated when ranks are
    // needed, so this is zero when neither sortByRank nor
    // needRankInfo is set
    termFreqData.termFreq = overallwordfreq;
    result.termFreqs.push_back (termFreqData);
  }
}
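// AndFragsToQueryResult is the AND counterpart of the function above:
// instead of creating a fresh result it intersects the incoming
// result.docs with the set of documents containing this term, adding
// this term's Wqt * Wdt to the rank of each surviving document.
// Documents already in the result that do not contain the term are
// dropped, as are this term's documents that are not already in the
// result.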
void AndFragsToQueryResult (IndexData &indexData,
                            const QueryInfo &queryInfo,
                            const FragData &termData,
                            const UCArray &tag,
                            const UCArray &term,
                            mg_u_long stemMethod,
                            mg_u_long termWeight,
                            UCArrayVector &equivTerms,
                            QueryResult &result) {
  bool needRanks = (queryInfo.sortByRank || queryInfo.needRankInfo);

  // log (N / ft)
  float wordLog =
    log ((double)indexData.levels.levelInfo[indexData.curLevel].numEntries /
         (double)termData.matchDocs);

  // Wqt = fqt * log (N / ft)
  // note: terms are allowed to have a weight of zero so
  // they can be excluded from the ranking
  float Wqt = termWeight * wordLog;

  // Wdt = fdt * log (N / ft)
  float Wdt;

  mg_u_long termDataI = 0;
  mg_u_long termDataSize = termData.fragNums.size();
  mg_u_long levelDocNum = 0;

  mg_u_long termDocFreq = 0;
  mg_u_long lastLevelDocNum = 0;
  mg_u_long overallwordfreq = 0;

  mg_u_long resultI = 0;
  mg_u_long resultSize = result.docs.size();
  mg_u_long resultOutI = 0;

  while (termDataI < termDataSize) {
    if (indexData.levelConverter.FragToLevel (termData.fragNums[termDataI],
                                              levelDocNum)) {
      if (levelDocNum != lastLevelDocNum) {
        if (lastLevelDocNum > 0) {
          // add this document's information
          Wdt = termDocFreq * wordLog;

          // find this document number
          while (resultI < resultSize &&
                 result.docs[resultI] < lastLevelDocNum)
            ++resultI;

          // store the result
          if (resultI < resultSize &&
              result.docs[resultI] == lastLevelDocNum) {
            result.docs[resultOutI] = lastLevelDocNum;
            if (needRanks)
              result.ranks[resultOutI] = result.ranks[resultI] + Wqt * Wdt;
            ++resultI;
            ++resultOutI;
          }
        }
        lastLevelDocNum = levelDocNum;
        termDocFreq = 0;
      }
      if (needRanks)
        termDocFreq += termData.fragFreqs[termDataI];
      overallwordfreq += termData.fragFreqs[termDataI];
    }
    ++termDataI;
  } // while

  if (lastLevelDocNum > 0) {
    // add the last document's information
    Wdt = termDocFreq * wordLog;

    // find this document number
    while (resultI < resultSize &&
           result.docs[resultI] < lastLevelDocNum)
      ++resultI;

    // store the result
    if (resultI < resultSize &&
        result.docs[resultI] == lastLevelDocNum) {
      result.docs[resultOutI] = lastLevelDocNum;
      if (needRanks)
        result.ranks[resultOutI] = result.ranks[resultI] + Wqt * Wdt;
      ++resultI;
      ++resultOutI;
    }
  }

  // remove the unneeded entries
  result.docs.erase (result.docs.begin()+resultOutI, result.docs.end());
  if (needRanks)
    result.ranks.erase (result.ranks.begin()+resultOutI, result.ranks.end());
  else
    result.ranks.erase (result.ranks.begin(), result.ranks.end());

  // add the term frequency information
  if (queryInfo.needTermFreqs) {
    TermFreqData termFreqData;
    termFreqData.tag = tag;
    termFreqData.term = term;
    termFreqData.stemMethod = stemMethod;
    termFreqData.equivTerms = equivTerms;
    termFreqData.matchDocs = termData.matchDocs;
    termFreqData.termFreq = overallwordfreq;
    result.termFreqs.push_back (termFreqData);
  }
}

void RemoveUnwantedResults (IndexData &indexData,
                            const QueryInfo &queryInfo,
                            const FragData &termData,
                            QueryResult &result) {
  bool needRanks = (queryInfo.sortByRank || queryInfo.needRankInfo);

  mg_u_long termDataI = 0;
  mg_u_long termDataSize = termData.fragNums.size();
  mg_u_long levelDocNum = 0;

  mg_u_long lastLevelDocNum = 0;

  mg_u_long resultI = 0;
  mg_u_long resultSize = result.docs.size();
  mg_u_long resultOutI = 0;

  while (termDataI < termDataSize) {
    if (indexData.levelConverter.FragToLevel (termData.fragNums[termDataI],
                                              levelDocNum)) {
      if (levelDocNum != lastLevelDocNum) {
        if (lastLevelDocNum > 0) {
          // find this document number
          while (resultI < resultSize &&
                 result.docs[resultI] < lastLevelDocNum)
            ++resultI;

          // store the result
          if (resultI < resultSize &&
              result.docs[resultI] == lastLevelDocNum) {
            result.docs[resultOutI] = lastLevelDocNum;
            if (needRanks)
              result.ranks[resultOutI] = result.ranks[resultI];
            ++resultI;
            ++resultOutI;
          }
        }
        lastLevelDocNum = levelDocNum;
      }
    }
    ++termDataI;
  }

  if (lastLevelDocNum > 0) {
    // find this document number
    while (resultI < resultSize &&
           result.docs[resultI] < lastLevelDocNum)
      ++resultI;

    // store the result
    if (resultI < resultSize &&
        result.docs[resultI] == lastLevelDocNum) {
      result.docs[resultOutI] = lastLevelDocNum;
      if (needRanks)
        result.ranks[resultOutI] = result.ranks[resultI];
      ++resultI;
      ++resultOutI;
    }
  }

  // remove the unneeded entries
  result.docs.erase (result.docs.begin()+resultOutI, result.docs.end());
  if (needRanks)
    result.ranks.erase (result.ranks.begin()+resultOutI, result.ranks.end());
  else
    result.ranks.erase (result.ranks.begin(), result.ranks.end());
}
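// Taken together, evaluating one ranked query term proceeds roughly as
// follows (a sketch of the expected call sequence; the real driver
// lives in the query-evaluation code, and the variable names here are
// illustrative):
//
//   vector<mg_u_long> equivWords;
//   FindWordNumbers (indexData, term, stemMethod, equivWords);
//
//   FragData termData;  // the running OR of all equivalent words
//   vector<mg_u_long>::iterator w;
//   for (w = equivWords.begin(); w != equivWords.end(); ++w) {
//     FragData curData, mergedData;
//     UCArray termWord;
//     ReadTermFragData (indexData, needFragFreqs, *w, curData,
//                       fragLimits, termWord);
//     CombineFragData (needFragFreqs, curData, termData, mergedData);
//     termData = mergedData;
//   }
//
//   FragsToQueryResult (indexData, queryInfo, termData, tag, term,
//                       stemMethod, termWeight, equivTerms, result);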
//--------------------------------------------------------------
// functions to support full text browse

void FindNearestWordNumber (IndexData &indexData,
                            const UCArray &term,
                            mg_u_long &number) {
  // find the word number for this term
  mg_u_long wordElNum = 0;
  mg_u_long numLevels = indexData.bdh.num_levels;
  word_block_dict_el wordDictEl;
  wordDictEl.SetNumLevels (numLevels);
  if (NearestSearchWordBlockDictEl (indexData.dictFile, indexData.biWords,
                                    indexData.bdh.entries_per_wblk,
                                    indexData.bdh.word_dict_size,
                                    numLevels, term, wordDictEl, wordElNum))
    number = wordElNum;
}

void GetTermList (IndexData &indexData,
                  mg_u_long startTerm,
                  mg_u_long numTerms,
                  TermFreqArray &terms) {
  word_block_dict_el_array wordBlocks; // = new word_block_dict_el_array();
  TermFreqData termdata;

  terms.erase (terms.begin(), terms.end());

  SearchWordBlockDictElNumRange (indexData.dictFile, indexData.biWords,
                                 indexData.bdh.entries_per_wblk,
                                 indexData.bdh.word_dict_size,
                                 indexData.bdh.num_levels, startTerm,
                                 numTerms, wordBlocks);

  word_block_dict_el_array::iterator here = wordBlocks.begin();
  word_block_dict_el_array::iterator end = wordBlocks.end();

  while (here != end) {
    termdata.Clear();
    termdata.term = (*here).el;
    termdata.termFreq = (*here).freq;
    terms.push_back (termdata);
    ++here;
  }
}

void GetTermList (IndexData &indexData,
                  mg_u_long startTerm,
                  mg_u_long numTerms,
                  UCArrayVector &terms) {
  SearchWordBlockDictElNumRange (indexData.dictFile, indexData.biWords,
                                 indexData.bdh.entries_per_wblk,
                                 indexData.bdh.word_dict_size,
                                 indexData.bdh.num_levels, startTerm,
                                 numTerms, terms);
}
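// A typical full text browse request would locate the dictionary
// position of a user-supplied prefix and then pull out the terms that
// follow it, along the lines of (an illustrative sketch, not code
// from this package):
//
//   mg_u_long startTerm = 0;
//   FindNearestWordNumber (indexData, prefix, startTerm);
//   TermFreqArray terms;
//   GetTermList (indexData, startTerm, 20, terms);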