source: main/trunk/greenstone2/common-src/indexers/mgpp/text/Terms.cpp @ 29629

Last change on this file since 29629 was 29629, checked in by kjdon, 9 years ago

updating AndFragsToQueryResult to use actual num docs in weight generation

  • Property svn:keywords set to Author Date Id Revision
File size: 23.9 KB
/**************************************************************************
 *
 * Terms.cpp -- Query related functions
 * Copyright (C) 1999 Rodger McNab
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 **************************************************************************/

#include "Terms.h"
#include "words.h"
#include "stemmer.h"
#include "bitio_gen.h"
#include "bitio_m_stdio.h"

void QueryInfo::Clear () {
  UCArrayClear (docLevel);
  maxDocs = 0;
  sortByRank = true;
  exactWeights = false;
  needRankInfo = false;
  needTermFreqs = false;
}


void TermFreqData::Clear () {
  UCArrayClear (tag);
  UCArrayClear (term);
  equivTerms.erase(equivTerms.begin(), equivTerms.end());
  stemMethod = 0;
  matchDocs = 0;
  termFreq = 0;
}

ostream &operator<< (ostream &s, const TermFreqData &t) {
  s << "<" << t.tag << ">\"" << t.term << "\"stem("
    << t.stemMethod << ")equiv terms(";

  mg_u_long i;
  for (i=0; i<t.equivTerms.size(); ++i) {
    s << t.equivTerms[i] << ", ";
  }
  s << ")docs(" << t.matchDocs << ")"
    << "count(" << t.termFreq << ")";
  return s;
}

bool operator== (const TermFreqData &t1, const TermFreqData &t2) {
  return ((t1.tag == t2.tag) &&
          (t1.term == t2.term) &&
          (t1.stemMethod == t2.stemMethod) &&
          (t1.equivTerms == t2.equivTerms) &&
          (t1.matchDocs == t2.matchDocs) &&
          (t1.termFreq == t2.termFreq));
}


void QueryResult::Clear () {
  docs.erase (docs.begin(), docs.end());
  ranks.erase (ranks.begin(), ranks.end());
  termFreqs.erase (termFreqs.begin(), termFreqs.end());
  actualNumDocs = 0;
}

QueryResult::QueryResult () {
  Clear ();
}

void QueryResult::printShort (ostream &s) {
  s << "termFreqs: ";
  for (mg_u_long i=0; i<termFreqs.size(); ++i)
    s << termFreqs[i] << ", ";

  s << "\nactual number of docs found: " << actualNumDocs;
  s << "\n\n";
}


ostream &operator<< (ostream &s, const QueryResult &r) {
  s << "docs: ";
  mg_u_long i;
  for (i=0; i<r.docs.size(); ++i)
    s << r.docs[i] << ", ";

  s << "\nranks: ";
  for (i=0; i<r.ranks.size(); ++i)
    s << r.ranks[i] << ", ";

  s << "\ntermFreqs: ";
  for (i=0; i<r.termFreqs.size(); ++i)
    s << r.termFreqs[i] << ", ";

  s << "\nactual number of docs found: " << r.actualNumDocs;
  s << "\n\n";

  return s;
}


bool operator== (const QueryResult &r1, const QueryResult &r2) {
  return ((r1.docs == r2.docs) &&
          (r1.ranks == r2.ranks) &&
          (r1.termFreqs == r2.termFreqs) &&
          (r1.actualNumDocs == r2.actualNumDocs));
}

//---------------------------------------------------
// new ExtQueryResult stuff
void ExtQueryResult::Clear () {
  docs.erase (docs.begin(), docs.end());
  levels.erase (levels.begin(), levels.end());
  ranks.erase (ranks.begin(), ranks.end());
  termFreqs.erase (termFreqs.begin(), termFreqs.end());
  actualNumDocs = 0;
}

ExtQueryResult::ExtQueryResult () {
  Clear ();
}

ostream &operator<< (ostream &s, const ExtQueryResult &r) {
  s << "docs: ";
  mg_u_long i;
  for (i=0; i<r.docs.size(); ++i)
    s << r.docs[i] << ", ";

  s << "\nlevels: ";
  for (i=0; i<r.levels.size(); ++i)
    s << r.levels[i] << ", ";

  s << "\nranks: ";
  for (i=0; i<r.ranks.size(); ++i)
    s << r.ranks[i] << ", ";

  s << "\ntermFreqs: ";
  for (i=0; i<r.termFreqs.size(); ++i)
    s << r.termFreqs[i] << ", ";

  s << "\nactual number of docs found: " << r.actualNumDocs;
  s << "\n\n";

  return s;
}


bool operator== (const ExtQueryResult &r1, const ExtQueryResult &r2) {
  return ((r1.docs == r2.docs) &&
          (r1.levels == r2.levels) &&
          (r1.ranks == r2.ranks) &&
          (r1.termFreqs == r2.termFreqs) &&
          (r1.actualNumDocs == r2.actualNumDocs));
}

//-------------------------------------------------------
// new BrowseQueryResult stuff
void BrowseQueryResult::Clear () {
  termFreqs.erase (termFreqs.begin(), termFreqs.end());
}

BrowseQueryResult::BrowseQueryResult () {
  Clear ();
}


ostream &operator<< (ostream &s, const BrowseQueryResult &r) {
  s << "terms: ";
  mg_u_long i;
  for (i=0; i<r.termFreqs.size(); ++i)
    s << r.termFreqs[i] << ", ";
  s << "\n\n";
  return s;
}


bool operator== (const BrowseQueryResult &r1, const BrowseQueryResult &r2) {
  return (r1.termFreqs == r2.termFreqs);
}


//--------------------------------------
void FragData::Clear () {
  matchDocs = 0;
  fragNums.erase (fragNums.begin(), fragNums.end());
  fragFreqs.erase (fragFreqs.begin(), fragFreqs.end());
}

void FindWordNumbers (IndexData &indexData,
                      const UCArray &term,
                      mg_u_long stemMethod,
                      vector<mg_u_long> &equivWords) {
  equivWords.erase (equivWords.begin(), equivWords.end());

  if (!(stemMethod & STEM_PARTIAL_MATCH)) {
    // If we are not doing a partial match, make sure the stemMethod we have
    // specified is valid, i.e. the stem index has been built. If it is
    // invalid, set it to 0 (no stemming/casefolding/accentfolding).
    // If we are partial matching then we are not using the stem indexes,
    // so this doesn't matter.
    if (stemMethod > STEM_MAX) {
      cerr << "Stem method " << stemMethod << " is greater than maximum allowed ("
           << STEM_MAX << "). Not doing stemming\n";
      stemMethod = 0;
    }
    else if (stemMethod > 0 && indexData.stemFile[stemMethod-1] == NULL) {
      cerr << "Stem index for method " << stemMethod
           << " was not built, so not doing stemming\n";
      stemMethod = 0;
    }
  }
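
  // Editorial note (not part of the original source): stemMethod is treated
  // as a bit field in this file -- STEM_PARTIAL_MATCH and STEM_CaseFolding
  // are tested with &, while the remaining value selects which stem index
  // (indexData.stemFile[stemMethod-1]) to consult. The STEM_* constants are
  // defined in the stemmer headers, so only the usage pattern is shown here.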
  /* [JFG - Mar 06: Accent folding patch] */
  /* use flag PARTIAL_MATCH */
  if (stemMethod == 0 || (stemMethod & STEM_PARTIAL_MATCH)) {
    // don't need to stem the word,
    // find the word number(s) for this term
    mg_u_long wordElNum = 0;
    mg_u_long numLevels = indexData.bdh.num_levels;
    word_block_dict_el wordDictEl;
    wordDictEl.SetNumLevels (numLevels);
    if (stemMethod == 0) {
      if (SearchWordBlockDictEl (indexData.dictFile, indexData.biWords,
                                 indexData.bdh.entries_per_wblk,
                                 indexData.bdh.word_dict_size,
                                 numLevels, term, wordDictEl, wordElNum))
        equivWords.push_back (wordElNum);

      return;
    } else {
      // partial matching
      PartialMatchSearchWordBlockDictEl (indexData.dictFile, indexData.biWords,
                                         indexData.bdh.entries_per_wblk,
                                         indexData.bdh.word_dict_size,
                                         numLevels, term, wordDictEl, equivWords,
                                         (stemMethod & STEM_CaseFolding) ? true : false);
      // TODO: Accent Folding is not handled here!!
      return;
    }
  }

  // need to stem this word and find it in the blocked stem index
  unsigned char mgWord[MAXSTEMLEN + 1];
  UCArray stemTerm;
  mg_u_long stemmerNum = 0;

  stemmerNum = indexData.sih[stemMethod-1].stemmer_num;

  // convert the word to an "mg word"
  mgWord[0] = term.size();
  memcpy ((char *)&mgWord[1], &(term[0]), term.size());

  // stem the word
  mgpp_stemmer (stemMethod, stemmerNum, mgWord);
  // convert the result back to a UCArray
  stemTerm.insert (stemTerm.end(), &mgWord[1], &mgWord[1] + mgWord[0]);

  // need to look up this term in the appropriate dictionary
  stem_block_dict_el stemDictEl;
  mg_u_long stemElNum;
  bool result = false;

  /* [JFG - Mar 06: Accent folding patch] */
  result = SearchStemBlockDictEl (indexData.stemFile[stemMethod-1],
                                  indexData.sii[stemMethod-1],
                                  indexData.sih[stemMethod-1].entries_per_block,
                                  indexData.sih[stemMethod-1].dict_size,
                                  stemTerm,
                                  stemDictEl,
                                  stemElNum);

  if (result) {
    equivWords = stemDictEl.equivWords;
  }
}
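
// Editorial usage sketch (not part of the original source): a caller would
// typically expand a query term into word numbers and then fetch the
// fragment postings for each equivalent word, roughly:
//
//   vector<mg_u_long> equivWords;
//   FindWordNumbers (indexData, term, stemMethod, equivWords);
//   for (mg_u_long w = 0; w < equivWords.size(); ++w) {
//     FragData fragData; UCArray termWord;
//     ReadTermFragData (indexData, needFragFreqs, equivWords[w],
//                       fragData, fragLimits, termWord);
//     // ... merge the per-word postings with CombineFragData ...
//   }
//
// Names such as needFragFreqs and fragLimits stand for whatever the real
// caller (e.g. the MGQuery code) supplies; this is an illustration only.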


void ReadTermFragData (IndexData &indexData,
                       bool needFragFreqs,
                       mg_u_long termNum,
                       FragData &fragData,
                       FragRangeArray *fragLimits,
                       UCArray &termWord) {
  fragData.Clear();

  // look up the word in the dictionary
  mg_u_long numLevels = indexData.bdh.num_levels;
  word_block_dict_el wordDictEl;
  wordDictEl.SetNumLevels (numLevels);
  if (!SearchWordBlockDictElNum (indexData.dictFile,
                                 indexData.biWords,
                                 indexData.bdh.entries_per_wblk,
                                 indexData.bdh.word_dict_size,
                                 numLevels,
                                 termNum, wordDictEl))
    return; // nothing more to do

  fragData.matchDocs = wordDictEl.levelFreqs[indexData.curLevelNum];
  termWord = wordDictEl.el;
  // seek to the appropriate place in the inverted file
  fseek (indexData.invfFile, wordDictEl.invf_ptr, SEEK_SET);
  stdio_bitio_buffer buffer (indexData.invfFile);

  mg_u_long B = BIO_Bblock_Init (indexData.bdh.num_frags,
                                 wordDictEl.frag_occur);
  mg_u_long fragNum = 0;
  mg_u_long termFreq = 0;

  mg_u_long fragLimitI = 0;
  mg_u_long i;
  for (i=0; i<wordDictEl.frag_occur; ++i) {
    fragNum += buffer.bblock_decode (B, NULL);
    if (!indexData.ifh.word_level_index) termFreq = buffer.gamma_decode (NULL);
    else termFreq = 1;

    // get the right fragment range
    if (fragLimits != NULL) {
      while (fragLimitI+1 < (*fragLimits).size() &&
             fragNum > (*fragLimits)[fragLimitI+1].rangeStart) {
        ++fragLimitI;
      }
    }

    // add the entry if it is within the limits
    if ((fragLimits == NULL) ||
        (fragLimitI < (*fragLimits).size() &&
         fragNum > (*fragLimits)[fragLimitI].rangeStart &&
         fragNum <= (*fragLimits)[fragLimitI].rangeEnd)) {
      fragData.fragNums.push_back (fragNum);
      if (needFragFreqs)
        fragData.fragFreqs.push_back (termFreq);
    }
  }

  buffer.done();
}


// merge (OR) two fragment lists: both inputs are sorted by fragment number,
// and the output is their sorted union, with frequencies summed on ties
void CombineFragData (bool needFragFreqs,
                      const FragData &f1,
                      const FragData &f2,
                      FragData &outFragData) {
  outFragData.Clear();

  // the new number of matching documents is the maximum
  // of the two input matching number of documents -- it
  // is assumed that these are at the same document level
  outFragData.matchDocs = (f1.matchDocs > f2.matchDocs) ?
    f1.matchDocs : f2.matchDocs;

  mg_u_long f1I = 0, f1Size = f1.fragNums.size();
  mg_u_long f2I = 0, f2Size = f2.fragNums.size();

  while (f1I < f1Size || f2I < f2Size) {
    if (f2I < f2Size &&
        (f1I >= f1Size ||
         f1.fragNums[f1I] > f2.fragNums[f2I])) {
      // output f2I
      outFragData.fragNums.push_back (f2.fragNums[f2I]);
      if (needFragFreqs)
        outFragData.fragFreqs.push_back (f2.fragFreqs[f2I]);
      ++f2I;

    } else if (f1I < f1Size &&
               (f2I >= f2Size ||
                f1.fragNums[f1I] < f2.fragNums[f2I])) {
      // output f1I
      outFragData.fragNums.push_back (f1.fragNums[f1I]);
      if (needFragFreqs)
        outFragData.fragFreqs.push_back (f1.fragFreqs[f1I]);
      ++f1I;

    } else {
      // must be equal; combine f1I and f2I
      outFragData.fragNums.push_back (f1.fragNums[f1I]);
      if (needFragFreqs)
        outFragData.fragFreqs.push_back (f1.fragFreqs[f1I] + f2.fragFreqs[f2I]);
      ++f1I;
      ++f2I;
    }
  }
}
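
// Editorial worked example (not part of the original source): with
//   f1.fragNums = {2, 4}, f1.fragFreqs = {1, 3}
//   f2.fragNums = {4, 7}, f2.fragFreqs = {2, 5}
// CombineFragData produces
//   outFragData.fragNums  = {2, 4, 7}
//   outFragData.fragFreqs = {1, 5, 5}   // 5 = 3 + 2 on the tie at fragment 4
// and outFragData.matchDocs = max(f1.matchDocs, f2.matchDocs).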


void AndCombineFragData (bool needFragFreqs,
                         FragData &fragData,
                         const FragData &comFragData,
                         mg_s_long startRange,
                         mg_s_long endRange,
                         const FragRangeArray *fragLimits) {
  // sanity check on range
  if (startRange > endRange) {
    mg_s_long temp = endRange;
    endRange = startRange;
    startRange = temp;
  }

  // get min matchDocs
  if (comFragData.matchDocs < fragData.matchDocs)
    fragData.matchDocs = comFragData.matchDocs;

  mg_u_long fragDataI = 0;
  mg_u_long fragDataSize = fragData.fragNums.size();
  mg_u_long comFragDataI = 0;
  mg_u_long comFragDataSize = comFragData.fragNums.size();
  mg_u_long fragLimitI = 0;
  mg_u_long fragLimitSize = (fragLimits==NULL) ? 0 : (*fragLimits).size();
  mg_u_long outI = 0;

  while (fragDataI < fragDataSize &&
         comFragDataI < comFragDataSize) {
    mg_s_long fragNum = (mg_s_long)fragData.fragNums[fragDataI];
    mg_s_long comFragNum = (mg_s_long)comFragData.fragNums[comFragDataI];

    // go to the right fragment limit (for the com frag)
    if (fragLimits != NULL) {
      while (fragLimitI+1 < fragLimitSize &&
             comFragNum > (mg_s_long)(*fragLimits)[fragLimitI+1].rangeStart) {
        ++fragLimitI;
      }
    }

    if (fragNum <= comFragNum+startRange ||
        (fragLimits!=NULL &&
         fragNum<=(mg_s_long)(*fragLimits)[fragLimitI].rangeStart)) {
      ++fragDataI;

    } else if (fragNum > comFragNum+endRange ||
               (fragLimits!=NULL &&
                fragNum>(mg_s_long)(*fragLimits)[fragLimitI].rangeEnd)) {
      ++comFragDataI;

    } else {
      // equal and within tag
      fragData.fragNums[outI] = comFragNum;
      if (needFragFreqs) {
        fragData.fragFreqs[outI] =
          (fragData.fragFreqs[fragDataI] < comFragData.fragFreqs[comFragDataI]) ?
          fragData.fragFreqs[fragDataI] : comFragData.fragFreqs[comFragDataI];
      }
      ++fragDataI;
      ++comFragDataI;
      ++outI;
    }
  }

  // erase unused part of fragData
  fragData.fragNums.erase (fragData.fragNums.begin()+outI,
                           fragData.fragNums.end());
  if (needFragFreqs)
    fragData.fragFreqs.erase (fragData.fragFreqs.begin()+outI,
                              fragData.fragFreqs.end());
  else
    fragData.fragFreqs.erase (fragData.fragFreqs.begin(),
                              fragData.fragFreqs.end());
}
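
// Editorial note (not part of the original source): AndCombineFragData keeps,
// in place, only those entries of fragData that have a partner in comFragData
// inside the window (comFragNum+startRange, comFragNum+endRange], and that do
// not cross a fragment-range (tag) boundary from fragLimits. With
// startRange = -1 and endRange = 0 this reduces to a plain positional AND;
// wider windows presumably support proximity/phrase-style matching, the
// surviving frequency being the minimum of the two input frequencies.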


void FragsToQueryResult (IndexData &indexData,
                         const QueryInfo &queryInfo,
                         const FragData &termData,
                         const UCArray &tag,
                         const UCArray &term,
                         mg_u_long stemMethod,
                         mg_u_long termWeight,
                         UCArrayVector &equivTerms,
                         QueryResult &result) {
  bool needRanks = (queryInfo.sortByRank || queryInfo.needRankInfo);

  result.Clear();

  // log (N / ft)
  mg_u_long N = indexData.levels.levelInfo[indexData.curLevel].numEntries;
  // termData.matchDocs is not accurate -- it's just the largest docfreq out
  // of the list of equiv terms. We'll delay calculating ranks until after we
  // have worked out exactly how many docs we have.
  //float wordLog = log((double)N / (double)termData.matchDocs);

  // Wqt = fqt * log (N / ft)
  // note: terms are allowed to have a weight of zero so
  // they can be excluded from the ranking
  //float Wqt = termWeight * wordLog;

  // Wdt = fdt * log (N / ft)
  //float Wdt;
  mg_u_long actual_num_match_docs = 0;
  vector<mg_u_long> docFreqsArray;

  mg_u_long termDataI = 0;
  mg_u_long termDataSize = termData.fragNums.size();
  mg_u_long levelDocNum = 0;

  mg_u_long termDocFreq = 0;
  mg_u_long lastLevelDocNum = 0;
  mg_u_long overallwordfreq = 0;

  while (termDataI < termDataSize) {
    if (indexData.levelConverter.FragToLevel (termData.fragNums[termDataI],
                                              levelDocNum)) {
      if (levelDocNum != lastLevelDocNum) {
        if (lastLevelDocNum > 0) {
          // add this doc information
          if (needRanks) {
            //Wdt = termDocFreq * wordLog;
            //result.ranks.push_back (Wqt * Wdt);
            docFreqsArray.push_back(termDocFreq);
          }
          result.docs.push_back (lastLevelDocNum);
          ++actual_num_match_docs;
        }

        lastLevelDocNum = levelDocNum;
        termDocFreq = 0;
      }

      if (needRanks) {
        termDocFreq += termData.fragFreqs[termDataI];
        overallwordfreq += termData.fragFreqs[termDataI];
      }
    }
    ++termDataI;
  }

  if (lastLevelDocNum > 0) {
    // add the last document information
    if (needRanks) {
      //Wdt = termDocFreq * wordLog;
      //result.ranks.push_back (Wqt * Wdt);
      docFreqsArray.push_back(termDocFreq);
    }
    result.docs.push_back (lastLevelDocNum);
    ++actual_num_match_docs;
  }

  // Now that we know the actual number of docs containing this term,
  // we can calculate ranks
  float wordLog = log((double)N / (double)actual_num_match_docs);
  float Wqt = termWeight * wordLog;
  float factor = wordLog * Wqt;
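
  // Editorial note (not part of the original source): for each matching
  // document d, the rank pushed below is
  //   rank_d = fdt * factor = fdt * wordLog * Wqt = Wqt * Wdt
  // with Wqt = fqt * log(N/ft) and Wdt = fdt * log(N/ft), i.e. the usual
  // TF-IDF style product, using the actual number of matching docs as ft.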

  mg_u_long docFreqI = 0;
  mg_u_long docFreqSize = docFreqsArray.size();

  while (docFreqI < docFreqSize) {
    result.ranks.push_back(docFreqsArray[docFreqI]*factor);
    ++docFreqI;
  }

  // add the term frequency information
  if (queryInfo.needTermFreqs) {
    TermFreqData termFreqData;
    termFreqData.tag = tag;
    termFreqData.term = term;
    termFreqData.stemMethod = stemMethod;
    termFreqData.equivTerms = equivTerms;
    //termFreqData.matchDocs = termData.matchDocs;
    termFreqData.matchDocs = actual_num_match_docs;
    termFreqData.termFreq = overallwordfreq; // will be zero if ranks were not needed
    result.termFreqs.push_back (termFreqData);
  }
}

void AndFragsToQueryResult (IndexData &indexData,
                            const QueryInfo &queryInfo,
                            const FragData &termData,
                            const UCArray &tag,
                            const UCArray &term,
                            mg_u_long stemMethod,
                            mg_u_long termWeight,
                            UCArrayVector &equivTerms,
                            QueryResult &result) {
  bool needRanks = (queryInfo.sortByRank || queryInfo.needRankInfo);

  // log (N / ft)
  //float wordLog =
  //  log((double)indexData.levels.levelInfo[indexData.curLevel].numEntries/
  //      (double)termData.matchDocs);

  // Wqt = fqt * log (N / ft)
  // note: terms are allowed to have a weight of zero so
  // they can be excluded from the ranking
  //float Wqt = termWeight * wordLog;

  // Wdt = fdt * log (N / ft)
  float Wdt;

  mg_u_long termDataI = 0;
  mg_u_long termDataSize = termData.fragNums.size();
  mg_u_long levelDocNum = 0;

  mg_u_long termDocFreq = 0;
  mg_u_long lastLevelDocNum = 0;
  mg_u_long overallwordfreq = 0;
  mg_u_long resultI = 0;
  mg_u_long resultSize = result.docs.size();
  mg_u_long resultOutI = 0;

  mg_u_long actual_num_match_docs = 0;
  vector<mg_u_long> docFreqsArray;

  while (termDataI < termDataSize) {
    if (indexData.levelConverter.FragToLevel (termData.fragNums[termDataI],
                                              levelDocNum)) {
      if (levelDocNum != lastLevelDocNum) {
        if (lastLevelDocNum > 0) {
          ++actual_num_match_docs;

          //Wdt = termDocFreq * wordLog;

          // find this document number
          while (resultI < resultSize &&
                 result.docs[resultI] < lastLevelDocNum)
            ++resultI;

          // store the result
          if (resultI < resultSize && result.docs[resultI] == lastLevelDocNum) {
            result.docs[resultOutI] = lastLevelDocNum;
            if (needRanks) {
              // store the doc freq so we can calculate the rank for the new
              // term once we know the num docs
              docFreqsArray.push_back(termDocFreq);
              // just store the old rank for now; we'll add on the new bit at the end
              result.ranks[resultOutI] = result.ranks[resultI]; // + Wqt * Wdt;
            }
            ++resultI;
            ++resultOutI;
          }
        }

        lastLevelDocNum = levelDocNum;
        termDocFreq = 0;
      }

      if (needRanks) {
        termDocFreq += termData.fragFreqs[termDataI];
      }
      overallwordfreq += termData.fragFreqs[termDataI];
    }

    ++termDataI;
  } // while

  if (lastLevelDocNum > 0) {
    ++actual_num_match_docs;
    // add the last document information
    //Wdt = termDocFreq * wordLog;

    // find this document number
    while (resultI < resultSize &&
           result.docs[resultI] < lastLevelDocNum)
      ++resultI;

    // store the result
    if (resultI < resultSize && result.docs[resultI] == lastLevelDocNum) {
      result.docs[resultOutI] = lastLevelDocNum;
      if (needRanks) {
        // store the doc freq so we can calculate the rank for the new term
        // once we know the num docs
        docFreqsArray.push_back(termDocFreq);
        // just store the old rank for now; we'll add on the new bit at the end
        result.ranks[resultOutI] = result.ranks[resultI]; // + Wqt * Wdt;
      }
      ++resultI;
      ++resultOutI;
    }
  }

  // remove unneeded entries
  result.docs.erase (result.docs.begin()+resultOutI, result.docs.end());
  if (needRanks)
    result.ranks.erase (result.ranks.begin()+resultOutI, result.ranks.end());
  else
    result.ranks.erase (result.ranks.begin(), result.ranks.end());

  // Calculate correct ranks
  float wordLog = log((double)indexData.levels.levelInfo[indexData.curLevel].numEntries /
                      (double)actual_num_match_docs);
  float Wqt = termWeight * wordLog;
  float factor = wordLog * Wqt;
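
  // Editorial note (not part of the original source): the loop below adds
  // this term's contribution fdt * factor = Wqt * Wdt onto the rank carried
  // over from the earlier terms, mirroring the calculation at the end of
  // FragsToQueryResult but now using the actual number of matching docs
  // (see the change note at the top of this page).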

  mg_u_long docFreqI = 0;
  mg_u_long docFreqSize = docFreqsArray.size();

  while (docFreqI < docFreqSize) {
    result.ranks[docFreqI] = result.ranks[docFreqI] + docFreqsArray[docFreqI]*factor;
    ++docFreqI;
  }

  // add the term frequency information
  if (queryInfo.needTermFreqs) {
    TermFreqData termFreqData;
    termFreqData.tag = tag;
    termFreqData.term = term;
    termFreqData.stemMethod = stemMethod;
    termFreqData.equivTerms = equivTerms;
    //termFreqData.matchDocs = termData.matchDocs;
    termFreqData.matchDocs = actual_num_match_docs;
    termFreqData.termFreq = overallwordfreq;
    result.termFreqs.push_back (termFreqData);
  }
}


void RemoveUnwantedResults (IndexData &indexData,
                            const QueryInfo &queryInfo,
                            const FragData &termData,
                            QueryResult &result) {
  bool needRanks = (queryInfo.sortByRank || queryInfo.needRankInfo);

  mg_u_long termDataI = 0;
  mg_u_long termDataSize = termData.fragNums.size();
  mg_u_long levelDocNum = 0;

  mg_u_long lastLevelDocNum = 0;

  mg_u_long resultI = 0;
  mg_u_long resultSize = result.docs.size();
  mg_u_long resultOutI = 0;

  while (termDataI < termDataSize) {
    if (indexData.levelConverter.FragToLevel (termData.fragNums[termDataI],
                                              levelDocNum)) {
      if (levelDocNum != lastLevelDocNum) {
        if (lastLevelDocNum > 0) {
          // find this document number
          while (resultI < resultSize &&
                 result.docs[resultI] < lastLevelDocNum)
            ++resultI;

          // store the result
          if (resultI < resultSize && result.docs[resultI] == lastLevelDocNum) {
            result.docs[resultOutI] = lastLevelDocNum;
            if (needRanks)
              result.ranks[resultOutI] = result.ranks[resultI];
            ++resultI;
            ++resultOutI;
          }
        }

        lastLevelDocNum = levelDocNum;
      }
    }

    ++termDataI;
  }

  if (lastLevelDocNum > 0) {
    // find this document number
    while (resultI < resultSize &&
           result.docs[resultI] < lastLevelDocNum)
      ++resultI;

    // store the result
    if (resultI < resultSize && result.docs[resultI] == lastLevelDocNum) {
      result.docs[resultOutI] = lastLevelDocNum;
      if (needRanks)
        result.ranks[resultOutI] = result.ranks[resultI];
      ++resultI;
      ++resultOutI;
    }
  }

  // remove unneeded entries
  result.docs.erase (result.docs.begin()+resultOutI, result.docs.end());
  if (needRanks)
    result.ranks.erase (result.ranks.begin()+resultOutI, result.ranks.end());
  else
    result.ranks.erase (result.ranks.begin(), result.ranks.end());
}


//--------------------------------------------------------------
// functions to support full text browse

void FindNearestWordNumber (IndexData &indexData,
                            const UCArray &term,
                            mg_u_long &number) {
  // find the word number for this term
  mg_u_long wordElNum = 0;
  mg_u_long numLevels = indexData.bdh.num_levels;
  word_block_dict_el wordDictEl;
  wordDictEl.SetNumLevels (numLevels);
  if (NearestSearchWordBlockDictEl (indexData.dictFile, indexData.biWords,
                                    indexData.bdh.entries_per_wblk,
                                    indexData.bdh.word_dict_size,
                                    numLevels, term, wordDictEl, wordElNum))
    number = wordElNum;
}

void GetTermList (IndexData &indexData,
                  mg_u_long startTerm,
                  mg_u_long numTerms,
                  TermFreqArray &terms) {
  word_block_dict_el_array wordBlocks; // = new word_block_dict_el_array();
  TermFreqData termdata;

  terms.erase(terms.begin(), terms.end());

  SearchWordBlockDictElNumRange (indexData.dictFile, indexData.biWords,
                                 indexData.bdh.entries_per_wblk,
                                 indexData.bdh.word_dict_size,
                                 indexData.bdh.num_levels, startTerm,
                                 numTerms, wordBlocks);

  word_block_dict_el_array::iterator here = wordBlocks.begin();
  word_block_dict_el_array::iterator end = wordBlocks.end();

  while (here != end) {
    termdata.Clear();
    termdata.term = (*here).el;
    termdata.termFreq = (*here).freq;
    terms.push_back(termdata);
    ++here;
  }
}

void GetTermList (IndexData &indexData,
                  mg_u_long startTerm,
                  mg_u_long numTerms,
                  UCArrayVector &terms) {
  SearchWordBlockDictElNumRange (indexData.dictFile, indexData.biWords,
                                 indexData.bdh.entries_per_wblk,
                                 indexData.bdh.word_dict_size,
                                 indexData.bdh.num_levels, startTerm,
                                 numTerms, terms);
}