source: main/trunk/greenstone2/common-src/indexers/mgpp/text/Terms.cpp@ 29581

Last change on this file since 29581 was 29581, checked in by kjdon, 9 years ago

In gs2mgppdemo, a query for 'government' was coming back with totalMatchDocs 127, but the term info said 'government' was found in 108 docs. This is because, when generating the list of word nums for government, it looks up the equivalent terms (due to casefolding, stemming etc.) and there are two: government and Government. It gets the list of word positions for each one and merges the lists. When you get the list of word positions, you also get back the number of docs/secs that match the word: Government had 42 and government had 108. The merging code says that for total match docs we'll just take the larger number, i.e. 108. Later on, this figure is used as the total number of matching documents for the ranking calculation, and for the info in the query result.
I have added a new variable, actual_num_match_docs, which we increment as we go through the word position lists and generate doc/sec numbers. This is the point at which we actually know how many matches we have. For FragsToQueryResult, instead of calculating ranks as we generate each doc num, I am just storing the doc term freq; then, once we know the actual number, we can calculate the term weight and query term weight to generate the ranks. I still need to modify AndFragsToQueryResult similarly. It currently calculates actual_num_match_docs and uses it in the query result, but it doesn't yet use it for the rank generation.
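The delayed calculation described above amounts to a two-pass scheme: first collect the per-document term frequencies while counting the matching documents, then compute the weights from the true count. A minimal sketch of that second pass (simplified, self-contained types; DocFreq and RankDocs are illustrative names, not from this file):

#include <cmath>
#include <cstddef>
#include <vector>

struct DocFreq { unsigned long doc; unsigned long fdt; }; // fdt = term freq within doc

// Once ft (the actual number of matching docs) is known, each rank is
// Wqt * Wdt, where Wqt = fqt * log(N/ft) and Wdt = fdt * log(N/ft).
// Assumes at least one matching document.
void RankDocs (const std::vector<DocFreq> &matches, // pass-1 output: one entry per doc
               unsigned long N,                     // total docs/secs at this level
               unsigned long fqt,                   // query term weight
               std::vector<float> &ranks) {
  double wordLog = log ((double)N / (double)matches.size()); // ft = matches.size()
  double factor = (fqt * wordLog) * wordLog; // Wqt * log(N/ft); rank = fdt * factor
  for (std::size_t i = 0; i < matches.size(); ++i)
    ranks.push_back ((float)(matches[i].fdt * factor));
}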

  • Property svn:keywords set to Author Date Id Revision
File size: 22.9 KB
/**************************************************************************
 *
 * Terms.cpp -- Query related functions
 * Copyright (C) 1999 Rodger McNab
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 **************************************************************************/

#include "Terms.h"
#include "words.h"
#include "stemmer.h"
#include "bitio_gen.h"
#include "bitio_m_stdio.h"

void QueryInfo::Clear () {
  UCArrayClear (docLevel);
  maxDocs = 0;
  sortByRank = true;
  exactWeights = false;
  needRankInfo = false;
  needTermFreqs = false;
}


void TermFreqData::Clear () {
  UCArrayClear (tag);
  UCArrayClear (term);
  equivTerms.erase(equivTerms.begin(), equivTerms.end());
  stemMethod = 0;
  matchDocs = 0;
  termFreq = 0;
}

ostream &operator<< (ostream &s, const TermFreqData &t) {
  s << "<" << t.tag << ">\"" << t.term << "\"stem("
    << t.stemMethod << ")equiv terms(";

  mg_u_long i;
  for (i=0; i<t.equivTerms.size(); ++i) {
    s << t.equivTerms[i] << ", ";
  }
  s << ")docs(" << t.matchDocs << ")"
    << "count(" << t.termFreq << ")";
  return s;
}

bool operator== (const TermFreqData &t1, const TermFreqData &t2) {
  return ((t1.tag == t2.tag) &&
          (t1.term == t2.term) &&
          (t1.stemMethod == t2.stemMethod) &&
          (t1.equivTerms == t2.equivTerms) &&
          (t1.matchDocs == t2.matchDocs) &&
          (t1.termFreq == t2.termFreq));
}


void QueryResult::Clear () {
  docs.erase (docs.begin(), docs.end());
  ranks.erase (ranks.begin(), ranks.end());
  termFreqs.erase (termFreqs.begin(), termFreqs.end());
  actualNumDocs = 0;
}

QueryResult::QueryResult () {
  Clear ();
}

void QueryResult::printShort (ostream &s) {
  s << "termFreqs: ";
  for (mg_u_long i=0; i<termFreqs.size(); ++i)
    s << termFreqs[i] << ", ";

  s << "\nactual number of docs found: " << actualNumDocs;
  s << "\n\n";
}


ostream &operator<< (ostream &s, const QueryResult &r) {
  s << "docs: ";
  mg_u_long i;
  for (i=0; i<r.docs.size(); ++i)
    s << r.docs[i] << ", ";

  s << "\nranks: ";
  for (i=0; i<r.ranks.size(); ++i)
    s << r.ranks[i] << ", ";

  s << "\ntermFreqs: ";
  for (i=0; i<r.termFreqs.size(); ++i)
    s << r.termFreqs[i] << ", ";

  s << "\nactual number of docs found: " << r.actualNumDocs;
  s << "\n\n";

  return s;
}


bool operator== (const QueryResult &r1, const QueryResult &r2) {
  return ((r1.docs == r2.docs) &&
          (r1.ranks == r2.ranks) &&
          (r1.termFreqs == r2.termFreqs) &&
          (r1.actualNumDocs == r2.actualNumDocs));
}

//---------------------------------------------------
// new ExtQueryResult stuff
void ExtQueryResult::Clear () {
  docs.erase (docs.begin(), docs.end());
  levels.erase (levels.begin(), levels.end());
  ranks.erase (ranks.begin(), ranks.end());
  termFreqs.erase (termFreqs.begin(), termFreqs.end());
  actualNumDocs = 0;
}

ExtQueryResult::ExtQueryResult () {
  Clear ();
}

ostream &operator<< (ostream &s, const ExtQueryResult &r) {
  s << "docs: ";
  mg_u_long i;
  for (i=0; i<r.docs.size(); ++i)
    s << r.docs[i] << ", ";

  s << "\nlevels: ";
  for (i=0; i<r.levels.size(); ++i)
    s << r.levels[i] << ", ";

  s << "\nranks: ";
  for (i=0; i<r.ranks.size(); ++i)
    s << r.ranks[i] << ", ";

  s << "\ntermFreqs: ";
  for (i=0; i<r.termFreqs.size(); ++i)
    s << r.termFreqs[i] << ", ";

  s << "\nactual number of docs found: " << r.actualNumDocs;
  s << "\n\n";

  return s;
}


bool operator== (const ExtQueryResult &r1, const ExtQueryResult &r2) {
  return ((r1.docs == r2.docs) &&
          (r1.levels == r2.levels) &&
          (r1.ranks == r2.ranks) &&
          (r1.termFreqs == r2.termFreqs) &&
          (r1.actualNumDocs == r2.actualNumDocs));
}

//-------------------------------------------------------
// new BrowseQueryResult stuff
void BrowseQueryResult::Clear () {
  termFreqs.erase (termFreqs.begin(), termFreqs.end());
}

BrowseQueryResult::BrowseQueryResult () {
  Clear ();
}


ostream &operator<< (ostream &s, const BrowseQueryResult &r) {
  s << "terms: ";
  mg_u_long i;
  for (i=0; i<r.termFreqs.size(); ++i)
    s << r.termFreqs[i] << ", ";
  s << "\n\n";
  return s;
}


bool operator== (const BrowseQueryResult &r1, const BrowseQueryResult &r2) {
  return (r1.termFreqs == r2.termFreqs);
}


//--------------------------------------
void FragData::Clear () {
  matchDocs = 0;
  fragNums.erase (fragNums.begin(), fragNums.end());
  fragFreqs.erase (fragFreqs.begin(), fragFreqs.end());
}

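// FindWordNumbers -- map a query term to the word numbers of all its
// equivalent forms. With stemMethod == 0 the term itself is looked up in
// the word dictionary; with STEM_PARTIAL_MATCH set, the dictionary is
// scanned for partial matches; otherwise the term is stemmed and the
// appropriate stem index supplies the list of equivalent word numbers.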
void FindWordNumbers (IndexData &indexData,
                      const UCArray &term,
                      mg_u_long stemMethod,
                      vector<mg_u_long> &equivWords) {
  equivWords.erase (equivWords.begin(), equivWords.end());

  if (!(stemMethod & STEM_PARTIAL_MATCH)) {
    // If we are not doing a partial match then make sure the stemMethod we
    // have specified is valid, ie the stem index has been built. If invalid,
    // set it to 0, ie no stemming/casefolding/accentfolding. If we are
    // partial matching then we are not using the stem indexes, so this
    // doesn't matter.
    if (stemMethod > STEM_MAX) {
      cerr << "Stem method " << stemMethod
           << " is greater than maximum allowed (" << STEM_MAX
           << "). Not doing stemming\n";
      stemMethod = 0;
    }
    else if (stemMethod > 0 && indexData.stemFile[stemMethod-1] == NULL) {
      cerr << "Stem index for method " << stemMethod
           << " was not built, so not doing stemming\n";
      stemMethod = 0;
    }
  }
  /* [JFG - Mar 06: Accent folding patch] */
  /* use flag PARTIAL_MATCH */
  if (stemMethod == 0 || (stemMethod & STEM_PARTIAL_MATCH)) {
    // don't need to stem the word,
    // find the word number(s) for this term
    mg_u_long wordElNum = 0;
    mg_u_long numLevels = indexData.bdh.num_levels;
    word_block_dict_el wordDictEl;
    wordDictEl.SetNumLevels (numLevels);
    if (stemMethod == 0) {
      if (SearchWordBlockDictEl (indexData.dictFile, indexData.biWords,
                                 indexData.bdh.entries_per_wblk,
                                 indexData.bdh.word_dict_size,
                                 numLevels, term, wordDictEl, wordElNum))
        equivWords.push_back (wordElNum);

      return;
    } else {
      // partial matching,
      PartialMatchSearchWordBlockDictEl (indexData.dictFile, indexData.biWords,
                                         indexData.bdh.entries_per_wblk,
                                         indexData.bdh.word_dict_size,
                                         numLevels, term, wordDictEl, equivWords,
                                         (stemMethod & STEM_CaseFolding) ? true : false);
      // TODO: Accent Folding is not handled here!!
      return;
    }
  }

  // need to stem this word and find it in the blocked stem index
  unsigned char mgWord[MAXSTEMLEN + 1];
  UCArray stemTerm;
  mg_u_long stemmerNum = 0;

  stemmerNum = indexData.sih[stemMethod-1].stemmer_num;

  // convert the word to an "mg word"
  mgWord[0] = term.size();
  memcpy ((char *)&mgWord[1], &(term[0]), term.size());

  // stem the word
  mgpp_stemmer (stemMethod, stemmerNum, mgWord);
  // convert the result back to a UCArray
  stemTerm.insert (stemTerm.end(), &mgWord[1], &mgWord[1] + mgWord[0]);

  // need to look up this term in the appropriate dictionary
  stem_block_dict_el stemDictEl;
  mg_u_long stemElNum;
  bool result = false;

  /* [JFG - Mar 06: Accent folding patch] */
  result = SearchStemBlockDictEl (indexData.stemFile[stemMethod-1],
                                  indexData.sii[stemMethod-1],
                                  indexData.sih[stemMethod-1].entries_per_block,
                                  indexData.sih[stemMethod-1].dict_size,
                                  stemTerm,
                                  stemDictEl,
                                  stemElNum);

  if (result) {
    equivWords = stemDictEl.equivWords;
  }
}

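// ReadTermFragData -- decode the inverted-file entry for one word number,
// collecting the fragment numbers (and per-fragment frequencies, if
// needFragFreqs) that fall inside the given fragment limits. Note that
// matchDocs is taken from the dictionary entry's per-level frequency
// rather than counted here.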
void ReadTermFragData (IndexData &indexData,
                       bool needFragFreqs,
                       mg_u_long termNum,
                       FragData &fragData,
                       FragRangeArray *fragLimits,
                       UCArray &termWord) {
  fragData.Clear();

  // look up the word in the dictionary
  mg_u_long numLevels = indexData.bdh.num_levels;
  word_block_dict_el wordDictEl;
  wordDictEl.SetNumLevels (numLevels);
  if (!SearchWordBlockDictElNum (indexData.dictFile,
                                 indexData.biWords,
                                 indexData.bdh.entries_per_wblk,
                                 indexData.bdh.word_dict_size,
                                 numLevels,
                                 termNum, wordDictEl))
    return; // nothing more to do

  fragData.matchDocs = wordDictEl.levelFreqs[indexData.curLevelNum];
  termWord = wordDictEl.el;
  // seek to the appropriate place in the inverted file
  fseek (indexData.invfFile, wordDictEl.invf_ptr, SEEK_SET);
  stdio_bitio_buffer buffer (indexData.invfFile);

  mg_u_long B = BIO_Bblock_Init (indexData.bdh.num_frags,
                                 wordDictEl.frag_occur);
  mg_u_long fragNum = 0;
  mg_u_long termFreq = 0;

  mg_u_long fragLimitI = 0;
  mg_u_long i;
  for (i=0; i<wordDictEl.frag_occur; ++i) {
    fragNum += buffer.bblock_decode (B, NULL);
    if (!indexData.ifh.word_level_index) termFreq = buffer.gamma_decode (NULL);
    else termFreq = 1;

    // get the right fragment range
    if (fragLimits != NULL) {
      while (fragLimitI+1 < (*fragLimits).size() &&
             fragNum > (*fragLimits)[fragLimitI+1].rangeStart) {
        ++fragLimitI;
      }
    }

    // add the entry if it is within the limits
    if ((fragLimits == NULL) ||
        (fragLimitI < (*fragLimits).size() &&
         fragNum > (*fragLimits)[fragLimitI].rangeStart &&
         fragNum <= (*fragLimits)[fragLimitI].rangeEnd)) {
      fragData.fragNums.push_back (fragNum);
      if (needFragFreqs)
        fragData.fragFreqs.push_back (termFreq);
    }
  }

  buffer.done();
}

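// CombineFragData -- OR-merge two sorted fragment lists into outFragData.
// Note that matchDocs is only approximated here as the larger of the two
// inputs; the exact count of matching documents is established later, when
// the doc numbers are generated (see FragsToQueryResult below).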
void CombineFragData (bool needFragFreqs,
                      const FragData &f1,
                      const FragData &f2,
                      FragData &outFragData) {
  outFragData.Clear();

  // the new number of matching documents is the maximum
  // of the two input matching number of documents -- it
  // is assumed that these are at the same document level
  outFragData.matchDocs = (f1.matchDocs > f2.matchDocs) ?
    f1.matchDocs : f2.matchDocs;

  // do or
  mg_u_long f1I = 0, f1Size = f1.fragNums.size();
  mg_u_long f2I = 0, f2Size = f2.fragNums.size();

  while (f1I < f1Size || f2I < f2Size) {
    if (f2I < f2Size &&
        (f1I >= f1Size ||
         f1.fragNums[f1I] > f2.fragNums[f2I])) {
      // output f2I
      outFragData.fragNums.push_back (f2.fragNums[f2I]);
      if (needFragFreqs)
        outFragData.fragFreqs.push_back (f2.fragFreqs[f2I]);
      ++f2I;

    } else if (f1I < f1Size &&
               (f2I >= f2Size ||
                f1.fragNums[f1I] < f2.fragNums[f2I])) {
      // output f1I
      outFragData.fragNums.push_back (f1.fragNums[f1I]);
      if (needFragFreqs)
        outFragData.fragFreqs.push_back (f1.fragFreqs[f1I]);
      ++f1I;

    } else {
      // must be equal, combine f1I and f2I
      outFragData.fragNums.push_back (f1.fragNums[f1I]);
      if (needFragFreqs)
        outFragData.fragFreqs.push_back (f1.fragFreqs[f1I] + f2.fragFreqs[f2I]);
      ++f1I;
      ++f2I;
    }
  }
}

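// AndCombineFragData -- intersect fragData with comFragData in place: a
// fragment survives only if the two occurrences lie within the
// [startRange, endRange] window (and inside the current fragment limit,
// when limits are given). Surviving frequencies take the minimum of the
// pair; matchDocs takes the minimum of the two inputs.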
void AndCombineFragData (bool needFragFreqs,
                         FragData &fragData,
                         const FragData &comFragData,
                         mg_s_long startRange,
                         mg_s_long endRange,
                         const FragRangeArray *fragLimits) {
  // sanity check on range
  if (startRange > endRange) {
    mg_s_long temp = endRange;
    endRange = startRange;
    startRange = temp;
  }

  // get min matchdocs
  if (comFragData.matchDocs < fragData.matchDocs)
    fragData.matchDocs = comFragData.matchDocs;

  mg_u_long fragDataI = 0;
  mg_u_long fragDataSize = fragData.fragNums.size();
  mg_u_long comFragDataI = 0;
  mg_u_long comFragDataSize = comFragData.fragNums.size();
  mg_u_long fragLimitI = 0;
  mg_u_long fragLimitSize = (fragLimits == NULL) ? 0 : (*fragLimits).size();
  mg_u_long outI = 0;

  while (fragDataI < fragDataSize &&
         comFragDataI < comFragDataSize) {
    mg_s_long fragNum = (mg_s_long)fragData.fragNums[fragDataI];
    mg_s_long comFragNum = (mg_s_long)comFragData.fragNums[comFragDataI];

    // go to the right fragment limit (for the com frag)
    if (fragLimits != NULL) {
      while (fragLimitI+1 < fragLimitSize &&
             comFragNum > (mg_s_long)(*fragLimits)[fragLimitI+1].rangeStart) {
        ++fragLimitI;
      }
    }

    if (fragNum <= comFragNum+startRange ||
        (fragLimits != NULL &&
         fragNum <= (mg_s_long)(*fragLimits)[fragLimitI].rangeStart)) {
      ++fragDataI;

    } else if (fragNum > comFragNum+endRange ||
               (fragLimits != NULL &&
                fragNum > (mg_s_long)(*fragLimits)[fragLimitI].rangeEnd)) {
      ++comFragDataI;

    } else {
      // equal and within tag
      fragData.fragNums[outI] = comFragNum;
      if (needFragFreqs) {
        fragData.fragFreqs[outI] =
          (fragData.fragFreqs[fragDataI] < comFragData.fragFreqs[comFragDataI]) ?
          fragData.fragFreqs[fragDataI] : comFragData.fragFreqs[comFragDataI];
      }
      ++fragDataI;
      ++comFragDataI;
      ++outI;
    }
  }

  // erase unused part of fragData
  fragData.fragNums.erase (fragData.fragNums.begin()+outI,
                           fragData.fragNums.end());
  if (needFragFreqs)
    fragData.fragFreqs.erase (fragData.fragFreqs.begin()+outI,
                              fragData.fragFreqs.end());
  else
    fragData.fragFreqs.erase (fragData.fragFreqs.begin(),
                              fragData.fragFreqs.end());
}

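// FragsToQueryResult -- turn one term's fragment list into document-level
// results. Since termData.matchDocs may overstate ft, the rank calculation
// is deferred: per-document term frequencies are buffered in docFreqsArray
// until actual_num_match_docs is known, and only then are Wqt and the
// ranks computed.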
void FragsToQueryResult (IndexData &indexData,
                         const QueryInfo &queryInfo,
                         const FragData &termData,
                         const UCArray &tag,
                         const UCArray &term,
                         mg_u_long stemMethod,
                         mg_u_long termWeight,
                         UCArrayVector &equivTerms,
                         QueryResult &result) {
  bool needRanks = (queryInfo.sortByRank || queryInfo.needRankInfo);

  result.Clear();

  // log (N / ft)
  mg_u_long N = indexData.levels.levelInfo[indexData.curLevel].numEntries;
  // termData.matchDocs is not accurate -- it's just the largest doc freq out
  // of the list of equiv terms. We delay calculating ranks until after we
  // have worked out exactly how many docs we have.
  //float wordLog = log((double)N / (double)termData.matchDocs);

  // Wqt = fqt * log (N / ft)
  // note: terms are allowed to have a weight of zero so
  // they can be excluded from the ranking
  //float Wqt = termWeight * wordLog;

  // Wdt = fdt * log (N / ft)
  //float Wdt;

  mg_u_long actual_num_match_docs = 0;
  vector<mg_u_long> docFreqsArray;

  mg_u_long termDataI = 0;
  mg_u_long termDataSize = termData.fragNums.size();
  mg_u_long levelDocNum = 0;

  mg_u_long termDocFreq = 0;
  mg_u_long lastLevelDocNum = 0;
  mg_u_long overallwordfreq = 0;

  while (termDataI < termDataSize) {
    if (indexData.levelConverter.FragToLevel (termData.fragNums[termDataI],
                                              levelDocNum)) {
      if (levelDocNum != lastLevelDocNum) {
        if (lastLevelDocNum > 0) {
          // add this doc information
          if (needRanks) {
            //Wdt = termDocFreq * wordLog;
            //result.ranks.push_back (Wqt * Wdt);
            docFreqsArray.push_back(termDocFreq);
          }
          result.docs.push_back (lastLevelDocNum);
          ++actual_num_match_docs;
        }

        lastLevelDocNum = levelDocNum;
        termDocFreq = 0;
      }

      if (needRanks) {
        termDocFreq += termData.fragFreqs[termDataI];
        overallwordfreq += termData.fragFreqs[termDataI];
      }
    }
    ++termDataI;
  }

  if (lastLevelDocNum > 0) {
    // add the last document information
    if (needRanks) {
      //Wdt = termDocFreq * wordLog;
      //result.ranks.push_back (Wqt * Wdt);
      docFreqsArray.push_back(termDocFreq);
    }
    result.docs.push_back (lastLevelDocNum);
    ++actual_num_match_docs;
  }

  // Now that we know the actual number of docs containing this term,
  // we can calculate the ranks
  float wordLog = log((double)N / (double)actual_num_match_docs);
  float Wqt = termWeight * wordLog;
  float factor = wordLog * Wqt;

  mg_u_long docFreqI = 0;
  mg_u_long docFreqSize = docFreqsArray.size();

  while (docFreqI < docFreqSize) {
    result.ranks.push_back(docFreqsArray[docFreqI] * factor);
    ++docFreqI;
  }

  // add the term frequency information
  if (queryInfo.needTermFreqs) {
    TermFreqData termFreqData;
    termFreqData.tag = tag;
    termFreqData.term = term;
    termFreqData.stemMethod = stemMethod;
    termFreqData.equivTerms = equivTerms;
    //termFreqData.matchDocs = termData.matchDocs;
    termFreqData.matchDocs = actual_num_match_docs;
    termFreqData.termFreq = overallwordfreq; // will be zero if needRanks is not true
    result.termFreqs.push_back (termFreqData);
  }
}

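// AndFragsToQueryResult -- AND one term's fragment list into an existing
// result set. actual_num_term_match_docs is counted and reported in the
// term frequency info, but (as noted in the change log above) the rank
// contribution still derives wordLog from the approximate
// termData.matchDocs; this function has not yet been converted to the
// delayed rank calculation.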
void AndFragsToQueryResult (IndexData &indexData,
                            const QueryInfo &queryInfo,
                            const FragData &termData,
                            const UCArray &tag,
                            const UCArray &term,
                            mg_u_long stemMethod,
                            mg_u_long termWeight,
                            UCArrayVector &equivTerms,
                            QueryResult &result) {
  bool needRanks = (queryInfo.sortByRank || queryInfo.needRankInfo);

  // log (N / ft)
  float wordLog =
    log((double)indexData.levels.levelInfo[indexData.curLevel].numEntries /
        (double)termData.matchDocs);

  // Wqt = fqt * log (N / ft)
  // note: terms are allowed to have a weight of zero so
  // they can be excluded from the ranking
  float Wqt = termWeight * wordLog;

  // Wdt = fdt * log (N / ft)
  float Wdt;

  mg_u_long termDataI = 0;
  mg_u_long termDataSize = termData.fragNums.size();
  mg_u_long levelDocNum = 0;

  mg_u_long termDocFreq = 0;
  mg_u_long lastLevelDocNum = 0;
  mg_u_long overallwordfreq = 0;
  mg_u_long resultI = 0;
  mg_u_long resultSize = result.docs.size();
  mg_u_long resultOutI = 0;

  mg_u_long actual_num_term_match_docs = 0;

  while (termDataI < termDataSize) {
    if (indexData.levelConverter.FragToLevel (termData.fragNums[termDataI],
                                              levelDocNum)) {
      if (levelDocNum != lastLevelDocNum) {
        if (lastLevelDocNum > 0) {
          ++actual_num_term_match_docs;

          Wdt = termDocFreq * wordLog;

          // find this document number
          while (resultI < resultSize &&
                 result.docs[resultI] < lastLevelDocNum)
            ++resultI;

          // store the result
          if (resultI < resultSize && result.docs[resultI] == lastLevelDocNum) {
            result.docs[resultOutI] = lastLevelDocNum;
            if (needRanks)
              result.ranks[resultOutI] = result.ranks[resultI] + Wqt * Wdt;
            ++resultI;
            ++resultOutI;
          }
        }

        lastLevelDocNum = levelDocNum;
        termDocFreq = 0;
      }

      if (needRanks)
        termDocFreq += termData.fragFreqs[termDataI];
      overallwordfreq += termData.fragFreqs[termDataI];
    }

    ++termDataI;
  } // while

  if (lastLevelDocNum > 0) {
    ++actual_num_term_match_docs;
    // add the last document information
    Wdt = termDocFreq * wordLog;

    // find this document number
    while (resultI < resultSize &&
           result.docs[resultI] < lastLevelDocNum)
      ++resultI;

    // store the result
    if (resultI < resultSize && result.docs[resultI] == lastLevelDocNum) {
      result.docs[resultOutI] = lastLevelDocNum;
      if (needRanks)
        result.ranks[resultOutI] = result.ranks[resultI] + Wqt * Wdt;
      ++resultI;
      ++resultOutI;
    }
  }

  // remove unneeded entries
  result.docs.erase (result.docs.begin()+resultOutI, result.docs.end());
  if (needRanks)
    result.ranks.erase (result.ranks.begin()+resultOutI, result.ranks.end());
  else
    result.ranks.erase (result.ranks.begin(), result.ranks.end());

  // add the term frequency information
  if (queryInfo.needTermFreqs) {
    TermFreqData termFreqData;
    termFreqData.tag = tag;
    termFreqData.term = term;
    termFreqData.stemMethod = stemMethod;
    termFreqData.equivTerms = equivTerms;
    //termFreqData.matchDocs = termData.matchDocs;
    termFreqData.matchDocs = actual_num_term_match_docs;
    termFreqData.termFreq = overallwordfreq;
    result.termFreqs.push_back (termFreqData);
  }
}

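// RemoveUnwantedResults -- filter an existing result set down to the
// documents that also contain this term's fragments, preserving the ranks
// already computed for the documents that survive.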
void RemoveUnwantedResults (IndexData &indexData,
                            const QueryInfo &queryInfo,
                            const FragData &termData,
                            QueryResult &result) {
  bool needRanks = (queryInfo.sortByRank || queryInfo.needRankInfo);

  mg_u_long termDataI = 0;
  mg_u_long termDataSize = termData.fragNums.size();
  mg_u_long levelDocNum = 0;

  mg_u_long lastLevelDocNum = 0;

  mg_u_long resultI = 0;
  mg_u_long resultSize = result.docs.size();
  mg_u_long resultOutI = 0;

  while (termDataI < termDataSize) {
    if (indexData.levelConverter.FragToLevel (termData.fragNums[termDataI],
                                              levelDocNum)) {
      if (levelDocNum != lastLevelDocNum) {
        if (lastLevelDocNum > 0) {
          // find this document number
          while (resultI < resultSize &&
                 result.docs[resultI] < lastLevelDocNum)
            ++resultI;

          // store the result
          if (resultI < resultSize && result.docs[resultI] == lastLevelDocNum) {
            result.docs[resultOutI] = lastLevelDocNum;
            if (needRanks)
              result.ranks[resultOutI] = result.ranks[resultI];
            ++resultI;
            ++resultOutI;
          }
        }

        lastLevelDocNum = levelDocNum;
      }
    }

    ++termDataI;
  }

  if (lastLevelDocNum > 0) {
    // find this document number
    while (resultI < resultSize &&
           result.docs[resultI] < lastLevelDocNum)
      ++resultI;

    // store the result
    if (resultI < resultSize && result.docs[resultI] == lastLevelDocNum) {
      result.docs[resultOutI] = lastLevelDocNum;
      if (needRanks)
        result.ranks[resultOutI] = result.ranks[resultI];
      ++resultI;
      ++resultOutI;
    }
  }

  // remove unneeded entries
  result.docs.erase (result.docs.begin()+resultOutI, result.docs.end());
  if (needRanks)
    result.ranks.erase (result.ranks.begin()+resultOutI, result.ranks.end());
  else
    result.ranks.erase (result.ranks.begin(), result.ranks.end());
}


//--------------------------------------------------------------
// functions to support full text browse

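// FindNearestWordNumber -- browse support: set number to the word number
// of the dictionary entry nearest to term; number is left untouched if
// the search fails.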
void FindNearestWordNumber (IndexData &indexData,
                            const UCArray &term,
                            mg_u_long &number) {
  // find the word number for this term
  mg_u_long wordElNum = 0;
  mg_u_long numLevels = indexData.bdh.num_levels;
  word_block_dict_el wordDictEl;
  wordDictEl.SetNumLevels (numLevels);
  if (NearestSearchWordBlockDictEl (indexData.dictFile, indexData.biWords,
                                    indexData.bdh.entries_per_wblk,
                                    indexData.bdh.word_dict_size,
                                    numLevels, term, wordDictEl, wordElNum))
    number = wordElNum;
}

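// GetTermList (two overloads) -- fetch numTerms dictionary entries
// starting at startTerm, either as TermFreqData records (term text plus
// collection frequency) or as the bare term strings.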
void GetTermList (IndexData &indexData,
                  mg_u_long startTerm,
                  mg_u_long numTerms,
                  TermFreqArray &terms) {
  word_block_dict_el_array wordBlocks; // = new word_block_dict_el_array();
  TermFreqData termdata;

  terms.erase(terms.begin(), terms.end());

  SearchWordBlockDictElNumRange (indexData.dictFile, indexData.biWords,
                                 indexData.bdh.entries_per_wblk,
                                 indexData.bdh.word_dict_size,
                                 indexData.bdh.num_levels, startTerm,
                                 numTerms, wordBlocks);

  word_block_dict_el_array::iterator here = wordBlocks.begin();
  word_block_dict_el_array::iterator end = wordBlocks.end();

  while (here != end) {
    termdata.Clear();
    termdata.term = (*here).el;
    termdata.termFreq = (*here).freq;
    terms.push_back(termdata);
    ++here;
  }
}

void GetTermList (IndexData &indexData,
                  mg_u_long startTerm,
                  mg_u_long numTerms,
                  UCArrayVector &terms) {
  SearchWordBlockDictElNumRange (indexData.dictFile, indexData.biWords,
                                 indexData.bdh.entries_per_wblk,
                                 indexData.bdh.word_dict_size,
                                 indexData.bdh.num_levels, startTerm,
                                 numTerms, terms);
}