source: trunk/gsdl/src/mgpp/text/Terms.cpp@ 1836

Last change on this file since 1836 was 1836, checked in by kjm18, 23 years ago

added support for equiv terms for highlighting. The QueryResult.TermFreqData
has UCArrayVector equivTerms now.

  • Property svn:keywords set to Author Date Id Revision
File size: 21.3 KB
Line 
1/**************************************************************************
2 *
3 * Terms.cpp -- Query related functions
4 * Copyright (C) 1999 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: Terms.cpp 1836 2001-01-14 23:56:57Z kjm18 $
21 *
22 **************************************************************************/
23
#include "Terms.h"
#include "words.h"
#include "stemmer.h"
#include "bitio_gen.h"
#include "bitio_m_stdio.h"

#include <cstring>  // memcpy
29
30void QueryInfo::Clear () {
31 UCArrayClear (docLevel);
32 maxDocs = 0;
33 sortByRank = true;
34 exactWeights = false;
35 needRankInfo = false;
36 needTermFreqs = false;
37}
38
39
40
41void TermFreqData::Clear () {
42 UCArrayClear (tag);
43 UCArrayClear (term);
44 equivTerms.erase(equivTerms.begin(), equivTerms.end());
45 stemMethod = 0;
46 matchDocs = 0;
47 termFreq = 0;
48}
49
50ostream &operator<< (ostream &s, const TermFreqData &t) {
51 s << "<" << t.tag << ">\"" << t.term << "\"stem("
52 << t.stemMethod << ")equiv terms(";
53
54 unsigned long i;
55 for (i=0; i<t.equivTerms.size(); i++) {
56 s << t.equivTerms[i] << ", ";
57 }
58 s <<")docs(" << t.matchDocs << ")"
59 << "count("<<t.termFreq<<")";
60 return s;
61}
62
63bool operator== (const TermFreqData &t1, const TermFreqData &t2) {
64 return ((t1.tag == t2.tag) &&
65 (t1.term == t2.term) &&
66 (t1.stemMethod == t2.stemMethod) &&
67 (t1.equivTerms == t2.equivTerms) &&
68 (t1.matchDocs == t2.matchDocs) &&
69 (t1.termFreq == t2.termFreq));
70}
71
72
73void QueryResult::Clear () {
74 docs.erase (docs.begin(), docs.end());
75 ranks.erase (ranks.begin(), ranks.end());
76 termFreqs.erase (termFreqs.begin(), termFreqs.end());
77 actualNumDocs = 0;
78}
79
// Construct an empty result set; all containers start empty and
// actualNumDocs starts at zero.
QueryResult::QueryResult () {
  Clear ();
}
83
84
85
86ostream &operator<< (ostream &s, const QueryResult &r) {
87 s << "docs: ";
88 unsigned long i;
89 for (i=0; i<r.docs.size(); i++)
90 s << r.docs[i] << ", ";
91
92 s << "\nranks: ";
93 for (i=0; i<r.ranks.size(); i++)
94 s << r.ranks[i] << ", ";
95
96 s << "\ntermFreqs: ";
97 for (i=0; i<r.termFreqs.size(); i++)
98 s << r.termFreqs[i] << ", ";
99
100 s << "\nactual number of docs found: " << r.actualNumDocs;
101 s << "\n\n";
102
103 return s;
104}
105
106
107bool operator== (const QueryResult &r1, const QueryResult &r2) {
108 return ((r1.docs == r2.docs) &&
109 (r1.ranks == r2.ranks) &&
110 (r1.termFreqs == r2.termFreqs) &&
111 (r1.actualNumDocs == r2.actualNumDocs));
112}
113
114//---------------------------------------------------
115// new ExtQueryResult stuff
116void ExtQueryResult::Clear () {
117 docs.erase (docs.begin(), docs.end());
118 levels.erase (levels.begin(), levels.end());
119 ranks.erase (ranks.begin(), ranks.end());
120 termFreqs.erase (termFreqs.begin(), termFreqs.end());
121 actualNumDocs = 0;
122}
123
// Construct an empty extended result set; all containers start empty
// and actualNumDocs starts at zero.
ExtQueryResult::ExtQueryResult () {
  Clear ();
}
127
128ostream &operator<< (ostream &s, const ExtQueryResult &r) {
129 s << "docs: ";
130 unsigned long i;
131 for (i=0; i<r.docs.size(); i++)
132 s << r.docs[i] << ", ";
133
134 s << "\nlevels: ";
135 for (i=0; i<r.levels.size(); i++)
136 s << r.levels[i] << ", ";
137
138
139 s << "\nranks: ";
140 for (i=0; i<r.ranks.size(); i++)
141 s << r.ranks[i] << ", ";
142
143 s << "\ntermFreqs: ";
144 for (i=0; i<r.termFreqs.size(); i++)
145 s << r.termFreqs[i] << ", ";
146 s << "\nactual number of docs found: " << r.actualNumDocs;
147 s << "\n\n";
148
149 return s;
150}
151
152
153bool operator== (const ExtQueryResult &r1, const ExtQueryResult &r2) {
154 return ((r1.docs == r2.docs) &&
155 (r1.levels == r2.levels) &&
156 (r1.ranks == r2.ranks) &&
157 (r1.termFreqs == r2.termFreqs) &&
158 (r1.actualNumDocs == r2.actualNumDocs));
159}
160
161//-------------------------------------------------------
162// new BrowseQueryResult stuff
163void BrowseQueryResult::Clear () {
164 termFreqs.erase (termFreqs.begin(), termFreqs.end());
165}
166
// Construct an empty browse result set.
BrowseQueryResult::BrowseQueryResult () {
  Clear ();
}
170
171
172
173ostream &operator<< (ostream &s, const BrowseQueryResult &r) {
174 s << "terms: ";
175 unsigned long i;
176 for (i=0; i<r.termFreqs.size(); i++)
177 s << r.termFreqs[i] << ", ";
178 s << "\n\n";
179 return s;
180}
181
182
183bool operator== (const BrowseQueryResult &r1, const BrowseQueryResult &r2) {
184 return ((r1.termFreqs == r2.termFreqs));
185
186}
187
188
189
190
191//--------------------------------------
192void FragData::Clear () {
193 matchDocs = 0;
194 fragNums.erase (fragNums.begin(), fragNums.end());
195 fragFreqs.erase (fragFreqs.begin(), fragFreqs.end());
196}
197
198
199
200
201void FindWordNumbers (IndexData &indexData,
202 const UCArray &term,
203 unsigned long stemMethod,
204 vector<unsigned long> &equivWords) {
205 equivWords.erase (equivWords.begin(), equivWords.end());
206
207 if (stemMethod == 0) {
208 // don't need to stem the word,
209 // find the word number for this term
210 unsigned long wordElNum = 0;
211 unsigned long numLevels = indexData.bdh.num_levels;
212 word_block_dict_el wordDictEl;
213 wordDictEl.SetNumLevels (numLevels);
214 if (SearchWordBlockDictEl (indexData.dictFile, indexData.biWords,
215 indexData.bdh.entries_per_wblk,
216 indexData.bdh.word_dict_size,
217 numLevels, term, wordDictEl, wordElNum))
218 equivWords.push_back (wordElNum);
219
220 return;
221
222 }
223
224
225 // need to stem this word and find it in the blocked stem index
226
227 unsigned char mgWord[MAXSTEMLEN + 1];
228 UCArray stemTerm;
229 unsigned long stemmerNum = 0;
230
231 if (stemMethod == 1) stemmerNum = indexData.sih1.stemmer_num;
232 else if (stemMethod == 2) stemmerNum = indexData.sih2.stemmer_num;
233 else if (stemMethod == 3) stemmerNum = indexData.sih3.stemmer_num;
234
235
236 // convert the word to an "mg word"
237 mgWord[0] = term.size();
238 bcopy ((char *)term.begin(), (char *)&mgWord[1], term.size());
239
240 // stem the word
241 stemmer (stemMethod, stemmerNum, mgWord);
242
243 // convert the result back to a UCArray
244 stemTerm.insert (stemTerm.end(), &mgWord[1], &mgWord[1] + mgWord[0]);
245
246 // need to look up this term in the appropriate dictionary
247 stem_block_dict_el stemDictEl;
248 unsigned long stemElNum;
249 bool result = false;
250 if (stemMethod == 1) {
251 result = SearchStemBlockDictEl (indexData.stem1File,
252 indexData.sii1,
253 indexData.sih1.entries_per_block,
254 indexData.sih1.dict_size,
255 stemTerm,
256 stemDictEl,
257 stemElNum);
258
259 } else if (stemMethod == 2) {
260 result = SearchStemBlockDictEl (indexData.stem2File,
261 indexData.sii2,
262 indexData.sih2.entries_per_block,
263 indexData.sih2.dict_size,
264 stemTerm,
265 stemDictEl,
266 stemElNum);
267
268 } else if (stemMethod == 3) {
269 result = SearchStemBlockDictEl (indexData.stem3File,
270 indexData.sii3,
271 indexData.sih3.entries_per_block,
272 indexData.sih3.dict_size,
273 stemTerm,
274 stemDictEl,
275 stemElNum);
276 }
277
278 if (result) {
279 equivWords = stemDictEl.equivWords;
280 }
281}
282
283
284
/// Read the inverted-file entry for word number `termNum` and fill
/// `fragData` with the fragment numbers where the word occurs (and,
/// when `needFragFreqs` is set, the within-fragment frequencies).
/// `termWord` receives the dictionary text of the word.  When
/// `fragLimits` is non-NULL only occurrences falling inside one of
/// the given fragment ranges are kept.  On dictionary-lookup failure
/// `fragData` is simply left cleared.
void ReadTermFragData (IndexData &indexData,
		       bool needFragFreqs,
		       unsigned long termNum,
		       FragData &fragData,
		       FragRangeArray *fragLimits,
		       UCArray & termWord) {
  fragData.Clear();

  // look up the word in the dictionary
  unsigned long numLevels = indexData.bdh.num_levels;
  word_block_dict_el wordDictEl;
  wordDictEl.SetNumLevels (numLevels);
  if (!SearchWordBlockDictElNum (indexData.dictFile,
				 indexData.biWords,
				 indexData.bdh.entries_per_wblk,
				 indexData.bdh.word_dict_size,
				 numLevels,
				 termNum, wordDictEl))
    return; // nothing more to do

  // matching-document count for the current query level
  fragData.matchDocs = wordDictEl.levelFreqs[indexData.curLevelNum];
  termWord = wordDictEl.el;
  // seek to the appropriate place in the inverted file
  fseek (indexData.invfFile, wordDictEl.invf_ptr, SEEK_SET);
  stdio_bitio_buffer buffer (indexData.invfFile);

  // Bblock decoding parameter derived from total fragments vs. the
  // number of fragments this word occurs in
  unsigned long B = BIO_Bblock_Init (indexData.bdh.num_frags,
				     wordDictEl.frag_occur);
  unsigned long fragNum = 0;
  unsigned long termFreq = 0;

  unsigned long fragLimitI = 0;
  unsigned long i;
  for (i=0; i<wordDictEl.frag_occur; i++) {
    // fragment numbers are delta-coded: each decode is added to the
    // previous fragment number
    fragNum += buffer.bblock_decode (B, NULL);
    // a word-level index stores one entry per occurrence (freq == 1);
    // otherwise the frequency is gamma-coded after the fragment gap
    if (!indexData.ifh.word_level_index) termFreq = buffer.gamma_decode (NULL);
    else termFreq = 1;

    // get the right fragment range
    if (fragLimits != NULL) {
      while (fragLimitI+1 < (*fragLimits).size() &&
	     fragNum > (*fragLimits)[fragLimitI+1].rangeStart) {
	fragLimitI++;
      }
    }

    // add the entry if it is within the limits
    if ((fragLimits == NULL) ||
	(fragLimitI < (*fragLimits).size() &&
	 fragNum > (*fragLimits)[fragLimitI].rangeStart &&
	 fragNum <= (*fragLimits)[fragLimitI].rangeEnd)) {
      fragData.fragNums.push_back (fragNum);
      if (needFragFreqs)
	fragData.fragFreqs.push_back (termFreq);
    }
  }

  buffer.done();
}
344
345
/// OR-combine two fragment lists: `outFragData` receives the sorted
/// union of the fragment numbers in `f1` and `f2` (a classic
/// two-pointer merge; both inputs are assumed sorted ascending).
/// A fragment present in both inputs appears once with its
/// frequencies summed.  Frequencies are only produced when
/// `needFragFreqs` is set.
void CombineFragData (bool needFragFreqs,
		      const FragData &f1,
		      const FragData &f2,
		      FragData &outFragData) {
  outFragData.Clear();

  // the new number of matching documents is the maximum
  // of the two input matching number of documents -- it
  // is assumed that these are at the same document level
  outFragData.matchDocs = (f1.matchDocs > f2.matchDocs) ?
    f1.matchDocs : f2.matchDocs;

  // do or
  unsigned long f1I = 0, f1Size = f1.fragNums.size();
  unsigned long f2I = 0, f2Size = f2.fragNums.size();
  while (f1I < f1Size || f2I < f2Size) {
    // take from f2 when f1 is exhausted or f2's fragment is smaller
    if (f2I < f2Size &&
	(f1I >= f1Size ||
	 f1.fragNums[f1I] > f2.fragNums[f2I])) {
      // output f2I
      outFragData.fragNums.push_back (f2.fragNums[f2I]);
      if (needFragFreqs)
	outFragData.fragFreqs.push_back (f2.fragFreqs[f2I]);
      f2I++;

    // take from f1 when f2 is exhausted or f1's fragment is smaller
    } else if (f1I < f1Size &&
	       (f2I >= f2Size ||
		f1.fragNums[f1I] < f2.fragNums[f2I])) {
      // output f1I
      outFragData.fragNums.push_back (f1.fragNums[f1I]);
      if (needFragFreqs)
	outFragData.fragFreqs.push_back (f1.fragFreqs[f1I]);
      f1I++;

    } else {
      // must be equal combine f1I and f2I
      outFragData.fragNums.push_back (f1.fragNums[f1I]);
      if (needFragFreqs)
	outFragData.fragFreqs.push_back (f1.fragFreqs[f1I]+f2.fragFreqs[f2I]);
      f1I++;
      f2I++;
    }
  }
}
390
391
/// AND-combine `comFragData` into `fragData` in place, keeping only
/// fragments of `fragData` that lie within (comFragNum+startRange,
/// comFragNum+endRange] of some fragment in `comFragData` — and, when
/// `fragLimits` is given, within the same fragment range (so matches
/// cannot cross a tag boundary).  Surviving entries are rewritten
/// with the matching comFragData fragment number; frequencies take
/// the minimum of the two sides.  Both input lists are assumed
/// sorted ascending.
void AndCombineFragData (bool needFragFreqs,
			 FragData &fragData,
			 const FragData &comFragData,
			 signed long startRange,
			 signed long endRange,
			 const FragRangeArray *fragLimits) {
  // sanity check on range: ensure startRange <= endRange
  if (startRange > endRange) {
    signed long temp = endRange;
    endRange = startRange;
    startRange = temp;
  }

  // get min matchdocs
  if (comFragData.matchDocs < fragData.matchDocs)
    fragData.matchDocs = comFragData.matchDocs;

  unsigned long fragDataI = 0;
  unsigned long fragDataSize = fragData.fragNums.size();
  unsigned long comFragDataI = 0;
  unsigned long comFragDataSize = comFragData.fragNums.size();
  unsigned long fragLimitI = 0;
  unsigned long fragLimitSize = (fragLimits==NULL) ? 0 : (*fragLimits).size();
  // outI tracks how many surviving entries have been written back
  // into the front of fragData's arrays
  unsigned long outI = 0;

  while (fragDataI < fragDataSize &&
	 comFragDataI < comFragDataSize) {
    signed long fragNum = (signed long)fragData.fragNums[fragDataI];
    signed long comFragNum = (signed long)comFragData.fragNums[comFragDataI];

    // go to the right fragment limit (for the com frag)
    if (fragLimits != NULL) {
      while (fragLimitI+1 < fragLimitSize &&
	     comFragNum > (signed long)(*fragLimits)[fragLimitI+1].rangeStart) {
	fragLimitI++;
      }
    }

    // fragNum is below the window (or before the current tag range):
    // discard it and advance
    if (fragNum <= comFragNum+startRange ||
	(fragLimits!=NULL &&
	 fragNum<=(signed long)(*fragLimits)[fragLimitI].rangeStart)) {
      fragDataI++;

    // fragNum is above the window (or past the current tag range):
    // advance the com side
    } else if (fragNum > comFragNum+endRange ||
	       (fragLimits!=NULL &&
		fragNum>(signed long)(*fragLimits)[fragLimitI].rangeEnd)) {
      comFragDataI++;

    } else {
      // equal and within tag
      fragData.fragNums[outI] = comFragNum;
      if (needFragFreqs) {
	// keep the smaller of the two frequencies
	fragData.fragFreqs[outI] =
	  (fragData.fragFreqs[fragDataI] < comFragData.fragFreqs[comFragDataI]) ?
	  fragData.fragFreqs[fragDataI] : comFragData.fragFreqs[comFragDataI];
      }
      fragDataI++;
      comFragDataI++;
      outI++;
    }
  }

  // erase unused part of fragData
  fragData.fragNums.erase (fragData.fragNums.begin()+outI,
			   fragData.fragNums.end());
  if (needFragFreqs)
    fragData.fragFreqs.erase (fragData.fragFreqs.begin()+outI,
			      fragData.fragFreqs.end());
  else
    fragData.fragFreqs.erase (fragData.fragFreqs.begin(),
			      fragData.fragFreqs.end());
}
464
465
/// Convert a term's fragment-level occurrences into a document-level
/// QueryResult.  Fragments are mapped to documents at the current
/// query level; consecutive fragments in the same document are
/// accumulated into a per-document term frequency, and a tf*idf-style
/// rank (Wqt * Wdt) is produced when ranking is required.  When
/// queryInfo.needTermFreqs is set a TermFreqData summary for this
/// term is appended to result.termFreqs.
/// NOTE(review): assumes termData.matchDocs > 0 — a zero value would
/// make the log() argument infinite; presumably callers guarantee
/// the term matched at least one document.
void FragsToQueryResult (IndexData &indexData,
			 const QueryInfo &queryInfo,
			 const FragData &termData,
			 const UCArray &tag,
			 const UCArray &term,
			 unsigned long stemMethod,
			 unsigned long termWeight,
			 UCArrayVector &equivTerms,
			 QueryResult &result) {
  bool needRanks = (queryInfo.sortByRank || queryInfo.needRankInfo);

  result.Clear();

  // log (N / ft)
  unsigned long N = indexData.levels.levelInfo[indexData.curLevel].numEntries;
  float wordLog = log((double)N / (double)termData.matchDocs);

  // Wqt = fqt * log (N / ft)
  // note: terms are allowed to have a weight of zero so
  // they can be excluded from the ranking
  float Wqt = termWeight * wordLog;

  // Wdt = fdt * log (N / ft)
  float Wdt;

  unsigned long termDataI = 0;
  unsigned long termDataSize = termData.fragNums.size();
  unsigned long levelDocNum = 0;

  unsigned long termDocFreq = 0;
  unsigned long lastLevelDocNum = 0;
  unsigned long overallwordfreq = 0;

  while (termDataI < termDataSize) {
    // map this fragment to its document at the current level;
    // fragments that don't map are skipped entirely
    if (indexData.levelConverter.FragToLevel (termData.fragNums[termDataI],
					      levelDocNum)) {
      if (levelDocNum != lastLevelDocNum) {
	// document boundary: flush the accumulated previous document
	if (lastLevelDocNum > 0) {
	  // add this doc information
	  if (needRanks) {
	    Wdt = termDocFreq * wordLog;
	    result.ranks.push_back (Wqt * Wdt);
	  }
	  result.docs.push_back (lastLevelDocNum);
	}

	lastLevelDocNum = levelDocNum;
	termDocFreq = 0;
      }

      // frequencies are only accumulated when ranking is on, so
      // overallwordfreq stays zero otherwise (see note below)
      if (needRanks){
	termDocFreq += termData.fragFreqs[termDataI];
	overallwordfreq += termData.fragFreqs[termDataI];
      }
    }
    termDataI++;
  }

  if (lastLevelDocNum > 0) {
    // add the last document information
    if (needRanks) {
      Wdt = termDocFreq * wordLog;
      result.ranks.push_back (Wqt * Wdt);
    }
    result.docs.push_back (lastLevelDocNum);
  }

  // add the term frequency information
  if (queryInfo.needTermFreqs) {
    TermFreqData termFreqData;
    termFreqData.tag = tag;
    termFreqData.term = term;
    termFreqData.stemMethod = stemMethod;
    termFreqData.equivTerms = equivTerms;
    termFreqData.matchDocs = termData.matchDocs;
    termFreqData.termFreq = overallwordfreq; // will be zero if needRankInfo
    //not true
    result.termFreqs.push_back (termFreqData);
  }
}
546
/// AND a term's fragment occurrences into an existing QueryResult in
/// place: only documents already present in `result` that also
/// contain this term survive; their ranks are incremented by this
/// term's Wqt * Wdt contribution.  `result.docs` must be sorted
/// ascending (it is compacted in place via resultOutI).  As in
/// FragsToQueryResult, a TermFreqData summary is appended when
/// queryInfo.needTermFreqs is set.
/// NOTE(review): assumes termData.matchDocs > 0 (log() argument) and,
/// unlike FragsToQueryResult, accumulates overallwordfreq regardless
/// of needRanks — which indexes fragFreqs; presumably frequencies are
/// always present on this path.
void AndFragsToQueryResult (IndexData &indexData,
			    const QueryInfo &queryInfo,
			    const FragData &termData,
			    const UCArray &tag,
			    const UCArray &term,
			    unsigned long stemMethod,
			    unsigned long termWeight,
			    UCArrayVector &equivTerms,
			    QueryResult &result) {
  bool needRanks = (queryInfo.sortByRank || queryInfo.needRankInfo);

  // log (N / ft)
  float wordLog =
    log((double)indexData.levels.levelInfo[indexData.curLevel].numEntries/
	(double)termData.matchDocs);

  // Wqt = fqt * log (N / ft)
  // note: terms are allowed to have a weight of zero so
  // they can be excluded from the ranking
  float Wqt = termWeight * wordLog;

  // Wdt = fdt * log (N / ft)
  float Wdt;

  unsigned long termDataI = 0;
  unsigned long termDataSize = termData.fragNums.size();
  unsigned long levelDocNum = 0;

  unsigned long termDocFreq = 0;
  unsigned long lastLevelDocNum = 0;
  unsigned long overallwordfreq = 0;
  // resultI scans the existing docs; resultOutI writes survivors back
  // into the front of the result arrays
  unsigned long resultI = 0;
  unsigned long resultSize = result.docs.size();
  unsigned long resultOutI = 0;


  while (termDataI < termDataSize) {
    // map this fragment to its document at the current level
    if (indexData.levelConverter.FragToLevel (termData.fragNums[termDataI],
					      levelDocNum)) {
      if (levelDocNum != lastLevelDocNum) {
	// document boundary: try to merge the finished document into
	// the existing result set
	if (lastLevelDocNum > 0) {
	  // add this doc information
	  Wdt = termDocFreq * wordLog;

	  // find this document number
	  while (resultI < resultSize &&
		 result.docs[resultI] < lastLevelDocNum)
	    resultI++;

	  // store the result (only if the doc was already present)
	  if (resultI < resultSize && result.docs[resultI] == lastLevelDocNum) {
	    result.docs[resultOutI] = lastLevelDocNum;
	    if (needRanks)
	      result.ranks[resultOutI] = result.ranks[resultI] + Wqt * Wdt;
	    resultI++;
	    resultOutI++;
	  }
	}

	lastLevelDocNum = levelDocNum;
	termDocFreq = 0;
      }

      if (needRanks)
	termDocFreq += termData.fragFreqs[termDataI];
      overallwordfreq += termData.fragFreqs[termDataI];
    }

    termDataI++;
  } // while

  if (lastLevelDocNum > 0) {
    // add the last document information
    Wdt = termDocFreq * wordLog;

    // find this document number
    while (resultI < resultSize &&
	   result.docs[resultI] < lastLevelDocNum)
      resultI++;

    // store the result
    if (resultI < resultSize && result.docs[resultI] == lastLevelDocNum) {
      result.docs[resultOutI] = lastLevelDocNum;
      if (needRanks)
	result.ranks[resultOutI] = result.ranks[resultI] + Wqt * Wdt;
      resultI++;
      resultOutI++;
    }
  }

  // remove unneeded entries
  result.docs.erase (result.docs.begin()+resultOutI, result.docs.end());
  if (needRanks)
    result.ranks.erase (result.ranks.begin()+resultOutI, result.ranks.end());
  else
    result.ranks.erase (result.ranks.begin(), result.ranks.end());

  // add the term frequency information
  if (queryInfo.needTermFreqs) {
    TermFreqData termFreqData;
    termFreqData.tag = tag;
    termFreqData.term = term;
    termFreqData.stemMethod = stemMethod;
    termFreqData.equivTerms = equivTerms;
    termFreqData.matchDocs = termData.matchDocs;
    termFreqData.termFreq = overallwordfreq;
    result.termFreqs.push_back (termFreqData);
  }
}
656
657
/// Filter `result` in place, keeping only documents that also contain
/// the given term's fragments (same compaction scheme as
/// AndFragsToQueryResult, but ranks are carried through unchanged —
/// this term contributes no weight).  `result.docs` must be sorted
/// ascending.
void RemoveUnwantedResults (IndexData &indexData,
			    const QueryInfo &queryInfo,
			    const FragData &termData,
			    QueryResult &result) {
  bool needRanks = (queryInfo.sortByRank || queryInfo.needRankInfo);

  unsigned long termDataI = 0;
  unsigned long termDataSize = termData.fragNums.size();
  unsigned long levelDocNum = 0;

  unsigned long lastLevelDocNum = 0;

  // resultI scans the existing docs; resultOutI writes survivors back
  // into the front of the result arrays
  unsigned long resultI = 0;
  unsigned long resultSize = result.docs.size();
  unsigned long resultOutI = 0;

  while (termDataI < termDataSize) {
    // map this fragment to its document at the current level
    if (indexData.levelConverter.FragToLevel (termData.fragNums[termDataI],
					      levelDocNum)) {
      if (levelDocNum != lastLevelDocNum) {
	// document boundary: keep the previous document if it already
	// exists in the result set
	if (lastLevelDocNum > 0) {
	  // find this document number
	  while (resultI < resultSize &&
		 result.docs[resultI] < lastLevelDocNum)
	    resultI++;

	  // store the result
	  if (resultI < resultSize && result.docs[resultI] == lastLevelDocNum) {
	    result.docs[resultOutI] = lastLevelDocNum;
	    if (needRanks)
	      result.ranks[resultOutI] = result.ranks[resultI];
	    resultI++;
	    resultOutI++;
	  }
	}

	lastLevelDocNum = levelDocNum;
      }
    }

    termDataI++;
  }

  if (lastLevelDocNum > 0) {
    // find this document number
    while (resultI < resultSize &&
	   result.docs[resultI] < lastLevelDocNum)
      resultI++;

    // store the result
    if (resultI < resultSize && result.docs[resultI] == lastLevelDocNum) {
      result.docs[resultOutI] = lastLevelDocNum;
      if (needRanks)
	result.ranks[resultOutI] = result.ranks[resultI];
      resultI++;
      resultOutI++;
    }
  }

  // remove unneeded entries
  result.docs.erase (result.docs.begin()+resultOutI, result.docs.end());
  if (needRanks)
    result.ranks.erase (result.ranks.begin()+resultOutI, result.ranks.end());
  else
    result.ranks.erase (result.ranks.begin(), result.ranks.end());
}
724
725
726
727//--------------------------------------------------------------
728// functions to support full text browse
729
730void FindNearestWordNumber (IndexData &indexData,
731 const UCArray &term,
732 unsigned long &number) {
733
734 // find the word number for this term
735 unsigned long wordElNum = 0;
736 unsigned long numLevels = indexData.bdh.num_levels;
737 word_block_dict_el wordDictEl;
738 wordDictEl.SetNumLevels (numLevels);
739 if (NearestSearchWordBlockDictEl (indexData.dictFile, indexData.biWords,
740 indexData.bdh.entries_per_wblk,
741 indexData.bdh.word_dict_size,
742 numLevels, term, wordDictEl, wordElNum))
743 number = wordElNum;
744
745}
746
747void GetTermList(IndexData &indexData,
748 unsigned long startTerm,
749 unsigned long numTerms,
750 TermFreqArray &terms) {
751
752 word_block_dict_el_array wordBlocks; // = new word_block_dict_el_array();
753 TermFreqData termdata;
754
755 terms.erase(terms.begin(), terms.end());
756
757 SearchWordBlockDictElNumRange (indexData.dictFile, indexData.biWords,
758 indexData.bdh.entries_per_wblk,
759 indexData.bdh.word_dict_size,
760 indexData.bdh.num_levels, startTerm,
761 numTerms, wordBlocks);
762
763 word_block_dict_el_array::iterator here = wordBlocks.begin();
764 word_block_dict_el_array::iterator end = wordBlocks.end();
765
766 while (here != end) {
767 termdata.Clear();
768 termdata.term = (*here).el;
769 termdata.termFreq = (*here).freq;
770 terms.push_back(termdata);
771 here++;
772 }
773
774}
775
// Overload that fetches just the raw term text for `numTerms`
// dictionary entries starting at word number `startTerm`; the
// underlying range search fills `terms` directly.
void GetTermList(IndexData &indexData,
		 unsigned long startTerm,
		 unsigned long numTerms,
		 UCArrayVector &terms) {



  SearchWordBlockDictElNumRange (indexData.dictFile, indexData.biWords,
				 indexData.bdh.entries_per_wblk,
				 indexData.bdh.word_dict_size,
				 indexData.bdh.num_levels, startTerm,
				 numTerms, terms);

}
790
791
792
Note: See TracBrowser for help on using the repository browser.