source: trunk/gsdl/src/mgpp/text/Terms.cpp@ 1847

Last change on this file since 1847 was 1847, checked in by kjm18, 23 years ago

added a QueryResult::printShort routine - outputs termfreq stuff, but not
the doc results (used for timing expts)

  • Property svn:keywords set to Author Date Id Revision
File size: 21.5 KB
Line 
1/**************************************************************************
2 *
3 * Terms.cpp -- Query related functions
4 * Copyright (C) 1999 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: Terms.cpp 1847 2001-01-22 01:47:56Z kjm18 $
21 *
22 **************************************************************************/
23
#include "Terms.h"

#include <string.h>   // memcpy

#include "words.h"
#include "stemmer.h"
#include "bitio_gen.h"
#include "bitio_m_stdio.h"
29
30void QueryInfo::Clear () {
31 UCArrayClear (docLevel);
32 maxDocs = 0;
33 sortByRank = true;
34 exactWeights = false;
35 needRankInfo = false;
36 needTermFreqs = false;
37}
38
39
40
41void TermFreqData::Clear () {
42 UCArrayClear (tag);
43 UCArrayClear (term);
44 equivTerms.erase(equivTerms.begin(), equivTerms.end());
45 stemMethod = 0;
46 matchDocs = 0;
47 termFreq = 0;
48}
49
50ostream &operator<< (ostream &s, const TermFreqData &t) {
51 s << "<" << t.tag << ">\"" << t.term << "\"stem("
52 << t.stemMethod << ")equiv terms(";
53
54 unsigned long i;
55 for (i=0; i<t.equivTerms.size(); i++) {
56 s << t.equivTerms[i] << ", ";
57 }
58 s <<")docs(" << t.matchDocs << ")"
59 << "count("<<t.termFreq<<")";
60 return s;
61}
62
63bool operator== (const TermFreqData &t1, const TermFreqData &t2) {
64 return ((t1.tag == t2.tag) &&
65 (t1.term == t2.term) &&
66 (t1.stemMethod == t2.stemMethod) &&
67 (t1.equivTerms == t2.equivTerms) &&
68 (t1.matchDocs == t2.matchDocs) &&
69 (t1.termFreq == t2.termFreq));
70}
71
72
73void QueryResult::Clear () {
74 docs.erase (docs.begin(), docs.end());
75 ranks.erase (ranks.begin(), ranks.end());
76 termFreqs.erase (termFreqs.begin(), termFreqs.end());
77 actualNumDocs = 0;
78}
79
80QueryResult::QueryResult () {
81 Clear ();
82}
83
84void QueryResult::printShort(ostream &s) {
85
86 s << "termFreqs: ";
87 for (unsigned long i=0; i<termFreqs.size(); i++)
88 s << termFreqs[i] << ", ";
89
90 s << "\nactual number of docs found: " << actualNumDocs;
91 s << "\n\n";
92
93}
94
95
96ostream &operator<< (ostream &s, const QueryResult &r) {
97 s << "docs: ";
98 unsigned long i;
99 for (i=0; i<r.docs.size(); i++)
100 s << r.docs[i] << ", ";
101
102 s << "\nranks: ";
103 for (i=0; i<r.ranks.size(); i++)
104 s << r.ranks[i] << ", ";
105
106 s << "\ntermFreqs: ";
107 for (i=0; i<r.termFreqs.size(); i++)
108 s << r.termFreqs[i] << ", ";
109
110 s << "\nactual number of docs found: " << r.actualNumDocs;
111 s << "\n\n";
112
113 return s;
114}
115
116
117bool operator== (const QueryResult &r1, const QueryResult &r2) {
118 return ((r1.docs == r2.docs) &&
119 (r1.ranks == r2.ranks) &&
120 (r1.termFreqs == r2.termFreqs) &&
121 (r1.actualNumDocs == r2.actualNumDocs));
122}
123
124//---------------------------------------------------
125// new ExtQueryResult stuff
126void ExtQueryResult::Clear () {
127 docs.erase (docs.begin(), docs.end());
128 levels.erase (levels.begin(), levels.end());
129 ranks.erase (ranks.begin(), ranks.end());
130 termFreqs.erase (termFreqs.begin(), termFreqs.end());
131 actualNumDocs = 0;
132}
133
134ExtQueryResult::ExtQueryResult () {
135 Clear ();
136}
137
138ostream &operator<< (ostream &s, const ExtQueryResult &r) {
139 s << "docs: ";
140 unsigned long i;
141 for (i=0; i<r.docs.size(); i++)
142 s << r.docs[i] << ", ";
143
144 s << "\nlevels: ";
145 for (i=0; i<r.levels.size(); i++)
146 s << r.levels[i] << ", ";
147
148
149 s << "\nranks: ";
150 for (i=0; i<r.ranks.size(); i++)
151 s << r.ranks[i] << ", ";
152
153 s << "\ntermFreqs: ";
154 for (i=0; i<r.termFreqs.size(); i++)
155 s << r.termFreqs[i] << ", ";
156 s << "\nactual number of docs found: " << r.actualNumDocs;
157 s << "\n\n";
158
159 return s;
160}
161
162
163bool operator== (const ExtQueryResult &r1, const ExtQueryResult &r2) {
164 return ((r1.docs == r2.docs) &&
165 (r1.levels == r2.levels) &&
166 (r1.ranks == r2.ranks) &&
167 (r1.termFreqs == r2.termFreqs) &&
168 (r1.actualNumDocs == r2.actualNumDocs));
169}
170
171//-------------------------------------------------------
172// new BrowseQueryResult stuff
173void BrowseQueryResult::Clear () {
174 termFreqs.erase (termFreqs.begin(), termFreqs.end());
175}
176
177BrowseQueryResult::BrowseQueryResult () {
178 Clear ();
179}
180
181
182
183ostream &operator<< (ostream &s, const BrowseQueryResult &r) {
184 s << "terms: ";
185 unsigned long i;
186 for (i=0; i<r.termFreqs.size(); i++)
187 s << r.termFreqs[i] << ", ";
188 s << "\n\n";
189 return s;
190}
191
192
193bool operator== (const BrowseQueryResult &r1, const BrowseQueryResult &r2) {
194 return ((r1.termFreqs == r2.termFreqs));
195
196}
197
198
199
200
201//--------------------------------------
202void FragData::Clear () {
203 matchDocs = 0;
204 fragNums.erase (fragNums.begin(), fragNums.end());
205 fragFreqs.erase (fragFreqs.begin(), fragFreqs.end());
206}
207
208
209
210
211void FindWordNumbers (IndexData &indexData,
212 const UCArray &term,
213 unsigned long stemMethod,
214 vector<unsigned long> &equivWords) {
215 equivWords.erase (equivWords.begin(), equivWords.end());
216
217 if (stemMethod == 0) {
218 // don't need to stem the word,
219 // find the word number for this term
220 unsigned long wordElNum = 0;
221 unsigned long numLevels = indexData.bdh.num_levels;
222 word_block_dict_el wordDictEl;
223 wordDictEl.SetNumLevels (numLevels);
224 if (SearchWordBlockDictEl (indexData.dictFile, indexData.biWords,
225 indexData.bdh.entries_per_wblk,
226 indexData.bdh.word_dict_size,
227 numLevels, term, wordDictEl, wordElNum))
228 equivWords.push_back (wordElNum);
229
230 return;
231
232 }
233
234
235 // need to stem this word and find it in the blocked stem index
236
237 unsigned char mgWord[MAXSTEMLEN + 1];
238 UCArray stemTerm;
239 unsigned long stemmerNum = 0;
240
241 if (stemMethod == 1) stemmerNum = indexData.sih1.stemmer_num;
242 else if (stemMethod == 2) stemmerNum = indexData.sih2.stemmer_num;
243 else if (stemMethod == 3) stemmerNum = indexData.sih3.stemmer_num;
244
245
246 // convert the word to an "mg word"
247 mgWord[0] = term.size();
248 bcopy ((char *)term.begin(), (char *)&mgWord[1], term.size());
249
250 // stem the word
251 stemmer (stemMethod, stemmerNum, mgWord);
252
253 // convert the result back to a UCArray
254 stemTerm.insert (stemTerm.end(), &mgWord[1], &mgWord[1] + mgWord[0]);
255
256 // need to look up this term in the appropriate dictionary
257 stem_block_dict_el stemDictEl;
258 unsigned long stemElNum;
259 bool result = false;
260 if (stemMethod == 1) {
261 result = SearchStemBlockDictEl (indexData.stem1File,
262 indexData.sii1,
263 indexData.sih1.entries_per_block,
264 indexData.sih1.dict_size,
265 stemTerm,
266 stemDictEl,
267 stemElNum);
268
269 } else if (stemMethod == 2) {
270 result = SearchStemBlockDictEl (indexData.stem2File,
271 indexData.sii2,
272 indexData.sih2.entries_per_block,
273 indexData.sih2.dict_size,
274 stemTerm,
275 stemDictEl,
276 stemElNum);
277
278 } else if (stemMethod == 3) {
279 result = SearchStemBlockDictEl (indexData.stem3File,
280 indexData.sii3,
281 indexData.sih3.entries_per_block,
282 indexData.sih3.dict_size,
283 stemTerm,
284 stemDictEl,
285 stemElNum);
286 }
287
288 if (result) {
289 equivWords = stemDictEl.equivWords;
290 }
291}
292
293
294
// Read the inverted-file entry for word number termNum into fragData.
//
// Looks the word up by number in the word dictionary, then decodes its
// posting list (fragment numbers, optionally per-fragment frequencies)
// from the inverted file.  Only fragments falling inside fragLimits are
// kept (pass NULL for no restriction).  Per-fragment frequencies are
// recorded only when needFragFreqs is true.  termWord receives the
// dictionary spelling of the word.  fragData is cleared first and left
// empty if the word number is not found.
void ReadTermFragData (IndexData &indexData,
		       bool needFragFreqs,
		       unsigned long termNum,
		       FragData &fragData,
		       FragRangeArray *fragLimits,
		       UCArray & termWord) {
  fragData.Clear();

  // look up the word in the dictionary
  unsigned long numLevels = indexData.bdh.num_levels;
  word_block_dict_el wordDictEl;
  wordDictEl.SetNumLevels (numLevels);
  if (!SearchWordBlockDictElNum (indexData.dictFile,
				 indexData.biWords,
				 indexData.bdh.entries_per_wblk,
				 indexData.bdh.word_dict_size,
				 numLevels,
				 termNum, wordDictEl))
    return; // nothing more to do

  // matchDocs is this word's frequency at the current retrieval level
  fragData.matchDocs = wordDictEl.levelFreqs[indexData.curLevelNum];
  termWord = wordDictEl.el;
  // seek to the appropriate place in the inverted file
  fseek (indexData.invfFile, wordDictEl.invf_ptr, SEEK_SET);
  stdio_bitio_buffer buffer (indexData.invfFile);

  // Bblock parameter for decoding the fragment-gap codes
  unsigned long B = BIO_Bblock_Init (indexData.bdh.num_frags,
				     wordDictEl.frag_occur);
  unsigned long fragNum = 0;
  unsigned long termFreq = 0;

  unsigned long fragLimitI = 0;
  unsigned long i;
  for (i=0; i<wordDictEl.frag_occur; i++) {
    // posting list stores gaps between fragment numbers
    fragNum += buffer.bblock_decode (B, NULL);
    // a word-level index implies one occurrence per fragment entry;
    // otherwise the per-fragment count is gamma-coded after the gap
    if (!indexData.ifh.word_level_index) termFreq = buffer.gamma_decode (NULL);
    else termFreq = 1;

    // get the right fragment range
    if (fragLimits != NULL) {
      while (fragLimitI+1 < (*fragLimits).size() &&
	     fragNum > (*fragLimits)[fragLimitI+1].rangeStart) {
	fragLimitI++;
      }
    }

    // add the entry if it is within the limits
    if ((fragLimits == NULL) ||
	(fragLimitI < (*fragLimits).size() &&
	 fragNum > (*fragLimits)[fragLimitI].rangeStart &&
	 fragNum <= (*fragLimits)[fragLimitI].rangeEnd)) {
      fragData.fragNums.push_back (fragNum);
      if (needFragFreqs)
	fragData.fragFreqs.push_back (termFreq);
    }
  }

  buffer.done();
}
354
355
356void CombineFragData (bool needFragFreqs,
357 const FragData &f1,
358 const FragData &f2,
359 FragData &outFragData) {
360 outFragData.Clear();
361
362 // the new number of matching documents is the maximum
363 // of the two input matching number of documents -- it
364 // is assumed that these are at the same document level
365 outFragData.matchDocs = (f1.matchDocs > f2.matchDocs) ?
366 f1.matchDocs : f2.matchDocs;
367
368 // do or
369 unsigned long f1I = 0, f1Size = f1.fragNums.size();
370 unsigned long f2I = 0, f2Size = f2.fragNums.size();
371 while (f1I < f1Size || f2I < f2Size) {
372 if (f2I < f2Size &&
373 (f1I >= f1Size ||
374 f1.fragNums[f1I] > f2.fragNums[f2I])) {
375 // output f2I
376 outFragData.fragNums.push_back (f2.fragNums[f2I]);
377 if (needFragFreqs)
378 outFragData.fragFreqs.push_back (f2.fragFreqs[f2I]);
379 f2I++;
380
381 } else if (f1I < f1Size &&
382 (f2I >= f2Size ||
383 f1.fragNums[f1I] < f2.fragNums[f2I])) {
384 // output f1I
385 outFragData.fragNums.push_back (f1.fragNums[f1I]);
386 if (needFragFreqs)
387 outFragData.fragFreqs.push_back (f1.fragFreqs[f1I]);
388 f1I++;
389
390 } else {
391 // must be equal combine f1I and f2I
392 outFragData.fragNums.push_back (f1.fragNums[f1I]);
393 if (needFragFreqs)
394 outFragData.fragFreqs.push_back (f1.fragFreqs[f1I]+f2.fragFreqs[f2I]);
395 f1I++;
396 f2I++;
397 }
398 }
399}
400
401
402void AndCombineFragData (bool needFragFreqs,
403 FragData &fragData,
404 const FragData &comFragData,
405 signed long startRange,
406 signed long endRange,
407 const FragRangeArray *fragLimits) {
408 // sanity check on range
409 if (startRange > endRange) {
410 signed long temp = endRange;
411 endRange = startRange;
412 startRange = temp;
413 }
414
415 // get min matchdocs
416 if (comFragData.matchDocs < fragData.matchDocs)
417 fragData.matchDocs = comFragData.matchDocs;
418
419 unsigned long fragDataI = 0;
420 unsigned long fragDataSize = fragData.fragNums.size();
421 unsigned long comFragDataI = 0;
422 unsigned long comFragDataSize = comFragData.fragNums.size();
423 unsigned long fragLimitI = 0;
424 unsigned long fragLimitSize = (fragLimits==NULL) ? 0 : (*fragLimits).size();
425 unsigned long outI = 0;
426
427 while (fragDataI < fragDataSize &&
428 comFragDataI < comFragDataSize) {
429 signed long fragNum = (signed long)fragData.fragNums[fragDataI];
430 signed long comFragNum = (signed long)comFragData.fragNums[comFragDataI];
431
432 // go to the right fragment limit (for the com frag)
433 if (fragLimits != NULL) {
434 while (fragLimitI+1 < fragLimitSize &&
435 comFragNum > (signed long)(*fragLimits)[fragLimitI+1].rangeStart) {
436 fragLimitI++;
437 }
438 }
439
440 if (fragNum <= comFragNum+startRange ||
441 (fragLimits!=NULL &&
442 fragNum<=(signed long)(*fragLimits)[fragLimitI].rangeStart)) {
443 fragDataI++;
444
445 } else if (fragNum > comFragNum+endRange ||
446 (fragLimits!=NULL &&
447 fragNum>(signed long)(*fragLimits)[fragLimitI].rangeEnd)) {
448 comFragDataI++;
449
450 } else {
451 // equal and within tag
452 fragData.fragNums[outI] = comFragNum;
453 if (needFragFreqs) {
454 fragData.fragFreqs[outI] =
455 (fragData.fragFreqs[fragDataI] < comFragData.fragFreqs[comFragDataI]) ?
456 fragData.fragFreqs[fragDataI] : comFragData.fragFreqs[comFragDataI];
457 }
458 fragDataI++;
459 comFragDataI++;
460 outI++;
461 }
462 }
463
464 // erase unused part of fragData
465 fragData.fragNums.erase (fragData.fragNums.begin()+outI,
466 fragData.fragNums.end());
467 if (needFragFreqs)
468 fragData.fragFreqs.erase (fragData.fragFreqs.begin()+outI,
469 fragData.fragFreqs.end());
470 else
471 fragData.fragFreqs.erase (fragData.fragFreqs.begin(),
472 fragData.fragFreqs.end());
473}
474
475
// Convert a term's fragment-level data into document-level results.
//
// Fragments are mapped to documents at the current retrieval level;
// consecutive fragments in the same document are collapsed into one
// docs[] entry.  When ranking is wanted, each document gets a TF-IDF
// style score Wqt*Wdt where Wqt = termWeight*log(N/ft) and
// Wdt = fdt*log(N/ft).  result is cleared first; if requested, a
// TermFreqData entry describing this term is appended.
void FragsToQueryResult (IndexData &indexData,
			 const QueryInfo &queryInfo,
			 const FragData &termData,
			 const UCArray &tag,
			 const UCArray &term,
			 unsigned long stemMethod,
			 unsigned long termWeight,
			 UCArrayVector &equivTerms,
			 QueryResult &result) {
  bool needRanks = (queryInfo.sortByRank || queryInfo.needRankInfo);

  result.Clear();

  // log (N / ft) -- N documents at this level, ft matching this term
  unsigned long N = indexData.levels.levelInfo[indexData.curLevel].numEntries;
  float wordLog = log((double)N / (double)termData.matchDocs);

  // Wqt = fqt * log (N / ft)
  // note: terms are allowed to have a weight of zero so
  // they can be excluded from the ranking
  float Wqt = termWeight * wordLog;

  // Wdt = fdt * log (N / ft)
  float Wdt;

  unsigned long termDataI = 0;
  unsigned long termDataSize = termData.fragNums.size();
  unsigned long levelDocNum = 0;

  unsigned long termDocFreq = 0;      // fdt for the document being built
  unsigned long lastLevelDocNum = 0;  // 0 = no document in progress
  unsigned long overallwordfreq = 0;

  while (termDataI < termDataSize) {
    if (indexData.levelConverter.FragToLevel (termData.fragNums[termDataI],
					      levelDocNum)) {
      if (levelDocNum != lastLevelDocNum) {
	if (lastLevelDocNum > 0) {
	  // add this doc information
	  if (needRanks) {
	    Wdt = termDocFreq * wordLog;
	    result.ranks.push_back (Wqt * Wdt);
	  }
	  result.docs.push_back (lastLevelDocNum);
	}

	lastLevelDocNum = levelDocNum;
	termDocFreq = 0;
      }

      // NOTE: frequencies are only accumulated when ranks are needed,
      // so the termFreq reported below is zero otherwise (see comment
      // at the bottom of this function)
      if (needRanks){
	termDocFreq += termData.fragFreqs[termDataI];
	overallwordfreq += termData.fragFreqs[termDataI];
      }
    }
    termDataI++;
  }

  if (lastLevelDocNum > 0) {
    // add the last document information
    if (needRanks) {
      Wdt = termDocFreq * wordLog;
      result.ranks.push_back (Wqt * Wdt);
    }
    result.docs.push_back (lastLevelDocNum);
  }

  // add the term frequency information
  if (queryInfo.needTermFreqs) {
    TermFreqData termFreqData;
    termFreqData.tag = tag;
    termFreqData.term = term;
    termFreqData.stemMethod = stemMethod;
    termFreqData.equivTerms = equivTerms;
    termFreqData.matchDocs = termData.matchDocs;
    termFreqData.termFreq = overallwordfreq; // will be zero if needRankInfo
    //not true
    result.termFreqs.push_back (termFreqData);
  }
}
556
// AND a term's fragment-level data into an existing QueryResult.
//
// Like FragsToQueryResult, but instead of producing a fresh result it
// intersects: only documents already present in result.docs survive,
// compacted in place, and (when ranking) this term's Wqt*Wdt is added
// to each surviving document's existing rank.  Both result.docs and the
// documents derived from termData are processed in ascending order.
// If requested, a TermFreqData entry for this term is appended.
void AndFragsToQueryResult (IndexData &indexData,
			    const QueryInfo &queryInfo,
			    const FragData &termData,
			    const UCArray &tag,
			    const UCArray &term,
			    unsigned long stemMethod,
			    unsigned long termWeight,
			    UCArrayVector &equivTerms,
			    QueryResult &result) {
  bool needRanks = (queryInfo.sortByRank || queryInfo.needRankInfo);

  // log (N / ft)
  float wordLog =
    log((double)indexData.levels.levelInfo[indexData.curLevel].numEntries/
	(double)termData.matchDocs);

  // Wqt = fqt * log (N / ft)
  // note: terms are allowed to have a weight of zero so
  // they can be excluded from the ranking
  float Wqt = termWeight * wordLog;

  // Wdt = fdt * log (N / ft)
  float Wdt;

  unsigned long termDataI = 0;
  unsigned long termDataSize = termData.fragNums.size();
  unsigned long levelDocNum = 0;

  unsigned long termDocFreq = 0;      // fdt for the document being built
  unsigned long lastLevelDocNum = 0;  // 0 = no document in progress
  unsigned long overallwordfreq = 0;
  unsigned long resultI = 0;          // read cursor into the existing result
  unsigned long resultSize = result.docs.size();
  unsigned long resultOutI = 0;       // write cursor (in-place compaction)


  while (termDataI < termDataSize) {
    if (indexData.levelConverter.FragToLevel (termData.fragNums[termDataI],
					      levelDocNum)) {
      if (levelDocNum != lastLevelDocNum) {
	if (lastLevelDocNum > 0) {
	  // add this doc information
	  Wdt = termDocFreq * wordLog;

	  // find this document number in the existing results
	  while (resultI < resultSize &&
		 result.docs[resultI] < lastLevelDocNum)
	    resultI++;

	  // store the result only if the document was already present
	  if (resultI < resultSize && result.docs[resultI] == lastLevelDocNum) {
	    result.docs[resultOutI] = lastLevelDocNum;
	    if (needRanks)
	      result.ranks[resultOutI] = result.ranks[resultI] + Wqt * Wdt;
	    resultI++;
	    resultOutI++;
	  }
	}

	lastLevelDocNum = levelDocNum;
	termDocFreq = 0;
      }

      if (needRanks)
	termDocFreq += termData.fragFreqs[termDataI];
      // unlike FragsToQueryResult, the overall frequency is always kept
      overallwordfreq += termData.fragFreqs[termDataI];
    }

    termDataI++;
  } // while

  if (lastLevelDocNum > 0) {
    // add the last document information
    Wdt = termDocFreq * wordLog;

    // find this document number
    while (resultI < resultSize &&
	   result.docs[resultI] < lastLevelDocNum)
      resultI++;

    // store the result
    if (resultI < resultSize && result.docs[resultI] == lastLevelDocNum) {
      result.docs[resultOutI] = lastLevelDocNum;
      if (needRanks)
	result.ranks[resultOutI] = result.ranks[resultI] + Wqt * Wdt;
      resultI++;
      resultOutI++;
    }
  }

  // remove unneeded entries (documents that did not match this term)
  result.docs.erase (result.docs.begin()+resultOutI, result.docs.end());
  if (needRanks)
    result.ranks.erase (result.ranks.begin()+resultOutI, result.ranks.end());
  else
    result.ranks.erase (result.ranks.begin(), result.ranks.end());

  // add the term frequency information
  if (queryInfo.needTermFreqs) {
    TermFreqData termFreqData;
    termFreqData.tag = tag;
    termFreqData.term = term;
    termFreqData.stemMethod = stemMethod;
    termFreqData.equivTerms = equivTerms;
    termFreqData.matchDocs = termData.matchDocs;
    termFreqData.termFreq = overallwordfreq;
    result.termFreqs.push_back (termFreqData);
  }
}
666
667
// Filter result down to the documents covered by termData's fragments.
//
// Maps termData's fragments to document numbers at the current level and
// keeps (in place) only those result.docs entries that also appear among
// them; ranks are kept in step when ranking info is wanted, and discarded
// entirely otherwise.  Both sequences must be in ascending document order.
void RemoveUnwantedResults (IndexData &indexData,
			    const QueryInfo &queryInfo,
			    const FragData &termData,
			    QueryResult &result) {
  bool needRanks = (queryInfo.sortByRank || queryInfo.needRankInfo);

  unsigned long termDataI = 0;
  unsigned long termDataSize = termData.fragNums.size();
  unsigned long levelDocNum = 0;

  unsigned long lastLevelDocNum = 0;  // 0 = no document seen yet

  unsigned long resultI = 0;          // read cursor into result
  unsigned long resultSize = result.docs.size();
  unsigned long resultOutI = 0;       // write cursor (in-place compaction)

  while (termDataI < termDataSize) {
    if (indexData.levelConverter.FragToLevel (termData.fragNums[termDataI],
					      levelDocNum)) {
      if (levelDocNum != lastLevelDocNum) {
	if (lastLevelDocNum > 0) {
	  // find this document number in the existing results
	  while (resultI < resultSize &&
		 result.docs[resultI] < lastLevelDocNum)
	    resultI++;

	  // keep the entry only if the document was already present
	  if (resultI < resultSize && result.docs[resultI] == lastLevelDocNum) {
	    result.docs[resultOutI] = lastLevelDocNum;
	    if (needRanks)
	      result.ranks[resultOutI] = result.ranks[resultI];
	    resultI++;
	    resultOutI++;
	  }
	}

	lastLevelDocNum = levelDocNum;
      }
    }

    termDataI++;
  }

  if (lastLevelDocNum > 0) {
    // handle the final document
    while (resultI < resultSize &&
	   result.docs[resultI] < lastLevelDocNum)
      resultI++;

    // store the result
    if (resultI < resultSize && result.docs[resultI] == lastLevelDocNum) {
      result.docs[resultOutI] = lastLevelDocNum;
      if (needRanks)
	result.ranks[resultOutI] = result.ranks[resultI];
      resultI++;
      resultOutI++;
    }
  }

  // remove unneeded entries
  result.docs.erase (result.docs.begin()+resultOutI, result.docs.end());
  if (needRanks)
    result.ranks.erase (result.ranks.begin()+resultOutI, result.ranks.end());
  else
    result.ranks.erase (result.ranks.begin(), result.ranks.end());
}
734
735
736
737//--------------------------------------------------------------
738// functions to support full text browse
739
740void FindNearestWordNumber (IndexData &indexData,
741 const UCArray &term,
742 unsigned long &number) {
743
744 // find the word number for this term
745 unsigned long wordElNum = 0;
746 unsigned long numLevels = indexData.bdh.num_levels;
747 word_block_dict_el wordDictEl;
748 wordDictEl.SetNumLevels (numLevels);
749 if (NearestSearchWordBlockDictEl (indexData.dictFile, indexData.biWords,
750 indexData.bdh.entries_per_wblk,
751 indexData.bdh.word_dict_size,
752 numLevels, term, wordDictEl, wordElNum))
753 number = wordElNum;
754
755}
756
757void GetTermList(IndexData &indexData,
758 unsigned long startTerm,
759 unsigned long numTerms,
760 TermFreqArray &terms) {
761
762 word_block_dict_el_array wordBlocks; // = new word_block_dict_el_array();
763 TermFreqData termdata;
764
765 terms.erase(terms.begin(), terms.end());
766
767 SearchWordBlockDictElNumRange (indexData.dictFile, indexData.biWords,
768 indexData.bdh.entries_per_wblk,
769 indexData.bdh.word_dict_size,
770 indexData.bdh.num_levels, startTerm,
771 numTerms, wordBlocks);
772
773 word_block_dict_el_array::iterator here = wordBlocks.begin();
774 word_block_dict_el_array::iterator end = wordBlocks.end();
775
776 while (here != end) {
777 termdata.Clear();
778 termdata.term = (*here).el;
779 termdata.termFreq = (*here).freq;
780 terms.push_back(termdata);
781 here++;
782 }
783
784}
785
786void GetTermList(IndexData &indexData,
787 unsigned long startTerm,
788 unsigned long numTerms,
789 UCArrayVector &terms) {
790
791
792
793 SearchWordBlockDictElNumRange (indexData.dictFile, indexData.biWords,
794 indexData.bdh.entries_per_wblk,
795 indexData.bdh.word_dict_size,
796 indexData.bdh.num_levels, startTerm,
797 numTerms, terms);
798
799}
800
801
802
Note: See TracBrowser for help on using the repository browser.