source: trunk/indexers/mgpp/text/Terms.cpp@ 8692

Last change on this file since 8692 was 8692, checked in by kjdon, 19 years ago

Added the changes from Emanuel Dejanu (Simple Words) - mostly efficiency changes. For example, changing i++ to ++i, delete xxx to delete []xxx, some stuff to do with UCArrays...

  • Property svn:keywords set to Author Date Id Revision
File size: 21.8 KB
Line 
1/**************************************************************************
2 *
3 * Terms.cpp -- Query related functions
4 * Copyright (C) 1999 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22#include "Terms.h"
23#include "words.h"
24#include "stemmer.h"
25#include "bitio_gen.h"
26#include "bitio_m_stdio.h"
27
28void QueryInfo::Clear () {
29 UCArrayClear (docLevel);
30 maxDocs = 0;
31 sortByRank = true;
32 exactWeights = false;
33 needRankInfo = false;
34 needTermFreqs = false;
35}
36
37
38
39void TermFreqData::Clear () {
40 UCArrayClear (tag);
41 UCArrayClear (term);
42 equivTerms.erase(equivTerms.begin(), equivTerms.end());
43 stemMethod = 0;
44 matchDocs = 0;
45 termFreq = 0;
46}
47
48ostream &operator<< (ostream &s, const TermFreqData &t) {
49 s << "<" << t.tag << ">\"" << t.term << "\"stem("
50 << t.stemMethod << ")equiv terms(";
51
52 unsigned long i;
53 for (i=0; i<t.equivTerms.size(); ++i) {
54 s << t.equivTerms[i] << ", ";
55 }
56 s <<")docs(" << t.matchDocs << ")"
57 << "count("<<t.termFreq<<")";
58 return s;
59}
60
61bool operator== (const TermFreqData &t1, const TermFreqData &t2) {
62 return ((t1.tag == t2.tag) &&
63 (t1.term == t2.term) &&
64 (t1.stemMethod == t2.stemMethod) &&
65 (t1.equivTerms == t2.equivTerms) &&
66 (t1.matchDocs == t2.matchDocs) &&
67 (t1.termFreq == t2.termFreq));
68}
69
70
71void QueryResult::Clear () {
72 docs.erase (docs.begin(), docs.end());
73 ranks.erase (ranks.begin(), ranks.end());
74 termFreqs.erase (termFreqs.begin(), termFreqs.end());
75 actualNumDocs = 0;
76}
77
// Construct an empty result set (all vectors empty, zero doc count).
QueryResult::QueryResult () {
  Clear ();
}
81
82void QueryResult::printShort(ostream &s) {
83
84 s << "termFreqs: ";
85 for (unsigned long i=0; i<termFreqs.size(); ++i)
86 s << termFreqs[i] << ", ";
87
88 s << "\nactual number of docs found: " << actualNumDocs;
89 s << "\n\n";
90
91}
92
93
94ostream &operator<< (ostream &s, const QueryResult &r) {
95 s << "docs: ";
96 unsigned long i;
97 for (i=0; i<r.docs.size(); ++i)
98 s << r.docs[i] << ", ";
99
100 s << "\nranks: ";
101 for (i=0; i<r.ranks.size(); ++i)
102 s << r.ranks[i] << ", ";
103
104 s << "\ntermFreqs: ";
105 for (i=0; i<r.termFreqs.size(); ++i)
106 s << r.termFreqs[i] << ", ";
107
108 s << "\nactual number of docs found: " << r.actualNumDocs;
109 s << "\n\n";
110
111 return s;
112}
113
114
115bool operator== (const QueryResult &r1, const QueryResult &r2) {
116 return ((r1.docs == r2.docs) &&
117 (r1.ranks == r2.ranks) &&
118 (r1.termFreqs == r2.termFreqs) &&
119 (r1.actualNumDocs == r2.actualNumDocs));
120}
121
122//---------------------------------------------------
123// new ExtQueryResult stuff
124void ExtQueryResult::Clear () {
125 docs.erase (docs.begin(), docs.end());
126 levels.erase (levels.begin(), levels.end());
127 ranks.erase (ranks.begin(), ranks.end());
128 termFreqs.erase (termFreqs.begin(), termFreqs.end());
129 actualNumDocs = 0;
130}
131
// Construct an empty extended result set (all vectors empty, zero
// doc count).
ExtQueryResult::ExtQueryResult () {
  Clear ();
}
135
136ostream &operator<< (ostream &s, const ExtQueryResult &r) {
137 s << "docs: ";
138 unsigned long i;
139 for (i=0; i<r.docs.size(); ++i)
140 s << r.docs[i] << ", ";
141
142 s << "\nlevels: ";
143 for (i=0; i<r.levels.size(); ++i)
144 s << r.levels[i] << ", ";
145
146
147 s << "\nranks: ";
148 for (i=0; i<r.ranks.size(); ++i)
149 s << r.ranks[i] << ", ";
150
151 s << "\ntermFreqs: ";
152 for (i=0; i<r.termFreqs.size(); ++i)
153 s << r.termFreqs[i] << ", ";
154 s << "\nactual number of docs found: " << r.actualNumDocs;
155 s << "\n\n";
156
157 return s;
158}
159
160
161bool operator== (const ExtQueryResult &r1, const ExtQueryResult &r2) {
162 return ((r1.docs == r2.docs) &&
163 (r1.levels == r2.levels) &&
164 (r1.ranks == r2.ranks) &&
165 (r1.termFreqs == r2.termFreqs) &&
166 (r1.actualNumDocs == r2.actualNumDocs));
167}
168
169//-------------------------------------------------------
170// new BrowseQueryResult stuff
171void BrowseQueryResult::Clear () {
172 termFreqs.erase (termFreqs.begin(), termFreqs.end());
173}
174
// Construct an empty browse result (no term entries).
BrowseQueryResult::BrowseQueryResult () {
  Clear ();
}
178
179
180
181ostream &operator<< (ostream &s, const BrowseQueryResult &r) {
182 s << "terms: ";
183 unsigned long i;
184 for (i=0; i<r.termFreqs.size(); ++i)
185 s << r.termFreqs[i] << ", ";
186 s << "\n\n";
187 return s;
188}
189
190
191bool operator== (const BrowseQueryResult &r1, const BrowseQueryResult &r2) {
192 return ((r1.termFreqs == r2.termFreqs));
193
194}
195
196
197
198
199//--------------------------------------
200void FragData::Clear () {
201 matchDocs = 0;
202 fragNums.erase (fragNums.begin(), fragNums.end());
203 fragFreqs.erase (fragFreqs.begin(), fragFreqs.end());
204}
205
206
207void FindWordNumbers (IndexData &indexData,
208 const UCArray &term,
209 unsigned long stemMethod,
210 vector<unsigned long> &equivWords) {
211 equivWords.erase (equivWords.begin(), equivWords.end());
212
213 if (stemMethod == 0 || stemMethod==4 || stemMethod==5) {
214 // don't need to stem the word,
215 // find the word number(s) for this term
216 unsigned long wordElNum = 0;
217 unsigned long numLevels = indexData.bdh.num_levels;
218 word_block_dict_el wordDictEl;
219 wordDictEl.SetNumLevels (numLevels);
220 if (stemMethod ==0) {
221 if (SearchWordBlockDictEl (indexData.dictFile, indexData.biWords,
222 indexData.bdh.entries_per_wblk,
223 indexData.bdh.word_dict_size,
224 numLevels, term, wordDictEl, wordElNum))
225 equivWords.push_back (wordElNum);
226
227 return;
228 } else {
229 // partial matching,
230 PartialMatchSearchWordBlockDictEl (indexData.dictFile, indexData.biWords, indexData.bdh.entries_per_wblk, indexData.bdh.word_dict_size, numLevels, term, wordDictEl, equivWords, (stemMethod==5?true:false) );
231 return;
232 }
233 }
234
235 // need to stem this word and find it in the blocked stem index
236
237 unsigned char mgWord[MAXSTEMLEN + 1];
238 UCArray stemTerm;
239 unsigned long stemmerNum = 0;
240 if (stemMethod == 1) stemmerNum = indexData.sih1.stemmer_num;
241 else if (stemMethod == 2) stemmerNum = indexData.sih2.stemmer_num;
242 else if (stemMethod == 3) stemmerNum = indexData.sih3.stemmer_num;
243
244 // convert the word to an "mg word"
245 mgWord[0] = term.size();
246 memcpy ((char *)&mgWord[1], &(term[0]), term.size());
247
248 // stem the word
249 stemmer (stemMethod, stemmerNum, mgWord);
250
251 // convert the result back to a UCArray
252 stemTerm.insert (stemTerm.end(), &mgWord[1], &mgWord[1] + mgWord[0]);
253
254 // need to look up this term in the appropriate dictionary
255 stem_block_dict_el stemDictEl;
256 unsigned long stemElNum;
257 bool result = false;
258 if (stemMethod == 1) {
259 result = SearchStemBlockDictEl (indexData.stem1File,
260 indexData.sii1,
261 indexData.sih1.entries_per_block,
262 indexData.sih1.dict_size,
263 stemTerm,
264 stemDictEl,
265 stemElNum);
266
267 } else if (stemMethod == 2) {
268 result = SearchStemBlockDictEl (indexData.stem2File,
269 indexData.sii2,
270 indexData.sih2.entries_per_block,
271 indexData.sih2.dict_size,
272 stemTerm,
273 stemDictEl,
274 stemElNum);
275
276 } else if (stemMethod == 3) {
277 result = SearchStemBlockDictEl (indexData.stem3File,
278 indexData.sii3,
279 indexData.sih3.entries_per_block,
280 indexData.sih3.dict_size,
281 stemTerm,
282 stemDictEl,
283 stemElNum);
284 }
285
286 if (result) {
287 equivWords = stemDictEl.equivWords;
288 }
289}
290
291
292
// Load the inverted-file posting list for word number termNum into
// fragData, optionally restricted to the fragment ranges given in
// fragLimits (pass NULL for no restriction).
//   needFragFreqs - if true, per-fragment frequencies are stored in
//                   fragData.fragFreqs alongside fragData.fragNums
//   termWord      - set to the dictionary text of the word
// If the word number cannot be found the function returns with
// fragData cleared and termWord untouched.
void ReadTermFragData (IndexData &indexData,
		       bool needFragFreqs,
		       unsigned long termNum,
		       FragData &fragData,
		       FragRangeArray *fragLimits,
		       UCArray & termWord) {
  fragData.Clear();

  // look up the word in the dictionary
  unsigned long numLevels = indexData.bdh.num_levels;
  word_block_dict_el wordDictEl;
  wordDictEl.SetNumLevels (numLevels);
  if (!SearchWordBlockDictElNum (indexData.dictFile,
				 indexData.biWords,
				 indexData.bdh.entries_per_wblk,
				 indexData.bdh.word_dict_size,
				 numLevels,
				 termNum, wordDictEl))
    return; // nothing more to do

  // number of matching documents at the current level
  fragData.matchDocs = wordDictEl.levelFreqs[indexData.curLevelNum];
  termWord = wordDictEl.el;
  // seek to the appropriate place in the inverted file
  fseek (indexData.invfFile, wordDictEl.invf_ptr, SEEK_SET);
  stdio_bitio_buffer buffer (indexData.invfFile);

  // Bblock decoding parameter for the delta-coded fragment numbers
  unsigned long B = BIO_Bblock_Init (indexData.bdh.num_frags,
				     wordDictEl.frag_occur);
  unsigned long fragNum = 0;   // running (decoded) fragment number
  unsigned long termFreq = 0;  // frequency within the current fragment

  unsigned long fragLimitI = 0;
  unsigned long i;
  for (i=0; i<wordDictEl.frag_occur; ++i) {
    // fragment numbers are stored as deltas; accumulate
    fragNum += buffer.bblock_decode (B, NULL);
    // word-level indexes store no per-fragment frequency: it is 1
    if (!indexData.ifh.word_level_index) termFreq = buffer.gamma_decode (NULL);
    else termFreq = 1;

    // get the right fragment range
    // (fragLimits is assumed sorted by rangeStart; advance until the
    // current range is the last one starting at or before fragNum)
    if (fragLimits != NULL) {
      while (fragLimitI+1 < (*fragLimits).size() &&
	     fragNum > (*fragLimits)[fragLimitI+1].rangeStart) {
	++fragLimitI;
      }
    }

    // add the entry if it is within the limits
    if ((fragLimits == NULL) ||
	(fragLimitI < (*fragLimits).size() &&
	 fragNum > (*fragLimits)[fragLimitI].rangeStart &&
	 fragNum <= (*fragLimits)[fragLimitI].rangeEnd)) {
      fragData.fragNums.push_back (fragNum);
      if (needFragFreqs)
	fragData.fragFreqs.push_back (termFreq);
    }
  }

  buffer.done();
}
352
353
// OR-combine two posting lists: outFragData receives the sorted
// union of the fragment numbers in f1 and f2.  When a fragment
// appears in both inputs its frequencies are summed.  Frequencies
// are only produced when needFragFreqs is true.  Both inputs are
// assumed sorted in ascending fragment-number order.
void CombineFragData (bool needFragFreqs,
		      const FragData &f1,
		      const FragData &f2,
		      FragData &outFragData) {
  outFragData.Clear();

  // the new number of matching documents is the maximum
  // of the two input matching number of documents -- it
  // is assumed that these are at the same document level
  outFragData.matchDocs = (f1.matchDocs > f2.matchDocs) ?
    f1.matchDocs : f2.matchDocs;

  // do or -- classic two-finger merge over the sorted inputs
  unsigned long f1I = 0, f1Size = f1.fragNums.size();
  unsigned long f2I = 0, f2Size = f2.fragNums.size();
  while (f1I < f1Size || f2I < f2Size) {
    if (f2I < f2Size &&
	(f1I >= f1Size ||
	 f1.fragNums[f1I] > f2.fragNums[f2I])) {
      // output f2I (f1 exhausted, or f2's fragment is smaller)
      outFragData.fragNums.push_back (f2.fragNums[f2I]);
      if (needFragFreqs)
	outFragData.fragFreqs.push_back (f2.fragFreqs[f2I]);
      ++f2I;

    } else if (f1I < f1Size &&
	       (f2I >= f2Size ||
		f1.fragNums[f1I] < f2.fragNums[f2I])) {
      // output f1I (f2 exhausted, or f1's fragment is smaller)
      outFragData.fragNums.push_back (f1.fragNums[f1I]);
      if (needFragFreqs)
	outFragData.fragFreqs.push_back (f1.fragFreqs[f1I]);
      ++f1I;

    } else {
      // must be equal combine f1I and f2I (frequencies are summed)
      outFragData.fragNums.push_back (f1.fragNums[f1I]);
      if (needFragFreqs)
	outFragData.fragFreqs.push_back (f1.fragFreqs[f1I]+f2.fragFreqs[f2I]);
      ++f1I;
      ++f2I;
    }
  }
}
398
399
// AND-combine comFragData into fragData in place, with a proximity
// window: a fragment f of fragData survives only if comFragData
// contains a fragment c with c+startRange < f <= c+endRange, and
// (when fragLimits is non-NULL) both fall inside the same fragment
// range.  Surviving entries are rewritten with the matching
// comFragData fragment number, and (when needFragFreqs) the minimum
// of the two frequencies.  Both inputs are assumed sorted ascending.
void AndCombineFragData (bool needFragFreqs,
			 FragData &fragData,
			 const FragData &comFragData,
			 signed long startRange,
			 signed long endRange,
			 const FragRangeArray *fragLimits) {
  // sanity check on range: normalise so startRange <= endRange
  if (startRange > endRange) {
    signed long temp = endRange;
    endRange = startRange;
    startRange = temp;
  }

  // get min matchdocs (an AND can match at most the smaller count)
  if (comFragData.matchDocs < fragData.matchDocs)
    fragData.matchDocs = comFragData.matchDocs;

  unsigned long fragDataI = 0;
  unsigned long fragDataSize = fragData.fragNums.size();
  unsigned long comFragDataI = 0;
  unsigned long comFragDataSize = comFragData.fragNums.size();
  unsigned long fragLimitI = 0;
  unsigned long fragLimitSize = (fragLimits==NULL) ? 0 : (*fragLimits).size();
  // outI trails fragDataI: results are compacted into the front of
  // fragData's own vectors, which is safe because outI <= fragDataI
  unsigned long outI = 0;

  while (fragDataI < fragDataSize &&
	 comFragDataI < comFragDataSize) {
    signed long fragNum = (signed long)fragData.fragNums[fragDataI];
    signed long comFragNum = (signed long)comFragData.fragNums[comFragDataI];

    // go to the right fragment limit (for the com frag)
    if (fragLimits != NULL) {
      while (fragLimitI+1 < fragLimitSize &&
	     comFragNum > (signed long)(*fragLimits)[fragLimitI+1].rangeStart) {
	++fragLimitI;
      }
    }

    if (fragNum <= comFragNum+startRange ||
	(fragLimits!=NULL &&
	 fragNum<=(signed long)(*fragLimits)[fragLimitI].rangeStart)) {
      // fragNum is below the window (or before the current range):
      // it can never match, drop it
      ++fragDataI;

    } else if (fragNum > comFragNum+endRange ||
	       (fragLimits!=NULL &&
		fragNum>(signed long)(*fragLimits)[fragLimitI].rangeEnd)) {
      // fragNum is beyond the window (or past the current range):
      // advance the comparison list
      ++comFragDataI;

    } else {
      // equal and within tag -- keep, using the com fragment number
      // and the minimum of the two frequencies
      fragData.fragNums[outI] = comFragNum;
      if (needFragFreqs) {
	fragData.fragFreqs[outI] =
	  (fragData.fragFreqs[fragDataI] < comFragData.fragFreqs[comFragDataI]) ?
	  fragData.fragFreqs[fragDataI] : comFragData.fragFreqs[comFragDataI];
      }
      ++fragDataI;
      ++comFragDataI;
      ++outI;
    }
  }

  // erase unused part of fragData
  fragData.fragNums.erase (fragData.fragNums.begin()+outI,
			   fragData.fragNums.end());
  if (needFragFreqs)
    fragData.fragFreqs.erase (fragData.fragFreqs.begin()+outI,
			      fragData.fragFreqs.end());
  else
    fragData.fragFreqs.erase (fragData.fragFreqs.begin(),
			      fragData.fragFreqs.end());
}
472
473
// Convert a term's fragment-level posting list into a document-level
// QueryResult (result is cleared first).  Consecutive fragments that
// map to the same document are merged; each emitted document gets a
// rank of Wqt * Wdt where Wqt = termWeight * log(N/ft) and
// Wdt = fdt * log(N/ft) (fdt = summed fragment frequencies in that
// document).  Ranks are only computed when sortByRank or
// needRankInfo is set.  When queryInfo.needTermFreqs is set, a
// TermFreqData record describing the term is appended to
// result.termFreqs.
void FragsToQueryResult (IndexData &indexData,
			 const QueryInfo &queryInfo,
			 const FragData &termData,
			 const UCArray &tag,
			 const UCArray &term,
			 unsigned long stemMethod,
			 unsigned long termWeight,
			 UCArrayVector &equivTerms,
			 QueryResult &result) {
  bool needRanks = (queryInfo.sortByRank || queryInfo.needRankInfo);

  result.Clear();

  // log (N / ft)
  unsigned long N = indexData.levels.levelInfo[indexData.curLevel].numEntries;
  float wordLog = log((double)N / (double)termData.matchDocs);

  // Wqt = fqt * log (N / ft)
  // note: terms are allowed to have a weight of zero so
  // they can be excluded from the ranking
  float Wqt = termWeight * wordLog;

  // Wdt = fdt * log (N / ft)
  float Wdt;

  unsigned long termDataI = 0;
  unsigned long termDataSize = termData.fragNums.size();
  unsigned long levelDocNum = 0;

  unsigned long termDocFreq = 0;      // fdt accumulator for current doc
  unsigned long lastLevelDocNum = 0;  // 0 means "no doc seen yet"
  unsigned long overallwordfreq = 0;

  while (termDataI < termDataSize) {
    // map the fragment number to a document at the current level;
    // fragments with no mapping are skipped
    if (indexData.levelConverter.FragToLevel (termData.fragNums[termDataI],
					      levelDocNum)) {
      if (levelDocNum != lastLevelDocNum) {
	// crossed a document boundary: flush the previous document
	if (lastLevelDocNum > 0) {
	  // add this doc information
	  if (needRanks) {
	    Wdt = termDocFreq * wordLog;
	    result.ranks.push_back (Wqt * Wdt);
	  }
	  result.docs.push_back (lastLevelDocNum);
	}

	lastLevelDocNum = levelDocNum;
	termDocFreq = 0;
      }

      // NOTE: overallwordfreq is only accumulated when ranks are
      // needed -- see the comment where it is stored below
      if (needRanks){
	termDocFreq += termData.fragFreqs[termDataI];
	overallwordfreq += termData.fragFreqs[termDataI];
      }
    }
    ++termDataI;
  }

  // flush the final document (the loop only flushes on boundaries)
  if (lastLevelDocNum > 0) {
    // add the last document information
    if (needRanks) {
      Wdt = termDocFreq * wordLog;
      result.ranks.push_back (Wqt * Wdt);
    }
    result.docs.push_back (lastLevelDocNum);
  }

  // add the term frequency information
  if (queryInfo.needTermFreqs) {
    TermFreqData termFreqData;
    termFreqData.tag = tag;
    termFreqData.term = term;
    termFreqData.stemMethod = stemMethod;
    termFreqData.equivTerms = equivTerms;
    termFreqData.matchDocs = termData.matchDocs;
    termFreqData.termFreq = overallwordfreq; // will be zero if needRankInfo
    //not true
    result.termFreqs.push_back (termFreqData);
  }
}
554
// AND a term's fragment-level posting list into an existing
// QueryResult in place: only documents already present in
// result.docs AND containing this term survive.  Surviving documents
// have Wqt * Wdt added to their existing rank (same weighting scheme
// as FragsToQueryResult).  result.docs must be sorted ascending, as
// produced by FragsToQueryResult.  When queryInfo.needTermFreqs is
// set, a TermFreqData record for this term is appended.
void AndFragsToQueryResult (IndexData &indexData,
			    const QueryInfo &queryInfo,
			    const FragData &termData,
			    const UCArray &tag,
			    const UCArray &term,
			    unsigned long stemMethod,
			    unsigned long termWeight,
			    UCArrayVector &equivTerms,
			    QueryResult &result) {
  bool needRanks = (queryInfo.sortByRank || queryInfo.needRankInfo);

  // log (N / ft)
  float wordLog =
    log((double)indexData.levels.levelInfo[indexData.curLevel].numEntries/
	(double)termData.matchDocs);

  // Wqt = fqt * log (N / ft)
  // note: terms are allowed to have a weight of zero so
  // they can be excluded from the ranking
  float Wqt = termWeight * wordLog;

  // Wdt = fdt * log (N / ft)
  float Wdt;

  unsigned long termDataI = 0;
  unsigned long termDataSize = termData.fragNums.size();
  unsigned long levelDocNum = 0;

  unsigned long termDocFreq = 0;      // fdt accumulator for current doc
  unsigned long lastLevelDocNum = 0;  // 0 means "no doc seen yet"
  unsigned long overallwordfreq = 0;
  unsigned long resultI = 0;          // read cursor into result.docs
  unsigned long resultSize = result.docs.size();
  // resultOutI trails resultI: surviving docs are compacted into the
  // front of result's own vectors (safe since resultOutI <= resultI)
  unsigned long resultOutI = 0;


  while (termDataI < termDataSize) {
    // map the fragment number to a document at the current level;
    // fragments with no mapping are skipped
    if (indexData.levelConverter.FragToLevel (termData.fragNums[termDataI],
					      levelDocNum)) {
      if (levelDocNum != lastLevelDocNum) {
	// crossed a document boundary: flush the previous document
	if (lastLevelDocNum > 0) {
	  // add this doc information
	  Wdt = termDocFreq * wordLog;

	  // find this document number (both lists are sorted)
	  while (resultI < resultSize &&
		 result.docs[resultI] < lastLevelDocNum)
	    ++resultI;

	  // store the result only if the doc was already present
	  if (resultI < resultSize && result.docs[resultI] == lastLevelDocNum) {
	    result.docs[resultOutI] = lastLevelDocNum;
	    if (needRanks)
	      result.ranks[resultOutI] = result.ranks[resultI] + Wqt * Wdt;
	    ++resultI;
	    ++resultOutI;
	  }
	}

	lastLevelDocNum = levelDocNum;
	termDocFreq = 0;
      }

      if (needRanks)
	termDocFreq += termData.fragFreqs[termDataI];
      // unlike FragsToQueryResult, the overall frequency is counted
      // regardless of needRanks here
      overallwordfreq += termData.fragFreqs[termDataI];
    }

    ++termDataI;
  } // while

  // flush the final document (the loop only flushes on boundaries)
  if (lastLevelDocNum > 0) {
    // add the last document information
    Wdt = termDocFreq * wordLog;

    // find this document number
    while (resultI < resultSize &&
	   result.docs[resultI] < lastLevelDocNum)
      ++resultI;

    // store the result
    if (resultI < resultSize && result.docs[resultI] == lastLevelDocNum) {
      result.docs[resultOutI] = lastLevelDocNum;
      if (needRanks)
	result.ranks[resultOutI] = result.ranks[resultI] + Wqt * Wdt;
      ++resultI;
      ++resultOutI;
    }
  }

  // remove unneeded entries (docs that did not contain this term)
  result.docs.erase (result.docs.begin()+resultOutI, result.docs.end());
  if (needRanks)
    result.ranks.erase (result.ranks.begin()+resultOutI, result.ranks.end());
  else
    result.ranks.erase (result.ranks.begin(), result.ranks.end());

  // add the term frequency information
  if (queryInfo.needTermFreqs) {
    TermFreqData termFreqData;
    termFreqData.tag = tag;
    termFreqData.term = term;
    termFreqData.stemMethod = stemMethod;
    termFreqData.equivTerms = equivTerms;
    termFreqData.matchDocs = termData.matchDocs;
    termFreqData.termFreq = overallwordfreq;
    result.termFreqs.push_back (termFreqData);
  }
}
664
665
// Filter result in place, keeping only the documents that also
// appear in termData's fragment list (mapped to the current level).
// Ranks of surviving documents are kept unchanged.  result.docs must
// be sorted ascending.  Unlike AndFragsToQueryResult, no rank
// contribution is added -- this is a pure intersection.
void RemoveUnwantedResults (IndexData &indexData,
			    const QueryInfo &queryInfo,
			    const FragData &termData,
			    QueryResult &result) {
  bool needRanks = (queryInfo.sortByRank || queryInfo.needRankInfo);

  unsigned long termDataI = 0;
  unsigned long termDataSize = termData.fragNums.size();
  unsigned long levelDocNum = 0;

  unsigned long lastLevelDocNum = 0;  // 0 means "no doc seen yet"

  unsigned long resultI = 0;          // read cursor into result.docs
  unsigned long resultSize = result.docs.size();
  // resultOutI trails resultI: surviving docs are compacted into the
  // front of result's own vectors (safe since resultOutI <= resultI)
  unsigned long resultOutI = 0;

  while (termDataI < termDataSize) {
    // map the fragment number to a document at the current level;
    // fragments with no mapping are skipped
    if (indexData.levelConverter.FragToLevel (termData.fragNums[termDataI],
					      levelDocNum)) {
      if (levelDocNum != lastLevelDocNum) {
	// crossed a document boundary: process the previous document
	if (lastLevelDocNum > 0) {
	  // find this document number (both lists are sorted)
	  while (resultI < resultSize &&
		 result.docs[resultI] < lastLevelDocNum)
	    ++resultI;

	  // store the result only if the doc was already present
	  if (resultI < resultSize && result.docs[resultI] == lastLevelDocNum) {
	    result.docs[resultOutI] = lastLevelDocNum;
	    if (needRanks)
	      result.ranks[resultOutI] = result.ranks[resultI];
	    ++resultI;
	    ++resultOutI;
	  }
	}

	lastLevelDocNum = levelDocNum;
      }
    }

    ++termDataI;
  }

  // process the final document (the loop only acts on boundaries)
  if (lastLevelDocNum > 0) {
    // find this document number
    while (resultI < resultSize &&
	   result.docs[resultI] < lastLevelDocNum)
      ++resultI;

    // store the result
    if (resultI < resultSize && result.docs[resultI] == lastLevelDocNum) {
      result.docs[resultOutI] = lastLevelDocNum;
      if (needRanks)
	result.ranks[resultOutI] = result.ranks[resultI];
      ++resultI;
      ++resultOutI;
    }
  }

  // remove unneeded entries (docs that did not contain the term)
  result.docs.erase (result.docs.begin()+resultOutI, result.docs.end());
  if (needRanks)
    result.ranks.erase (result.ranks.begin()+resultOutI, result.ranks.end());
  else
    result.ranks.erase (result.ranks.begin(), result.ranks.end());
}
732
733
734
735//--------------------------------------------------------------
736// functions to support full text browse
737
738void FindNearestWordNumber (IndexData &indexData,
739 const UCArray &term,
740 unsigned long &number) {
741
742 // find the word number for this term
743 unsigned long wordElNum = 0;
744 unsigned long numLevels = indexData.bdh.num_levels;
745 word_block_dict_el wordDictEl;
746 wordDictEl.SetNumLevels (numLevels);
747 if (NearestSearchWordBlockDictEl (indexData.dictFile, indexData.biWords,
748 indexData.bdh.entries_per_wblk,
749 indexData.bdh.word_dict_size,
750 numLevels, term, wordDictEl, wordElNum))
751 number = wordElNum;
752
753}
754
755void GetTermList(IndexData &indexData,
756 unsigned long startTerm,
757 unsigned long numTerms,
758 TermFreqArray &terms) {
759
760 word_block_dict_el_array wordBlocks; // = new word_block_dict_el_array();
761 TermFreqData termdata;
762
763 terms.erase(terms.begin(), terms.end());
764
765 SearchWordBlockDictElNumRange (indexData.dictFile, indexData.biWords,
766 indexData.bdh.entries_per_wblk,
767 indexData.bdh.word_dict_size,
768 indexData.bdh.num_levels, startTerm,
769 numTerms, wordBlocks);
770
771 word_block_dict_el_array::iterator here = wordBlocks.begin();
772 word_block_dict_el_array::iterator end = wordBlocks.end();
773
774 while (here != end) {
775 termdata.Clear();
776 termdata.term = (*here).el;
777 termdata.termFreq = (*here).freq;
778 terms.push_back(termdata);
779 ++here;
780 }
781
782}
783
// Thin wrapper: fetch numTerms dictionary terms starting at word
// number startTerm, returning just the term strings in terms (no
// frequency information).  The underlying search routine populates
// the output vector directly.
void GetTermList(IndexData &indexData,
		 unsigned long startTerm,
		 unsigned long numTerms,
		 UCArrayVector &terms) {



  SearchWordBlockDictElNumRange (indexData.dictFile, indexData.biWords,
				 indexData.bdh.entries_per_wblk,
				 indexData.bdh.word_dict_size,
				 indexData.bdh.num_levels, startTerm,
				 numTerms, terms);

}
Note: See TracBrowser for help on using the repository browser.