source: main/branches/64_bit_Greenstone/greenstone2/common-src/indexers/mgpp/text/Terms.cpp@ 23508

Last change on this file since 23508 was 23508, checked in by sjm84, 13 years ago

Committing 64 bit changes into the branch

  • Property svn:keywords set to Author Date Id Revision
File size: 21.7 KB
/**************************************************************************
 *
 * Terms.cpp -- Query related functions
 * Copyright (C) 1999 Rodger McNab
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 **************************************************************************/

#include "Terms.h"
#include "words.h"
#include "stemmer.h"
#include "bitio_gen.h"
#include "bitio_m_stdio.h"

void QueryInfo::Clear () {
  UCArrayClear (docLevel);
  maxDocs = 0;
  sortByRank = true;
  exactWeights = false;
  needRankInfo = false;
  needTermFreqs = false;
}


void TermFreqData::Clear () {
  UCArrayClear (tag);
  UCArrayClear (term);
  equivTerms.erase (equivTerms.begin(), equivTerms.end());
  stemMethod = 0;
  matchDocs = 0;
  termFreq = 0;
}

ostream &operator<< (ostream &s, const TermFreqData &t) {
  s << "<" << t.tag << ">\"" << t.term << "\"stem("
    << t.stemMethod << ")equiv terms(";

  mg_u_long i;
  for (i=0; i<t.equivTerms.size(); ++i) {
    s << t.equivTerms[i] << ", ";
  }
  s << ")docs(" << t.matchDocs << ")"
    << "count(" << t.termFreq << ")";
  return s;
}
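
// Example output (added for illustration, not in the original source):
// for tag "TX", term "comput", stem method 3 and equivalent terms
// "computer" and "computing", the operator above prints roughly:
//   <TX>"comput"stem(3)equiv terms(computer, computing, )docs(12)count(57)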

bool operator== (const TermFreqData &t1, const TermFreqData &t2) {
  return ((t1.tag == t2.tag) &&
          (t1.term == t2.term) &&
          (t1.stemMethod == t2.stemMethod) &&
          (t1.equivTerms == t2.equivTerms) &&
          (t1.matchDocs == t2.matchDocs) &&
          (t1.termFreq == t2.termFreq));
}


void QueryResult::Clear () {
  docs.erase (docs.begin(), docs.end());
  ranks.erase (ranks.begin(), ranks.end());
  termFreqs.erase (termFreqs.begin(), termFreqs.end());
  actualNumDocs = 0;
}

QueryResult::QueryResult () {
  Clear ();
}

void QueryResult::printShort (ostream &s) {
  s << "termFreqs: ";
  for (mg_u_long i=0; i<termFreqs.size(); ++i)
    s << termFreqs[i] << ", ";

  s << "\nactual number of docs found: " << actualNumDocs;
  s << "\n\n";
}


ostream &operator<< (ostream &s, const QueryResult &r) {
  s << "docs: ";
  mg_u_long i;
  for (i=0; i<r.docs.size(); ++i)
    s << r.docs[i] << ", ";

  s << "\nranks: ";
  for (i=0; i<r.ranks.size(); ++i)
    s << r.ranks[i] << ", ";

  s << "\ntermFreqs: ";
  for (i=0; i<r.termFreqs.size(); ++i)
    s << r.termFreqs[i] << ", ";

  s << "\nactual number of docs found: " << r.actualNumDocs;
  s << "\n\n";

  return s;
}


bool operator== (const QueryResult &r1, const QueryResult &r2) {
  return ((r1.docs == r2.docs) &&
          (r1.ranks == r2.ranks) &&
          (r1.termFreqs == r2.termFreqs) &&
          (r1.actualNumDocs == r2.actualNumDocs));
}

//---------------------------------------------------
// new ExtQueryResult stuff
void ExtQueryResult::Clear () {
  docs.erase (docs.begin(), docs.end());
  levels.erase (levels.begin(), levels.end());
  ranks.erase (ranks.begin(), ranks.end());
  termFreqs.erase (termFreqs.begin(), termFreqs.end());
  actualNumDocs = 0;
}

ExtQueryResult::ExtQueryResult () {
  Clear ();
}

ostream &operator<< (ostream &s, const ExtQueryResult &r) {
  s << "docs: ";
  mg_u_long i;
  for (i=0; i<r.docs.size(); ++i)
    s << r.docs[i] << ", ";

  s << "\nlevels: ";
  for (i=0; i<r.levels.size(); ++i)
    s << r.levels[i] << ", ";

  s << "\nranks: ";
  for (i=0; i<r.ranks.size(); ++i)
    s << r.ranks[i] << ", ";

  s << "\ntermFreqs: ";
  for (i=0; i<r.termFreqs.size(); ++i)
    s << r.termFreqs[i] << ", ";

  s << "\nactual number of docs found: " << r.actualNumDocs;
  s << "\n\n";

  return s;
}


bool operator== (const ExtQueryResult &r1, const ExtQueryResult &r2) {
  return ((r1.docs == r2.docs) &&
          (r1.levels == r2.levels) &&
          (r1.ranks == r2.ranks) &&
          (r1.termFreqs == r2.termFreqs) &&
          (r1.actualNumDocs == r2.actualNumDocs));
}

//-------------------------------------------------------
// new BrowseQueryResult stuff
void BrowseQueryResult::Clear () {
  termFreqs.erase (termFreqs.begin(), termFreqs.end());
}

BrowseQueryResult::BrowseQueryResult () {
  Clear ();
}


ostream &operator<< (ostream &s, const BrowseQueryResult &r) {
  s << "terms: ";
  mg_u_long i;
  for (i=0; i<r.termFreqs.size(); ++i)
    s << r.termFreqs[i] << ", ";
  s << "\n\n";
  return s;
}


bool operator== (const BrowseQueryResult &r1, const BrowseQueryResult &r2) {
  return (r1.termFreqs == r2.termFreqs);
}


//--------------------------------------
void FragData::Clear () {
  matchDocs = 0;
  fragNums.erase (fragNums.begin(), fragNums.end());
  fragFreqs.erase (fragFreqs.begin(), fragFreqs.end());
}
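
// Note added for clarity (not in the original source): fragNums and
// fragFreqs are parallel arrays -- fragFreqs[i] holds the within-fragment
// frequency of the term for fragment fragNums[i], and fragNums is kept in
// ascending order, which the merge routines below rely on.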


void FindWordNumbers (IndexData &indexData,
                      const UCArray &term,
                      mg_u_long stemMethod,
                      vector<mg_u_long> &equivWords) {
  equivWords.erase (equivWords.begin(), equivWords.end());

  // if the specified stem method is not a valid one (i.e. no appropriate
  // stem index was built), fall back to no stemming -- unless we have
  // partial matching, in which case stem indexes are not used anyway
  // (the stemMethod != 0 test guards the stemFile[stemMethod-1] lookup)
  if (stemMethod != 0 && !(stemMethod & STEM_PARTIAL_MATCH) &&
      indexData.stemFile[stemMethod-1] == NULL) {
    cerr << "Stem index for method " << stemMethod
         << " was not built, so not doing stemming\n";
    stemMethod = 0;
  }

  /* [JFG - Mar 06: Accent folding patch] */
  /* use flag PARTIAL_MATCH */
  if (stemMethod == 0 || (stemMethod & STEM_PARTIAL_MATCH)) {
    // don't need to stem the word,
    // find the word number(s) for this term
    mg_u_long wordElNum = 0;
    mg_u_long numLevels = indexData.bdh.num_levels;
    word_block_dict_el wordDictEl;
    wordDictEl.SetNumLevels (numLevels);
    if (stemMethod == 0) {
      if (SearchWordBlockDictEl (indexData.dictFile, indexData.biWords,
                                 indexData.bdh.entries_per_wblk,
                                 indexData.bdh.word_dict_size,
                                 numLevels, term, wordDictEl, wordElNum))
        equivWords.push_back (wordElNum);

      return;
    } else {
      // partial matching
      PartialMatchSearchWordBlockDictEl (indexData.dictFile, indexData.biWords,
                                         indexData.bdh.entries_per_wblk,
                                         indexData.bdh.word_dict_size,
                                         numLevels, term, wordDictEl, equivWords,
                                         (stemMethod & STEM_CaseFolding) ? true : false);
      // TODO: accent folding is not handled here!!
      return;
    }
  }

  // need to stem this word and find it in the blocked stem index
  unsigned char mgWord[MAXSTEMLEN + 1];
  UCArray stemTerm;
  mg_u_long stemmerNum = 0;

  /* [JFG - Mar 06: Accent folding patch] */
  if (stemMethod > STEM_MAX) {
    // TODO: should report an error here
    return;
  }

  stemmerNum = indexData.sih[stemMethod-1].stemmer_num;

  // convert the word to an "mg word" (length byte followed by the characters)
  mgWord[0] = term.size();
  memcpy ((char *)&mgWord[1], &(term[0]), term.size());

  // stem the word
  mgpp_stemmer (stemMethod, stemmerNum, mgWord);
  // convert the result back to a UCArray
  stemTerm.insert (stemTerm.end(), &mgWord[1], &mgWord[1] + mgWord[0]);

  // need to look up this term in the appropriate dictionary
  stem_block_dict_el stemDictEl;
  mg_u_long stemElNum;
  bool result = false;

  /* [JFG - Mar 06: Accent folding patch] */
  result = SearchStemBlockDictEl (indexData.stemFile[stemMethod-1],
                                  indexData.sii[stemMethod-1],
                                  indexData.sih[stemMethod-1].entries_per_block,
                                  indexData.sih[stemMethod-1].dict_size,
                                  stemTerm,
                                  stemDictEl,
                                  stemElNum);

  if (result) {
    equivWords = stemDictEl.equivWords;
  }
}
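
// Illustrative usage (added for this edit, not in the original source;
// assumes an IndexData already loaded elsewhere, and mgpp's UCArray
// helper SetCStr):
//
//   UCArray term;
//   SetCStr (term, "comput");
//   vector<mg_u_long> equivWords;
//   FindWordNumbers (indexData, term, STEM_CaseFolding, equivWords);
//   // equivWords now holds one word number per matching dictionary entry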



void ReadTermFragData (IndexData &indexData,
                       bool needFragFreqs,
                       mg_u_long termNum,
                       FragData &fragData,
                       FragRangeArray *fragLimits,
                       UCArray &termWord) {
  fragData.Clear();

  // look up the word in the dictionary
  mg_u_long numLevels = indexData.bdh.num_levels;
  word_block_dict_el wordDictEl;
  wordDictEl.SetNumLevels (numLevels);
  if (!SearchWordBlockDictElNum (indexData.dictFile,
                                 indexData.biWords,
                                 indexData.bdh.entries_per_wblk,
                                 indexData.bdh.word_dict_size,
                                 numLevels,
                                 termNum, wordDictEl))
    return; // nothing more to do

  fragData.matchDocs = wordDictEl.levelFreqs[indexData.curLevelNum];
  termWord = wordDictEl.el;

  // seek to the appropriate place in the inverted file
  fseek (indexData.invfFile, wordDictEl.invf_ptr, SEEK_SET);
  stdio_bitio_buffer buffer (indexData.invfFile);

  mg_u_long B = BIO_Bblock_Init (indexData.bdh.num_frags,
                                 wordDictEl.frag_occur);
  mg_u_long fragNum = 0;
  mg_u_long termFreq = 0;

  mg_u_long fragLimitI = 0;
  mg_u_long i;
  for (i=0; i<wordDictEl.frag_occur; ++i) {
    // fragment numbers are stored as gaps, so each decoded value is
    // added to the running fragNum
    fragNum += buffer.bblock_decode (B, NULL);
    if (!indexData.ifh.word_level_index) termFreq = buffer.gamma_decode (NULL);
    else termFreq = 1;

    // get the right fragment range
    if (fragLimits != NULL) {
      while (fragLimitI+1 < (*fragLimits).size() &&
             fragNum > (*fragLimits)[fragLimitI+1].rangeStart) {
        ++fragLimitI;
      }
    }

    // add the entry if it is within the limits
    if ((fragLimits == NULL) ||
        (fragLimitI < (*fragLimits).size() &&
         fragNum > (*fragLimits)[fragLimitI].rangeStart &&
         fragNum <= (*fragLimits)[fragLimitI].rangeEnd)) {
      fragData.fragNums.push_back (fragNum);
      if (needFragFreqs)
        fragData.fragFreqs.push_back (termFreq);
    }
  }

  buffer.done();
}
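
// Worked example (added for illustration, not in the original source):
// if a term occurs in fragments 4, 7 and 15, the inverted file stores the
// gaps 4, 3 and 8, and the loop above reconstructs 4, 7, 15 by summing.
// With a word-level index each posting counts once (termFreq = 1);
// otherwise the per-fragment frequency follows each gap as a gamma code.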


void CombineFragData (bool needFragFreqs,
                      const FragData &f1,
                      const FragData &f2,
                      FragData &outFragData) {
  outFragData.Clear();

  // the new number of matching documents is the maximum
  // of the two input matching number of documents -- it
  // is assumed that these are at the same document level
  outFragData.matchDocs = (f1.matchDocs > f2.matchDocs) ?
    f1.matchDocs : f2.matchDocs;

  // do or
  mg_u_long f1I = 0, f1Size = f1.fragNums.size();
  mg_u_long f2I = 0, f2Size = f2.fragNums.size();
  while (f1I < f1Size || f2I < f2Size) {
    if (f2I < f2Size &&
        (f1I >= f1Size ||
         f1.fragNums[f1I] > f2.fragNums[f2I])) {
      // output f2I
      outFragData.fragNums.push_back (f2.fragNums[f2I]);
      if (needFragFreqs)
        outFragData.fragFreqs.push_back (f2.fragFreqs[f2I]);
      ++f2I;

    } else if (f1I < f1Size &&
               (f2I >= f2Size ||
                f1.fragNums[f1I] < f2.fragNums[f2I])) {
      // output f1I
      outFragData.fragNums.push_back (f1.fragNums[f1I]);
      if (needFragFreqs)
        outFragData.fragFreqs.push_back (f1.fragFreqs[f1I]);
      ++f1I;

    } else {
      // must be equal: combine f1I and f2I
      outFragData.fragNums.push_back (f1.fragNums[f1I]);
      if (needFragFreqs)
        outFragData.fragFreqs.push_back (f1.fragFreqs[f1I] + f2.fragFreqs[f2I]);
      ++f1I;
      ++f2I;
    }
  }
}
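
// Worked example (added for illustration, not in the original source):
// merging f1 with fragNums {2, 5, 9} and f2 with fragNums {5, 7} yields
// fragNums {2, 5, 7, 9}; with needFragFreqs set, fragment 5's frequency
// is the sum of the two inputs and the rest are copied through unchanged.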


void AndCombineFragData (bool needFragFreqs,
                         FragData &fragData,
                         const FragData &comFragData,
                         mg_s_long startRange,
                         mg_s_long endRange,
                         const FragRangeArray *fragLimits) {
  // sanity check on range
  if (startRange > endRange) {
    mg_s_long temp = endRange;
    endRange = startRange;
    startRange = temp;
  }

  // get min matchdocs
  if (comFragData.matchDocs < fragData.matchDocs)
    fragData.matchDocs = comFragData.matchDocs;

  mg_u_long fragDataI = 0;
  mg_u_long fragDataSize = fragData.fragNums.size();
  mg_u_long comFragDataI = 0;
  mg_u_long comFragDataSize = comFragData.fragNums.size();
  mg_u_long fragLimitI = 0;
  mg_u_long fragLimitSize = (fragLimits==NULL) ? 0 : (*fragLimits).size();
  mg_u_long outI = 0;

  while (fragDataI < fragDataSize &&
         comFragDataI < comFragDataSize) {
    mg_s_long fragNum = (mg_s_long)fragData.fragNums[fragDataI];
    mg_s_long comFragNum = (mg_s_long)comFragData.fragNums[comFragDataI];

    // go to the right fragment limit (for the com frag)
    if (fragLimits != NULL) {
      while (fragLimitI+1 < fragLimitSize &&
             comFragNum > (mg_s_long)(*fragLimits)[fragLimitI+1].rangeStart) {
        ++fragLimitI;
      }
    }

    if (fragNum <= comFragNum+startRange ||
        (fragLimits!=NULL &&
         fragNum<=(mg_s_long)(*fragLimits)[fragLimitI].rangeStart)) {
      ++fragDataI;

    } else if (fragNum > comFragNum+endRange ||
               (fragLimits!=NULL &&
                fragNum>(mg_s_long)(*fragLimits)[fragLimitI].rangeEnd)) {
      ++comFragDataI;

    } else {
      // equal and within tag
      fragData.fragNums[outI] = comFragNum;
      if (needFragFreqs) {
        fragData.fragFreqs[outI] =
          (fragData.fragFreqs[fragDataI] < comFragData.fragFreqs[comFragDataI]) ?
          fragData.fragFreqs[fragDataI] : comFragData.fragFreqs[comFragDataI];
      }
      ++fragDataI;
      ++comFragDataI;
      ++outI;
    }
  }

  // erase unused part of fragData
  fragData.fragNums.erase (fragData.fragNums.begin()+outI,
                           fragData.fragNums.end());
  if (needFragFreqs)
    fragData.fragFreqs.erase (fragData.fragFreqs.begin()+outI,
                              fragData.fragFreqs.end());
  else
    fragData.fragFreqs.erase (fragData.fragFreqs.begin(),
                              fragData.fragFreqs.end());
}
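
// Note added for illustration (not in the original source): the
// [startRange, endRange] window implements proximity -- an entry of
// fragData survives only if some comFragNum satisfies
// comFragNum + startRange < fragNum <= comFragNum + endRange. For
// example, startRange = -1 and endRange = 0 keeps only fragments where
// both terms occur (fragNum == comFragNum); the survivors are compacted
// in place and the leftover tail is erased.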


void FragsToQueryResult (IndexData &indexData,
                         const QueryInfo &queryInfo,
                         const FragData &termData,
                         const UCArray &tag,
                         const UCArray &term,
                         mg_u_long stemMethod,
                         mg_u_long termWeight,
                         UCArrayVector &equivTerms,
                         QueryResult &result) {
  bool needRanks = (queryInfo.sortByRank || queryInfo.needRankInfo);

  result.Clear();

  // log (N / ft)
  mg_u_long N = indexData.levels.levelInfo[indexData.curLevel].numEntries;
  float wordLog = log((double)N / (double)termData.matchDocs);

  // Wqt = fqt * log (N / ft)
  // note: terms are allowed to have a weight of zero so
  // they can be excluded from the ranking
  float Wqt = termWeight * wordLog;

  // Wdt = fdt * log (N / ft)
  float Wdt;
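
  // Worked example (added for illustration, not in the original source):
  // with N = 1000 documents at this level and the term matching ft = 100
  // of them, wordLog = log(10) ~= 2.30; a query weight fqt = 1 gives
  // Wqt ~= 2.30, and a document containing the term fdt = 3 times gets
  // Wdt ~= 6.91, contributing Wqt * Wdt ~= 15.9 to its rank below.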

  mg_u_long termDataI = 0;
  mg_u_long termDataSize = termData.fragNums.size();
  mg_u_long levelDocNum = 0;

  mg_u_long termDocFreq = 0;
  mg_u_long lastLevelDocNum = 0;
  mg_u_long overallwordfreq = 0;

  while (termDataI < termDataSize) {
    if (indexData.levelConverter.FragToLevel (termData.fragNums[termDataI],
                                              levelDocNum)) {
      if (levelDocNum != lastLevelDocNum) {
        if (lastLevelDocNum > 0) {
          // add this doc information
          if (needRanks) {
            Wdt = termDocFreq * wordLog;
            result.ranks.push_back (Wqt * Wdt);
          }
          result.docs.push_back (lastLevelDocNum);
        }

        lastLevelDocNum = levelDocNum;
        termDocFreq = 0;
      }

      if (needRanks) {
        termDocFreq += termData.fragFreqs[termDataI];
        overallwordfreq += termData.fragFreqs[termDataI];
      }
    }
    ++termDataI;
  }

  if (lastLevelDocNum > 0) {
    // add the last document information
    if (needRanks) {
      Wdt = termDocFreq * wordLog;
      result.ranks.push_back (Wqt * Wdt);
    }
    result.docs.push_back (lastLevelDocNum);
  }

  // add the term frequency information
  if (queryInfo.needTermFreqs) {
    TermFreqData termFreqData;
    termFreqData.tag = tag;
    termFreqData.term = term;
    termFreqData.stemMethod = stemMethod;
    termFreqData.equivTerms = equivTerms;
    termFreqData.matchDocs = termData.matchDocs;
    termFreqData.termFreq = overallwordfreq; // will be zero if needRanks is false
    result.termFreqs.push_back (termFreqData);
  }
}

void AndFragsToQueryResult (IndexData &indexData,
                            const QueryInfo &queryInfo,
                            const FragData &termData,
                            const UCArray &tag,
                            const UCArray &term,
                            mg_u_long stemMethod,
                            mg_u_long termWeight,
                            UCArrayVector &equivTerms,
                            QueryResult &result) {
  bool needRanks = (queryInfo.sortByRank || queryInfo.needRankInfo);

  // log (N / ft)
  float wordLog =
    log((double)indexData.levels.levelInfo[indexData.curLevel].numEntries /
        (double)termData.matchDocs);

  // Wqt = fqt * log (N / ft)
  // note: terms are allowed to have a weight of zero so
  // they can be excluded from the ranking
  float Wqt = termWeight * wordLog;

  // Wdt = fdt * log (N / ft)
  float Wdt;

  mg_u_long termDataI = 0;
  mg_u_long termDataSize = termData.fragNums.size();
  mg_u_long levelDocNum = 0;

  mg_u_long termDocFreq = 0;
  mg_u_long lastLevelDocNum = 0;
  mg_u_long overallwordfreq = 0;
  mg_u_long resultI = 0;
  mg_u_long resultSize = result.docs.size();
  mg_u_long resultOutI = 0;


  while (termDataI < termDataSize) {
    if (indexData.levelConverter.FragToLevel (termData.fragNums[termDataI],
                                              levelDocNum)) {
      if (levelDocNum != lastLevelDocNum) {
        if (lastLevelDocNum > 0) {
          // add this doc information
          Wdt = termDocFreq * wordLog;

          // find this document number
          while (resultI < resultSize &&
                 result.docs[resultI] < lastLevelDocNum)
            ++resultI;

          // store the result
          if (resultI < resultSize && result.docs[resultI] == lastLevelDocNum) {
            result.docs[resultOutI] = lastLevelDocNum;
            if (needRanks)
              result.ranks[resultOutI] = result.ranks[resultI] + Wqt * Wdt;
            ++resultI;
            ++resultOutI;
          }
        }

        lastLevelDocNum = levelDocNum;
        termDocFreq = 0;
      }

      if (needRanks)
        termDocFreq += termData.fragFreqs[termDataI];
      overallwordfreq += termData.fragFreqs[termDataI];
    }

    ++termDataI;
  } // while

  if (lastLevelDocNum > 0) {
    // add the last document information
    Wdt = termDocFreq * wordLog;

    // find this document number
    while (resultI < resultSize &&
           result.docs[resultI] < lastLevelDocNum)
      ++resultI;

    // store the result
    if (resultI < resultSize && result.docs[resultI] == lastLevelDocNum) {
      result.docs[resultOutI] = lastLevelDocNum;
      if (needRanks)
        result.ranks[resultOutI] = result.ranks[resultI] + Wqt * Wdt;
      ++resultI;
      ++resultOutI;
    }
  }

  // remove unneeded entries
  result.docs.erase (result.docs.begin()+resultOutI, result.docs.end());
  if (needRanks)
    result.ranks.erase (result.ranks.begin()+resultOutI, result.ranks.end());
  else
    result.ranks.erase (result.ranks.begin(), result.ranks.end());

  // add the term frequency information
  if (queryInfo.needTermFreqs) {
    TermFreqData termFreqData;
    termFreqData.tag = tag;
    termFreqData.term = term;
    termFreqData.stemMethod = stemMethod;
    termFreqData.equivTerms = equivTerms;
    termFreqData.matchDocs = termData.matchDocs;
    termFreqData.termFreq = overallwordfreq;
    result.termFreqs.push_back (termFreqData);
  }
}
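
// Note added for illustration (not in the original source): unlike
// FragsToQueryResult, this routine intersects with an existing result in
// place -- a document survives only if it already appears in result.docs,
// and its rank is incremented by this term's Wqt * Wdt. Survivors are
// compacted to the front of the arrays and the tail erased, so calling it
// once per further query term yields a ranked boolean AND.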


void RemoveUnwantedResults (IndexData &indexData,
                            const QueryInfo &queryInfo,
                            const FragData &termData,
                            QueryResult &result) {
  bool needRanks = (queryInfo.sortByRank || queryInfo.needRankInfo);

  mg_u_long termDataI = 0;
  mg_u_long termDataSize = termData.fragNums.size();
  mg_u_long levelDocNum = 0;

  mg_u_long lastLevelDocNum = 0;

  mg_u_long resultI = 0;
  mg_u_long resultSize = result.docs.size();
  mg_u_long resultOutI = 0;

  while (termDataI < termDataSize) {
    if (indexData.levelConverter.FragToLevel (termData.fragNums[termDataI],
                                              levelDocNum)) {
      if (levelDocNum != lastLevelDocNum) {
        if (lastLevelDocNum > 0) {
          // find this document number
          while (resultI < resultSize &&
                 result.docs[resultI] < lastLevelDocNum)
            ++resultI;

          // store the result
          if (resultI < resultSize && result.docs[resultI] == lastLevelDocNum) {
            result.docs[resultOutI] = lastLevelDocNum;
            if (needRanks)
              result.ranks[resultOutI] = result.ranks[resultI];
            ++resultI;
            ++resultOutI;
          }
        }

        lastLevelDocNum = levelDocNum;
      }
    }

    ++termDataI;
  }

  if (lastLevelDocNum > 0) {
    // find this document number
    while (resultI < resultSize &&
           result.docs[resultI] < lastLevelDocNum)
      ++resultI;

    // store the result
    if (resultI < resultSize && result.docs[resultI] == lastLevelDocNum) {
      result.docs[resultOutI] = lastLevelDocNum;
      if (needRanks)
        result.ranks[resultOutI] = result.ranks[resultI];
      ++resultI;
      ++resultOutI;
    }
  }

  // remove unneeded entries
  result.docs.erase (result.docs.begin()+resultOutI, result.docs.end());
  if (needRanks)
    result.ranks.erase (result.ranks.begin()+resultOutI, result.ranks.end());
  else
    result.ranks.erase (result.ranks.begin(), result.ranks.end());
}


//--------------------------------------------------------------
// functions to support full text browse

void FindNearestWordNumber (IndexData &indexData,
                            const UCArray &term,
                            mg_u_long &number) {
  // find the word number for this term
  mg_u_long wordElNum = 0;
  mg_u_long numLevels = indexData.bdh.num_levels;
  word_block_dict_el wordDictEl;
  wordDictEl.SetNumLevels (numLevels);
  if (NearestSearchWordBlockDictEl (indexData.dictFile, indexData.biWords,
                                    indexData.bdh.entries_per_wblk,
                                    indexData.bdh.word_dict_size,
                                    numLevels, term, wordDictEl, wordElNum))
    number = wordElNum;
}

void GetTermList (IndexData &indexData,
                  mg_u_long startTerm,
                  mg_u_long numTerms,
                  TermFreqArray &terms) {
  word_block_dict_el_array wordBlocks;
  TermFreqData termdata;

  terms.erase (terms.begin(), terms.end());

  SearchWordBlockDictElNumRange (indexData.dictFile, indexData.biWords,
                                 indexData.bdh.entries_per_wblk,
                                 indexData.bdh.word_dict_size,
                                 indexData.bdh.num_levels, startTerm,
                                 numTerms, wordBlocks);

  word_block_dict_el_array::iterator here = wordBlocks.begin();
  word_block_dict_el_array::iterator end = wordBlocks.end();

  while (here != end) {
    termdata.Clear();
    termdata.term = (*here).el;
    termdata.termFreq = (*here).freq;
    terms.push_back (termdata);
    ++here;
  }
}
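
// Illustrative usage (added for this edit, not in the original source):
// a full text browse typically finds a starting point and then pages
// through the dictionary, roughly:
//
//   mg_u_long startTerm = 0;
//   FindNearestWordNumber (indexData, term, startTerm);
//   TermFreqArray page;
//   GetTermList (indexData, startTerm, 20, page); // the next 20 terms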

void GetTermList (IndexData &indexData,
                  mg_u_long startTerm,
                  mg_u_long numTerms,
                  UCArrayVector &terms) {
  SearchWordBlockDictElNumRange (indexData.dictFile, indexData.biWords,
                                 indexData.bdh.entries_per_wblk,
                                 indexData.bdh.word_dict_size,
                                 indexData.bdh.num_levels, startTerm,
                                 numTerms, terms);
}