source: trunk/gsdl/src/mgpp/text/Terms.cpp@ 927

Last change on this file since 927 was 927, checked in by kjm18, 24 years ago

Added a feature to retrieve document numbers at a different level than the level
queried at — e.g. query at Document level, but retrieve Section-level docnums.
Also fixed a bug in mg_perf_hash_build.cpp.

  • Property svn:keywords set to Author Date Id Revision
File size: 17.8 KB
Line 
1/**************************************************************************
2 *
3 * Terms.cpp -- Query related functions
4 * Copyright (C) 1999 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: Terms.cpp 927 2000-02-15 22:45:22Z kjm18 $
21 *
22 **************************************************************************/
23
#include "Terms.h"
#include "words.h"
#include "stemmer.h"
#include "bitio_gen.h"
#include "bitio_m_stdio.h"

#include <string.h>   // memcpy
29
30void QueryInfo::Clear () {
31 UCArrayClear (docLevel);
32 maxDocs = 0;
33 sortByRank = true;
34 exactWeights = false;
35 needRankInfo = false;
36 needTermFreqs = false;
37}
38
39
40
41void TermFreqData::Clear () {
42 UCArrayClear (tag);
43 UCArrayClear (term);
44 stemMethod = 0;
45 matchDocs = 0;
46}
47
48ostream &operator<< (ostream &s, const TermFreqData &t) {
49 s << "<" << t.tag << ">\"" << t.term << "\"stem("
50 << t.stemMethod << ")docs(" << t.matchDocs << ")";
51 return s;
52}
53
54bool operator== (const TermFreqData &t1, const TermFreqData &t2) {
55 return ((t1.tag == t2.tag) &&
56 (t1.term == t2.term) &&
57 (t1.stemMethod == t2.stemMethod) &&
58 (t1.matchDocs == t2.matchDocs));
59}
60
61
62void QueryResult::Clear () {
63 docs.erase (docs.begin(), docs.end());
64 ranks.erase (ranks.begin(), ranks.end());
65 termFreqs.erase (termFreqs.begin(), termFreqs.end());
66}
67
68QueryResult::QueryResult () {
69 Clear ();
70}
71
72
73
74ostream &operator<< (ostream &s, const QueryResult &r) {
75 s << "docs: ";
76 unsigned long i;
77 for (i=0; i<r.docs.size(); i++)
78 s << r.docs[i] << ", ";
79
80 s << "\nranks: ";
81 for (i=0; i<r.ranks.size(); i++)
82 s << r.ranks[i] << ", ";
83
84 s << "\ntermFreqs: ";
85 for (i=0; i<r.termFreqs.size(); i++)
86 s << r.termFreqs[i] << ", ";
87 s << "\n\n";
88
89 return s;
90}
91
92
93bool operator== (const QueryResult &r1, const QueryResult &r2) {
94 return ((r1.docs == r2.docs) &&
95 (r1.ranks == r2.ranks) &&
96 (r1.termFreqs == r2.termFreqs));
97}
98
99//---------------------------------------------------
100// new ExtQueryResult stuff
101void ExtQueryResult::Clear () {
102 docs.erase (docs.begin(), docs.end());
103 levels.erase (levels.begin(), levels.end());
104 ranks.erase (ranks.begin(), ranks.end());
105 termFreqs.erase (termFreqs.begin(), termFreqs.end());
106}
107
108ExtQueryResult::ExtQueryResult () {
109 Clear ();
110}
111
112ostream &operator<< (ostream &s, const ExtQueryResult &r) {
113 s << "docs: ";
114 unsigned long i;
115 for (i=0; i<r.docs.size(); i++)
116 s << r.docs[i] << ", ";
117
118 s << "\nlevels: ";
119 for (i=0; i<r.levels.size(); i++)
120 s << r.levels[i] << ", ";
121
122
123 s << "\nranks: ";
124 for (i=0; i<r.ranks.size(); i++)
125 s << r.ranks[i] << ", ";
126
127 s << "\ntermFreqs: ";
128 for (i=0; i<r.termFreqs.size(); i++)
129 s << r.termFreqs[i] << ", ";
130 s << "\n\n";
131
132 return s;
133}
134
135
136bool operator== (const ExtQueryResult &r1, const ExtQueryResult &r2) {
137 return ((r1.docs == r2.docs) &&
138 (r1.levels == r2.levels) &&
139 (r1.ranks == r2.ranks) &&
140 (r1.termFreqs == r2.termFreqs));
141}
142
143//--------------------------------------
144void FragData::Clear () {
145 matchDocs = 0;
146 fragNums.erase (fragNums.begin(), fragNums.end());
147 fragFreqs.erase (fragFreqs.begin(), fragFreqs.end());
148}
149
150
151
152
153void FindWordNumbers (IndexData &indexData,
154 const UCArray &term,
155 unsigned long stemMethod,
156 vector<unsigned long> &equivWords) {
157 equivWords.erase (equivWords.begin(), equivWords.end());
158
159 if (stemMethod == 0) {
160 // don't need to stem the word,
161 // find the word number for this term
162 unsigned long wordElNum = 0;
163 unsigned long numLevels = indexData.bdh.num_levels;
164 word_block_dict_el wordDictEl;
165 wordDictEl.SetNumLevels (numLevels);
166 if (SearchWordBlockDictEl (indexData.dictFile, indexData.biWords,
167 indexData.bdh.entries_per_wblk,
168 indexData.bdh.word_dict_size,
169 numLevels, term, wordDictEl, wordElNum))
170 equivWords.push_back (wordElNum);
171
172 return;
173
174 }
175
176
177 // need to stem this word and find it in the blocked stem index
178
179 unsigned char mgWord[MAXSTEMLEN + 1];
180 UCArray stemTerm;
181 unsigned long stemmerNum = 0;
182
183 if (stemMethod == 1) stemmerNum = indexData.sih1.stemmer_num;
184 else if (stemMethod == 2) stemmerNum = indexData.sih2.stemmer_num;
185 else if (stemMethod == 3) stemmerNum = indexData.sih3.stemmer_num;
186
187
188 // convert the word to an "mg word"
189 mgWord[0] = term.size();
190 bcopy ((char *)term.begin(), (char *)&mgWord[1], term.size());
191
192 // stem the word
193 stemmer (stemMethod, stemmerNum, mgWord);
194
195 // convert the result back to a UCArray
196 stemTerm.insert (stemTerm.end(), &mgWord[1], &mgWord[1] + mgWord[0]);
197
198 // need to look up this term in the appropriate dictionary
199 stem_block_dict_el stemDictEl;
200 unsigned long stemElNum;
201 if (stemMethod == 1) {
202 SearchStemBlockDictEl (indexData.stem1File,
203 indexData.sii1,
204 indexData.sih1.entries_per_block,
205 indexData.sih1.dict_size,
206 stemTerm,
207 stemDictEl,
208 stemElNum);
209
210 } else if (stemMethod == 2) {
211 SearchStemBlockDictEl (indexData.stem2File,
212 indexData.sii2,
213 indexData.sih2.entries_per_block,
214 indexData.sih2.dict_size,
215 stemTerm,
216 stemDictEl,
217 stemElNum);
218
219 } else if (stemMethod == 3) {
220 SearchStemBlockDictEl (indexData.stem3File,
221 indexData.sii3,
222 indexData.sih3.entries_per_block,
223 indexData.sih3.dict_size,
224 stemTerm,
225 stemDictEl,
226 stemElNum);
227 }
228
229 equivWords = stemDictEl.equivWords;
230}
231
232
233
// Read the inverted-file entry for word number `termNum` into `fragData`.
//
// Looks the word up in the blocked word dictionary, seeks to its posting
// list in the inverted file, and decodes the (delta-coded) fragment
// numbers. Fragment frequencies are decoded only when the index stores
// them (i.e. not a word-level index) and kept only if `needFragFreqs`.
// If `fragLimits` is non-NULL, only fragments falling inside one of the
// given ranges are kept. On dictionary-lookup failure fragData is left
// cleared.
void ReadTermFragData (IndexData &indexData,
		       bool needFragFreqs,
		       unsigned long termNum,
		       FragData &fragData,
		       FragRangeArray *fragLimits) {
  fragData.Clear();

  // look up the word in the dictionary
  unsigned long numLevels = indexData.bdh.num_levels;
  word_block_dict_el wordDictEl;
  wordDictEl.SetNumLevels (numLevels);
  if (!SearchWordBlockDictElNum (indexData.dictFile,
				 indexData.biWords,
				 indexData.bdh.entries_per_wblk,
				 indexData.bdh.word_dict_size,
				 numLevels,
				 termNum, wordDictEl))
    return; // nothing more to do

  // number of matching documents at the current retrieval level
  fragData.matchDocs = wordDictEl.levelFreqs[indexData.curLevelNum];

  // seek to the appropriate place in the inverted file
  fseek (indexData.invfFile, wordDictEl.invf_ptr, SEEK_SET);
  stdio_bitio_buffer buffer (indexData.invfFile);

  // Bblock (Bernoulli) coding parameter for the fragment-gap decoder
  unsigned long B = BIO_Bblock_Init (indexData.bdh.num_frags,
				     wordDictEl.frag_occur);
  unsigned long fragNum = 0;     // running fragment number (gaps are summed)
  unsigned long termFreq = 0;

  unsigned long fragLimitI = 0;
  unsigned long i;
  for (i=0; i<wordDictEl.frag_occur; i++) {
    // decode the gap to the next fragment containing the word
    fragNum += buffer.bblock_decode (B, NULL);
    // word-level indexes store no per-fragment frequency; each
    // occurrence counts once
    if (!indexData.ifh.word_level_index) termFreq = buffer.gamma_decode (NULL);
    else termFreq = 1;

    // advance to the fragment range this fragment could fall into
    // (ranges are assumed ordered by rangeStart)
    if (fragLimits != NULL) {
      while (fragLimitI+1 < (*fragLimits).size() &&
	     fragNum > (*fragLimits)[fragLimitI+1].rangeStart) {
	fragLimitI++;
      }
    }

    // add the entry if it is within the limits
    // (note: rangeStart is exclusive, rangeEnd inclusive here)
    if ((fragLimits == NULL) ||
	(fragLimitI < (*fragLimits).size() &&
	 fragNum > (*fragLimits)[fragLimitI].rangeStart &&
	 fragNum <= (*fragLimits)[fragLimitI].rangeEnd)) {
      fragData.fragNums.push_back (fragNum);
      if (needFragFreqs)
	fragData.fragFreqs.push_back (termFreq);
    }
  }

  // release the bit-buffer (decoding must consume the list fully first)
  buffer.done();
}
292
293
294void CombineFragData (bool needFragFreqs,
295 const FragData &f1,
296 const FragData &f2,
297 FragData &outFragData) {
298 outFragData.Clear();
299
300 // the new number of matching documents is the maximum
301 // of the two input matching number of documents -- it
302 // is assumed that these are at the same document level
303 outFragData.matchDocs = (f1.matchDocs > f2.matchDocs) ?
304 f1.matchDocs : f2.matchDocs;
305
306 // do or
307 unsigned long f1I = 0, f1Size = f1.fragNums.size();
308 unsigned long f2I = 0, f2Size = f2.fragNums.size();
309 while (f1I < f1Size || f2I < f2Size) {
310 if (f2I < f2Size &&
311 (f1I >= f1Size ||
312 f1.fragNums[f1I] > f2.fragNums[f2I])) {
313 // output f2I
314 outFragData.fragNums.push_back (f2.fragNums[f2I]);
315 if (needFragFreqs)
316 outFragData.fragFreqs.push_back (f2.fragFreqs[f2I]);
317 f2I++;
318
319 } else if (f1I < f1Size &&
320 (f2I >= f2Size ||
321 f1.fragNums[f1I] < f2.fragNums[f2I])) {
322 // output f1I
323 outFragData.fragNums.push_back (f1.fragNums[f1I]);
324 if (needFragFreqs)
325 outFragData.fragFreqs.push_back (f1.fragFreqs[f1I]);
326 f1I++;
327
328 } else {
329 // must be equal combine f1I and f2I
330 outFragData.fragNums.push_back (f1.fragNums[f1I]);
331 if (needFragFreqs)
332 outFragData.fragFreqs.push_back (f1.fragFreqs[f1I]+f2.fragFreqs[f2I]);
333 f1I++;
334 f2I++;
335 }
336 }
337}
338
339
// AND-combine comFragData into fragData, in place.
//
// Keeps only those fragments of fragData that lie within
// [comFragNum+startRange, comFragNum+endRange] of some fragment in
// comFragData (a proximity window; startRange appears to be exclusive,
// endRange inclusive — confirm against callers). If fragLimits is
// non-NULL, matches must additionally fall inside the fragment range
// containing the comFragData fragment, which restricts matches to the
// same tag region. matchDocs becomes the minimum of the two inputs.
// Frequencies (the per-pair minimum) are kept only if needFragFreqs;
// otherwise fragFreqs is emptied.
void AndCombineFragData (bool needFragFreqs,
			 FragData &fragData,
			 const FragData &comFragData,
			 signed long startRange,
			 signed long endRange,
			 const FragRangeArray *fragLimits) {
  // sanity check on range: swap so startRange <= endRange
  if (startRange > endRange) {
    signed long temp = endRange;
    endRange = startRange;
    startRange = temp;
  }

  // get min matchdocs
  if (comFragData.matchDocs < fragData.matchDocs)
    fragData.matchDocs = comFragData.matchDocs;

  unsigned long fragDataI = 0;
  unsigned long fragDataSize = fragData.fragNums.size();
  unsigned long comFragDataI = 0;
  unsigned long comFragDataSize = comFragData.fragNums.size();
  unsigned long fragLimitI = 0;
  unsigned long fragLimitSize = (fragLimits==NULL) ? 0 : (*fragLimits).size();
  unsigned long outI = 0;  // surviving entries are compacted to the front

  while (fragDataI < fragDataSize &&
	 comFragDataI < comFragDataSize) {
    // signed arithmetic: fragNum + startRange may be negative
    signed long fragNum = (signed long)fragData.fragNums[fragDataI];
    signed long comFragNum = (signed long)comFragData.fragNums[comFragDataI];

    // go to the right fragment limit (for the com frag)
    if (fragLimits != NULL) {
      while (fragLimitI+1 < fragLimitSize &&
	     comFragNum > (signed long)(*fragLimits)[fragLimitI+1].rangeStart) {
	fragLimitI++;
      }
    }

    if (fragNum <= comFragNum+startRange ||
	(fragLimits!=NULL &&
	 fragNum<=(signed long)(*fragLimits)[fragLimitI].rangeStart)) {
      // fragNum is before the window (or before the tag region): skip it
      fragDataI++;

    } else if (fragNum > comFragNum+endRange ||
	       (fragLimits!=NULL &&
		fragNum>(signed long)(*fragLimits)[fragLimitI].rangeEnd)) {
      // fragNum is past the window (or past the tag region): advance com
      comFragDataI++;

    } else {
      // within the window and within the tag region: keep it
      fragData.fragNums[outI] = comFragNum;
      if (needFragFreqs) {
	// keep the smaller of the two frequencies
	fragData.fragFreqs[outI] =
	  (fragData.fragFreqs[fragDataI] < comFragData.fragFreqs[comFragDataI]) ?
	  fragData.fragFreqs[fragDataI] : comFragData.fragFreqs[comFragDataI];
      }
      fragDataI++;
      comFragDataI++;
      outI++;
    }
  }

  // erase unused part of fragData (everything past the compacted prefix)
  fragData.fragNums.erase (fragData.fragNums.begin()+outI,
			   fragData.fragNums.end());
  if (needFragFreqs)
    fragData.fragFreqs.erase (fragData.fragFreqs.begin()+outI,
			      fragData.fragFreqs.end());
  else
    fragData.fragFreqs.erase (fragData.fragFreqs.begin(),
			      fragData.fragFreqs.end());
}
412
413
// Convert a term's fragment-level data into a document-level QueryResult.
//
// Fragments are mapped to documents at the current retrieval level via
// indexData.levelConverter; consecutive fragments in the same document
// are collapsed into one entry, accumulating the term frequency. When
// ranks are needed, each document gets a TF-IDF-style score
// Wqt * Wdt = (fqt * log(N/ft)) * (fdt * log(N/ft)).
// NOTE(review): termData.matchDocs == 0 would divide by zero in the log
// argument — presumably callers only pass terms with matches; confirm.
// Optionally appends the term's frequency record to result.termFreqs.
// Fragments assume levelDocNum 0 means "no document" (doc numbers are
// 1-based) — the lastLevelDocNum > 0 guards rely on this.
void FragsToQueryResult (IndexData &indexData,
			 const QueryInfo &queryInfo,
			 const FragData &termData,
			 const UCArray &tag,
			 const UCArray &term,
			 unsigned long stemMethod,
			 unsigned long termWeight,
			 QueryResult &result) {
  bool needRanks = (queryInfo.sortByRank || queryInfo.needRankInfo);

  result.Clear();

  // log (N / ft): N = documents at current level, ft = docs matching term
  unsigned long N = indexData.levels.levelInfo[indexData.curLevel].numEntries;
  float wordLog = log((double)N / (double)termData.matchDocs);

  // Wqt = fqt * log (N / ft)
  // note: terms are allowed to have a weight of zero so
  // they can be excluded from the ranking
  float Wqt = termWeight * wordLog;

  // Wdt = fdt * log (N / ft)
  float Wdt;

  unsigned long termDataI = 0;
  unsigned long termDataSize = termData.fragNums.size();
  unsigned long levelDocNum = 0;

  unsigned long termDocFreq = 0;      // fdt accumulated per document
  unsigned long lastLevelDocNum = 0;  // doc currently being accumulated


  while (termDataI < termDataSize) {
    // map the fragment to a document at the current level; fragments
    // that don't map (FragToLevel false) are skipped entirely
    if (indexData.levelConverter.FragToLevel (termData.fragNums[termDataI],
					      levelDocNum)) {
      if (levelDocNum != lastLevelDocNum) {
	if (lastLevelDocNum > 0) {
	  // add this doc information
	  if (needRanks) {
	    Wdt = termDocFreq * wordLog;
	    result.ranks.push_back (Wqt * Wdt);
	  }
	  result.docs.push_back (lastLevelDocNum);
	}

	// start accumulating the next document
	lastLevelDocNum = levelDocNum;
	termDocFreq = 0;
      }

      // frequencies are only accumulated when ranks are needed
      // (fragFreqs may be empty otherwise — see ReadTermFragData)
      if (needRanks)
	termDocFreq += termData.fragFreqs[termDataI];
    }

    termDataI++;
  }

  // flush the final document accumulated in the loop above
  if (lastLevelDocNum > 0) {
    // add the last document information
    if (needRanks) {
      Wdt = termDocFreq * wordLog;
      result.ranks.push_back (Wqt * Wdt);
    }
    result.docs.push_back (lastLevelDocNum);
  }

  // add the term frequency information
  if (queryInfo.needTermFreqs) {
    TermFreqData termFreqData;
    termFreqData.tag = tag;
    termFreqData.term = term;
    termFreqData.stemMethod = stemMethod;
    termFreqData.matchDocs = termData.matchDocs;
    result.termFreqs.push_back (termFreqData);
  }
}
489
// AND a term's fragment data into an existing QueryResult, in place.
//
// Like FragsToQueryResult, but instead of producing a fresh result it
// intersects with `result`: only documents already present in
// result.docs (which must be sorted ascending) AND containing this term
// survive; their ranks are incremented by this term's Wqt * Wdt.
// Surviving entries are compacted to the front of result.docs/ranks and
// the tail erased. Optionally appends the term's frequency record.
// NOTE(review): termData.matchDocs == 0 would divide by zero in the log
// argument — presumably callers only pass terms with matches; confirm.
void AndFragsToQueryResult (IndexData &indexData,
			    const QueryInfo &queryInfo,
			    const FragData &termData,
			    const UCArray &tag,
			    const UCArray &term,
			    unsigned long stemMethod,
			    unsigned long termWeight,
			    QueryResult &result) {
  bool needRanks = (queryInfo.sortByRank || queryInfo.needRankInfo);

  // log (N / ft)
  float wordLog =
    log((double)indexData.levels.levelInfo[indexData.curLevel].numEntries/
	(double)termData.matchDocs);

  // Wqt = fqt * log (N / ft)
  // note: terms are allowed to have a weight of zero so
  // they can be excluded from the ranking
  float Wqt = termWeight * wordLog;

  // Wdt = fdt * log (N / ft)
  float Wdt;

  unsigned long termDataI = 0;
  unsigned long termDataSize = termData.fragNums.size();
  unsigned long levelDocNum = 0;

  unsigned long termDocFreq = 0;      // fdt accumulated per document
  unsigned long lastLevelDocNum = 0;  // doc currently being accumulated

  unsigned long resultI = 0;          // scan cursor over existing result
  unsigned long resultOutI = 0;       // write cursor for surviving entries
  unsigned long resultSize = result.docs.size();


  while (termDataI < termDataSize) {
    // map the fragment to a document at the current level; fragments
    // that don't map are skipped entirely
    if (indexData.levelConverter.FragToLevel (termData.fragNums[termDataI],
					      levelDocNum)) {
      if (levelDocNum != lastLevelDocNum) {
	if (lastLevelDocNum > 0) {
	  // add this doc information
	  Wdt = termDocFreq * wordLog;

	  // find this document number in the existing (sorted) result
	  while (resultI < resultSize &&
		 result.docs[resultI] < lastLevelDocNum)
	    resultI++;

	  // store the result only if the doc was already present (AND)
	  if (resultI < resultSize && result.docs[resultI] == lastLevelDocNum) {
	    result.docs[resultOutI] = lastLevelDocNum;
	    if (needRanks)
	      result.ranks[resultOutI] = result.ranks[resultI] + Wqt * Wdt;
	    resultI++;
	    resultOutI++;
	  }
	}

	// start accumulating the next document
	lastLevelDocNum = levelDocNum;
	termDocFreq = 0;
      }

      // frequencies are only accumulated when ranks are needed
      if (needRanks)
	termDocFreq += termData.fragFreqs[termDataI];
    }

    termDataI++;
  }

  // flush the final document accumulated in the loop above
  if (lastLevelDocNum > 0) {
    // add the last document information
    Wdt = termDocFreq * wordLog;

    // find this document number
    while (resultI < resultSize &&
	   result.docs[resultI] < lastLevelDocNum)
      resultI++;

    // store the result
    if (resultI < resultSize && result.docs[resultI] == lastLevelDocNum) {
      result.docs[resultOutI] = lastLevelDocNum;
      if (needRanks)
	result.ranks[resultOutI] = result.ranks[resultI] + Wqt * Wdt;
      resultI++;
      resultOutI++;
    }
  }

  // remove unneeded entries (everything past the compacted prefix)
  result.docs.erase (result.docs.begin()+resultOutI, result.docs.end());
  if (needRanks)
    result.ranks.erase (result.ranks.begin()+resultOutI, result.ranks.end());
  else
    result.ranks.erase (result.ranks.begin(), result.ranks.end());

  // add the term frequency information
  if (queryInfo.needTermFreqs) {
    TermFreqData termFreqData;
    termFreqData.tag = tag;
    termFreqData.term = term;
    termFreqData.stemMethod = stemMethod;
    termFreqData.matchDocs = termData.matchDocs;
    result.termFreqs.push_back (termFreqData);
  }
}
595
596
// Filter `result` in place, keeping only documents that contain at least
// one fragment from `termData`.
//
// Fragments are mapped to documents at the current level; result.docs
// (assumed sorted ascending) is intersected with that document set.
// Surviving docs (and, when needed, their unchanged ranks) are compacted
// to the front and the tail erased. Unlike AndFragsToQueryResult, ranks
// are preserved rather than incremented.
void RemoveUnwantedResults (IndexData &indexData,
			    const QueryInfo &queryInfo,
			    const FragData &termData,
			    QueryResult &result) {
  bool needRanks = (queryInfo.sortByRank || queryInfo.needRankInfo);

  unsigned long termDataI = 0;
  unsigned long termDataSize = termData.fragNums.size();
  unsigned long levelDocNum = 0;

  unsigned long lastLevelDocNum = 0;  // doc currently being tracked

  unsigned long resultI = 0;          // scan cursor over existing result
  unsigned long resultOutI = 0;       // write cursor for surviving entries
  unsigned long resultSize = result.docs.size();

  while (termDataI < termDataSize) {
    // map the fragment to a document; unmappable fragments are skipped
    if (indexData.levelConverter.FragToLevel (termData.fragNums[termDataI],
					      levelDocNum)) {
      if (levelDocNum != lastLevelDocNum) {
	if (lastLevelDocNum > 0) {
	  // find this document number in the existing (sorted) result
	  while (resultI < resultSize &&
		 result.docs[resultI] < lastLevelDocNum)
	    resultI++;

	  // keep the entry only if it was already in the result
	  if (resultI < resultSize && result.docs[resultI] == lastLevelDocNum) {
	    result.docs[resultOutI] = lastLevelDocNum;
	    if (needRanks)
	      result.ranks[resultOutI] = result.ranks[resultI];
	    resultI++;
	    resultOutI++;
	  }
	}

	lastLevelDocNum = levelDocNum;
      }
    }

    termDataI++;
  }

  // flush the final document tracked in the loop above
  if (lastLevelDocNum > 0) {
    // find this document number
    while (resultI < resultSize &&
	   result.docs[resultI] < lastLevelDocNum)
      resultI++;

    // store the result
    if (resultI < resultSize && result.docs[resultI] == lastLevelDocNum) {
      result.docs[resultOutI] = lastLevelDocNum;
      if (needRanks)
	result.ranks[resultOutI] = result.ranks[resultI];
      resultI++;
      resultOutI++;
    }
  }

  // remove unneeded entries (everything past the compacted prefix)
  result.docs.erase (result.docs.begin()+resultOutI, result.docs.end());
  if (needRanks)
    result.ranks.erase (result.ranks.begin()+resultOutI, result.ranks.end());
  else
    result.ranks.erase (result.ranks.begin(), result.ranks.end());
}
Note: See TracBrowser for help on using the repository browser.