source: trunk/gsdl/src/mgpp/text/Terms.cpp@ 1124

Last change on this file since 1124 was 1124, checked in by kjm18, 24 years ago

added termFreq - overall word count rather than document count

  • Property svn:keywords set to Author Date Id Revision
File size: 18.2 KB
Line 
1/**************************************************************************
2 *
3 * Terms.cpp -- Query related functions
4 * Copyright (C) 1999 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: Terms.cpp 1124 2000-04-18 04:04:29Z kjm18 $
21 *
22 **************************************************************************/
23
24#include "Terms.h"
25#include "words.h"
26#include "stemmer.h"
27#include "bitio_gen.h"
28#include "bitio_m_stdio.h"
29
30void QueryInfo::Clear () {
31 UCArrayClear (docLevel);
32 maxDocs = 0;
33 sortByRank = true;
34 exactWeights = false;
35 needRankInfo = false;
36 needTermFreqs = false;
37}
38
39
40
41void TermFreqData::Clear () {
42 UCArrayClear (tag);
43 UCArrayClear (term);
44 stemMethod = 0;
45 matchDocs = 0;
46 termFreq = 0;
47}
48
49ostream &operator<< (ostream &s, const TermFreqData &t) {
50 s << "<" << t.tag << ">\"" << t.term << "\"stem("
51 << t.stemMethod << ")docs(" << t.matchDocs << ")"
52 << "count("<<t.termFreq<<")";
53 return s;
54}
55
56bool operator== (const TermFreqData &t1, const TermFreqData &t2) {
57 return ((t1.tag == t2.tag) &&
58 (t1.term == t2.term) &&
59 (t1.stemMethod == t2.stemMethod) &&
60 (t1.matchDocs == t2.matchDocs) &&
61 (t1.termFreq == t2.termFreq));
62}
63
64
65void QueryResult::Clear () {
66 docs.erase (docs.begin(), docs.end());
67 ranks.erase (ranks.begin(), ranks.end());
68 termFreqs.erase (termFreqs.begin(), termFreqs.end());
69}
70
71QueryResult::QueryResult () {
72 Clear ();
73}
74
75
76
77ostream &operator<< (ostream &s, const QueryResult &r) {
78 s << "docs: ";
79 unsigned long i;
80 for (i=0; i<r.docs.size(); i++)
81 s << r.docs[i] << ", ";
82
83 s << "\nranks: ";
84 for (i=0; i<r.ranks.size(); i++)
85 s << r.ranks[i] << ", ";
86
87 s << "\ntermFreqs: ";
88 for (i=0; i<r.termFreqs.size(); i++)
89 s << r.termFreqs[i] << ", ";
90 s << "\n\n";
91
92 return s;
93}
94
95
96bool operator== (const QueryResult &r1, const QueryResult &r2) {
97 return ((r1.docs == r2.docs) &&
98 (r1.ranks == r2.ranks) &&
99 (r1.termFreqs == r2.termFreqs));
100}
101
102//---------------------------------------------------
103// new ExtQueryResult stuff
104void ExtQueryResult::Clear () {
105 docs.erase (docs.begin(), docs.end());
106 levels.erase (levels.begin(), levels.end());
107 ranks.erase (ranks.begin(), ranks.end());
108 termFreqs.erase (termFreqs.begin(), termFreqs.end());
109}
110
111ExtQueryResult::ExtQueryResult () {
112 Clear ();
113}
114
115ostream &operator<< (ostream &s, const ExtQueryResult &r) {
116 s << "docs: ";
117 unsigned long i;
118 for (i=0; i<r.docs.size(); i++)
119 s << r.docs[i] << ", ";
120
121 s << "\nlevels: ";
122 for (i=0; i<r.levels.size(); i++)
123 s << r.levels[i] << ", ";
124
125
126 s << "\nranks: ";
127 for (i=0; i<r.ranks.size(); i++)
128 s << r.ranks[i] << ", ";
129
130 s << "\ntermFreqs: ";
131 for (i=0; i<r.termFreqs.size(); i++)
132 s << r.termFreqs[i] << ", ";
133 s << "\n\n";
134
135 return s;
136}
137
138
139bool operator== (const ExtQueryResult &r1, const ExtQueryResult &r2) {
140 return ((r1.docs == r2.docs) &&
141 (r1.levels == r2.levels) &&
142 (r1.ranks == r2.ranks) &&
143 (r1.termFreqs == r2.termFreqs));
144}
145
146//--------------------------------------
147void FragData::Clear () {
148 matchDocs = 0;
149 fragNums.erase (fragNums.begin(), fragNums.end());
150 fragFreqs.erase (fragFreqs.begin(), fragFreqs.end());
151}
152
153
154
155
156void FindWordNumbers (IndexData &indexData,
157 const UCArray &term,
158 unsigned long stemMethod,
159 vector<unsigned long> &equivWords) {
160 equivWords.erase (equivWords.begin(), equivWords.end());
161
162 if (stemMethod == 0) {
163 // don't need to stem the word,
164 // find the word number for this term
165 unsigned long wordElNum = 0;
166 unsigned long numLevels = indexData.bdh.num_levels;
167 word_block_dict_el wordDictEl;
168 wordDictEl.SetNumLevels (numLevels);
169 if (SearchWordBlockDictEl (indexData.dictFile, indexData.biWords,
170 indexData.bdh.entries_per_wblk,
171 indexData.bdh.word_dict_size,
172 numLevels, term, wordDictEl, wordElNum))
173 equivWords.push_back (wordElNum);
174
175 return;
176
177 }
178
179
180 // need to stem this word and find it in the blocked stem index
181
182 unsigned char mgWord[MAXSTEMLEN + 1];
183 UCArray stemTerm;
184 unsigned long stemmerNum = 0;
185
186 if (stemMethod == 1) stemmerNum = indexData.sih1.stemmer_num;
187 else if (stemMethod == 2) stemmerNum = indexData.sih2.stemmer_num;
188 else if (stemMethod == 3) stemmerNum = indexData.sih3.stemmer_num;
189
190
191 // convert the word to an "mg word"
192 mgWord[0] = term.size();
193 bcopy ((char *)term.begin(), (char *)&mgWord[1], term.size());
194
195 // stem the word
196 stemmer (stemMethod, stemmerNum, mgWord);
197
198 // convert the result back to a UCArray
199 stemTerm.insert (stemTerm.end(), &mgWord[1], &mgWord[1] + mgWord[0]);
200
201 // need to look up this term in the appropriate dictionary
202 stem_block_dict_el stemDictEl;
203 unsigned long stemElNum;
204 if (stemMethod == 1) {
205 SearchStemBlockDictEl (indexData.stem1File,
206 indexData.sii1,
207 indexData.sih1.entries_per_block,
208 indexData.sih1.dict_size,
209 stemTerm,
210 stemDictEl,
211 stemElNum);
212
213 } else if (stemMethod == 2) {
214 SearchStemBlockDictEl (indexData.stem2File,
215 indexData.sii2,
216 indexData.sih2.entries_per_block,
217 indexData.sih2.dict_size,
218 stemTerm,
219 stemDictEl,
220 stemElNum);
221
222 } else if (stemMethod == 3) {
223 SearchStemBlockDictEl (indexData.stem3File,
224 indexData.sii3,
225 indexData.sih3.entries_per_block,
226 indexData.sih3.dict_size,
227 stemTerm,
228 stemDictEl,
229 stemElNum);
230 }
231
232 equivWords = stemDictEl.equivWords;
233}
234
235
236
237void ReadTermFragData (IndexData &indexData,
238 bool needFragFreqs,
239 unsigned long termNum,
240 FragData &fragData,
241 FragRangeArray *fragLimits) {
242 fragData.Clear();
243
244 // look up the word in the dictionary
245 unsigned long numLevels = indexData.bdh.num_levels;
246 word_block_dict_el wordDictEl;
247 wordDictEl.SetNumLevels (numLevels);
248 if (!SearchWordBlockDictElNum (indexData.dictFile,
249 indexData.biWords,
250 indexData.bdh.entries_per_wblk,
251 indexData.bdh.word_dict_size,
252 numLevels,
253 termNum, wordDictEl))
254 return; // nothing more to do
255
256 fragData.matchDocs = wordDictEl.levelFreqs[indexData.curLevelNum];
257
258 // seek to the appropriate place in the inverted file
259 fseek (indexData.invfFile, wordDictEl.invf_ptr, SEEK_SET);
260 stdio_bitio_buffer buffer (indexData.invfFile);
261
262 unsigned long B = BIO_Bblock_Init (indexData.bdh.num_frags,
263 wordDictEl.frag_occur);
264 unsigned long fragNum = 0;
265 unsigned long termFreq = 0;
266
267 unsigned long fragLimitI = 0;
268 unsigned long i;
269 for (i=0; i<wordDictEl.frag_occur; i++) {
270 fragNum += buffer.bblock_decode (B, NULL);
271 if (!indexData.ifh.word_level_index) termFreq = buffer.gamma_decode (NULL);
272 else termFreq = 1;
273
274 // get the right fragment range
275 if (fragLimits != NULL) {
276 while (fragLimitI+1 < (*fragLimits).size() &&
277 fragNum > (*fragLimits)[fragLimitI+1].rangeStart) {
278 fragLimitI++;
279 }
280 }
281
282 // add the entry if it is within the limits
283 if ((fragLimits == NULL) ||
284 (fragLimitI < (*fragLimits).size() &&
285 fragNum > (*fragLimits)[fragLimitI].rangeStart &&
286 fragNum <= (*fragLimits)[fragLimitI].rangeEnd)) {
287 fragData.fragNums.push_back (fragNum);
288 if (needFragFreqs)
289 fragData.fragFreqs.push_back (termFreq);
290 }
291 }
292
293 buffer.done();
294}
295
296
297void CombineFragData (bool needFragFreqs,
298 const FragData &f1,
299 const FragData &f2,
300 FragData &outFragData) {
301 outFragData.Clear();
302
303 // the new number of matching documents is the maximum
304 // of the two input matching number of documents -- it
305 // is assumed that these are at the same document level
306 outFragData.matchDocs = (f1.matchDocs > f2.matchDocs) ?
307 f1.matchDocs : f2.matchDocs;
308
309 // do or
310 unsigned long f1I = 0, f1Size = f1.fragNums.size();
311 unsigned long f2I = 0, f2Size = f2.fragNums.size();
312 while (f1I < f1Size || f2I < f2Size) {
313 if (f2I < f2Size &&
314 (f1I >= f1Size ||
315 f1.fragNums[f1I] > f2.fragNums[f2I])) {
316 // output f2I
317 outFragData.fragNums.push_back (f2.fragNums[f2I]);
318 if (needFragFreqs)
319 outFragData.fragFreqs.push_back (f2.fragFreqs[f2I]);
320 f2I++;
321
322 } else if (f1I < f1Size &&
323 (f2I >= f2Size ||
324 f1.fragNums[f1I] < f2.fragNums[f2I])) {
325 // output f1I
326 outFragData.fragNums.push_back (f1.fragNums[f1I]);
327 if (needFragFreqs)
328 outFragData.fragFreqs.push_back (f1.fragFreqs[f1I]);
329 f1I++;
330
331 } else {
332 // must be equal combine f1I and f2I
333 outFragData.fragNums.push_back (f1.fragNums[f1I]);
334 if (needFragFreqs)
335 outFragData.fragFreqs.push_back (f1.fragFreqs[f1I]+f2.fragFreqs[f2I]);
336 f1I++;
337 f2I++;
338 }
339 }
340}
341
342
343void AndCombineFragData (bool needFragFreqs,
344 FragData &fragData,
345 const FragData &comFragData,
346 signed long startRange,
347 signed long endRange,
348 const FragRangeArray *fragLimits) {
349 // sanity check on range
350 if (startRange > endRange) {
351 signed long temp = endRange;
352 endRange = startRange;
353 startRange = temp;
354 }
355
356 // get min matchdocs
357 if (comFragData.matchDocs < fragData.matchDocs)
358 fragData.matchDocs = comFragData.matchDocs;
359
360 unsigned long fragDataI = 0;
361 unsigned long fragDataSize = fragData.fragNums.size();
362 unsigned long comFragDataI = 0;
363 unsigned long comFragDataSize = comFragData.fragNums.size();
364 unsigned long fragLimitI = 0;
365 unsigned long fragLimitSize = (fragLimits==NULL) ? 0 : (*fragLimits).size();
366 unsigned long outI = 0;
367
368 while (fragDataI < fragDataSize &&
369 comFragDataI < comFragDataSize) {
370 signed long fragNum = (signed long)fragData.fragNums[fragDataI];
371 signed long comFragNum = (signed long)comFragData.fragNums[comFragDataI];
372
373 // go to the right fragment limit (for the com frag)
374 if (fragLimits != NULL) {
375 while (fragLimitI+1 < fragLimitSize &&
376 comFragNum > (signed long)(*fragLimits)[fragLimitI+1].rangeStart) {
377 fragLimitI++;
378 }
379 }
380
381 if (fragNum <= comFragNum+startRange ||
382 (fragLimits!=NULL &&
383 fragNum<=(signed long)(*fragLimits)[fragLimitI].rangeStart)) {
384 fragDataI++;
385
386 } else if (fragNum > comFragNum+endRange ||
387 (fragLimits!=NULL &&
388 fragNum>(signed long)(*fragLimits)[fragLimitI].rangeEnd)) {
389 comFragDataI++;
390
391 } else {
392 // equal and within tag
393 fragData.fragNums[outI] = comFragNum;
394 if (needFragFreqs) {
395 fragData.fragFreqs[outI] =
396 (fragData.fragFreqs[fragDataI] < comFragData.fragFreqs[comFragDataI]) ?
397 fragData.fragFreqs[fragDataI] : comFragData.fragFreqs[comFragDataI];
398 }
399 fragDataI++;
400 comFragDataI++;
401 outI++;
402 }
403 }
404
405 // erase unused part of fragData
406 fragData.fragNums.erase (fragData.fragNums.begin()+outI,
407 fragData.fragNums.end());
408 if (needFragFreqs)
409 fragData.fragFreqs.erase (fragData.fragFreqs.begin()+outI,
410 fragData.fragFreqs.end());
411 else
412 fragData.fragFreqs.erase (fragData.fragFreqs.begin(),
413 fragData.fragFreqs.end());
414}
415
416
417void FragsToQueryResult (IndexData &indexData,
418 const QueryInfo &queryInfo,
419 const FragData &termData,
420 const UCArray &tag,
421 const UCArray &term,
422 unsigned long stemMethod,
423 unsigned long termWeight,
424 QueryResult &result) {
425 bool needRanks = (queryInfo.sortByRank || queryInfo.needRankInfo);
426
427 result.Clear();
428
429 // log (N / ft)
430 unsigned long N = indexData.levels.levelInfo[indexData.curLevel].numEntries;
431 float wordLog = log((double)N / (double)termData.matchDocs);
432
433 // Wqt = fqt * log (N / ft)
434 // note: terms are allowed to have a weight of zero so
435 // they can be excluded from the ranking
436 float Wqt = termWeight * wordLog;
437
438 // Wdt = fdt * log (N / ft)
439 float Wdt;
440
441 unsigned long termDataI = 0;
442 unsigned long termDataSize = termData.fragNums.size();
443 unsigned long levelDocNum = 0;
444
445 unsigned long termDocFreq = 0;
446 unsigned long lastLevelDocNum = 0;
447 unsigned long overallwordfreq = 0;
448
449 while (termDataI < termDataSize) {
450 if (indexData.levelConverter.FragToLevel (termData.fragNums[termDataI],
451 levelDocNum)) {
452 if (levelDocNum != lastLevelDocNum) {
453 if (lastLevelDocNum > 0) {
454 // add this doc information
455 if (needRanks) {
456 Wdt = termDocFreq * wordLog;
457 result.ranks.push_back (Wqt * Wdt);
458 }
459 result.docs.push_back (lastLevelDocNum);
460 }
461
462 lastLevelDocNum = levelDocNum;
463 termDocFreq = 0;
464 }
465
466 if (needRanks)
467 termDocFreq += termData.fragFreqs[termDataI];
468 overallwordfreq += termData.fragFreqs[termDataI];
469 }
470
471 termDataI++;
472 }
473
474 if (lastLevelDocNum > 0) {
475 // add the last document information
476 if (needRanks) {
477 Wdt = termDocFreq * wordLog;
478 result.ranks.push_back (Wqt * Wdt);
479 }
480 result.docs.push_back (lastLevelDocNum);
481 }
482
483 // add the term frequency information
484 if (queryInfo.needTermFreqs) {
485 TermFreqData termFreqData;
486 termFreqData.tag = tag;
487 termFreqData.term = term;
488 termFreqData.stemMethod = stemMethod;
489 termFreqData.matchDocs = termData.matchDocs;
490 termFreqData.termFreq = overallwordfreq;
491 result.termFreqs.push_back (termFreqData);
492 }
493}
494
495void AndFragsToQueryResult (IndexData &indexData,
496 const QueryInfo &queryInfo,
497 const FragData &termData,
498 const UCArray &tag,
499 const UCArray &term,
500 unsigned long stemMethod,
501 unsigned long termWeight,
502 QueryResult &result) {
503 bool needRanks = (queryInfo.sortByRank || queryInfo.needRankInfo);
504
505 // log (N / ft)
506 float wordLog =
507 log((double)indexData.levels.levelInfo[indexData.curLevel].numEntries/
508 (double)termData.matchDocs);
509
510 // Wqt = fqt * log (N / ft)
511 // note: terms are allowed to have a weight of zero so
512 // they can be excluded from the ranking
513 float Wqt = termWeight * wordLog;
514
515 // Wdt = fdt * log (N / ft)
516 float Wdt;
517
518 unsigned long termDataI = 0;
519 unsigned long termDataSize = termData.fragNums.size();
520 unsigned long levelDocNum = 0;
521
522 unsigned long termDocFreq = 0;
523 unsigned long lastLevelDocNum = 0;
524 unsigned long overallwordfreq = 0;
525 unsigned long resultI = 0;
526 unsigned long resultSize = result.docs.size();
527 unsigned long resultOutI = 0;
528
529
530 while (termDataI < termDataSize) {
531 if (indexData.levelConverter.FragToLevel (termData.fragNums[termDataI],
532 levelDocNum)) {
533 if (levelDocNum != lastLevelDocNum) {
534 if (lastLevelDocNum > 0) {
535 // add this doc information
536 Wdt = termDocFreq * wordLog;
537
538 // find this document number
539 while (resultI < resultSize &&
540 result.docs[resultI] < lastLevelDocNum)
541 resultI++;
542
543 // store the result
544 if (resultI < resultSize && result.docs[resultI] == lastLevelDocNum) {
545 result.docs[resultOutI] = lastLevelDocNum;
546 if (needRanks)
547 result.ranks[resultOutI] = result.ranks[resultI] + Wqt * Wdt;
548 resultI++;
549 resultOutI++;
550 }
551 }
552
553 lastLevelDocNum = levelDocNum;
554 termDocFreq = 0;
555 }
556
557 if (needRanks)
558 termDocFreq += termData.fragFreqs[termDataI];
559 overallwordfreq += termData.fragFreqs[termDataI];
560 }
561
562 termDataI++;
563 } // while
564
565 if (lastLevelDocNum > 0) {
566 // add the last document information
567 Wdt = termDocFreq * wordLog;
568
569 // find this document number
570 while (resultI < resultSize &&
571 result.docs[resultI] < lastLevelDocNum)
572 resultI++;
573
574 // store the result
575 if (resultI < resultSize && result.docs[resultI] == lastLevelDocNum) {
576 result.docs[resultOutI] = lastLevelDocNum;
577 if (needRanks)
578 result.ranks[resultOutI] = result.ranks[resultI] + Wqt * Wdt;
579 resultI++;
580 resultOutI++;
581 }
582 }
583
584 // remove unneeded entries
585 result.docs.erase (result.docs.begin()+resultOutI, result.docs.end());
586 if (needRanks)
587 result.ranks.erase (result.ranks.begin()+resultOutI, result.ranks.end());
588 else
589 result.ranks.erase (result.ranks.begin(), result.ranks.end());
590
591 // add the term frequency information
592 if (queryInfo.needTermFreqs) {
593 TermFreqData termFreqData;
594 termFreqData.tag = tag;
595 termFreqData.term = term;
596 termFreqData.stemMethod = stemMethod;
597 termFreqData.matchDocs = termData.matchDocs;
598 termFreqData.termFreq = overallwordfreq;
599 result.termFreqs.push_back (termFreqData);
600 }
601}
602
603
604void RemoveUnwantedResults (IndexData &indexData,
605 const QueryInfo &queryInfo,
606 const FragData &termData,
607 QueryResult &result) {
608 bool needRanks = (queryInfo.sortByRank || queryInfo.needRankInfo);
609
610 unsigned long termDataI = 0;
611 unsigned long termDataSize = termData.fragNums.size();
612 unsigned long levelDocNum = 0;
613
614 unsigned long lastLevelDocNum = 0;
615
616 unsigned long resultI = 0;
617 unsigned long resultSize = result.docs.size();
618 unsigned long resultOutI = 0;
619
620 while (termDataI < termDataSize) {
621 if (indexData.levelConverter.FragToLevel (termData.fragNums[termDataI],
622 levelDocNum)) {
623 if (levelDocNum != lastLevelDocNum) {
624 if (lastLevelDocNum > 0) {
625 // find this document number
626 while (resultI < resultSize &&
627 result.docs[resultI] < lastLevelDocNum)
628 resultI++;
629
630 // store the result
631 if (resultI < resultSize && result.docs[resultI] == lastLevelDocNum) {
632 result.docs[resultOutI] = lastLevelDocNum;
633 if (needRanks)
634 result.ranks[resultOutI] = result.ranks[resultI];
635 resultI++;
636 resultOutI++;
637 }
638 }
639
640 lastLevelDocNum = levelDocNum;
641 }
642 }
643
644 termDataI++;
645 }
646
647 if (lastLevelDocNum > 0) {
648 // find this document number
649 while (resultI < resultSize &&
650 result.docs[resultI] < lastLevelDocNum)
651 resultI++;
652
653 // store the result
654 if (resultI < resultSize && result.docs[resultI] == lastLevelDocNum) {
655 result.docs[resultOutI] = lastLevelDocNum;
656 if (needRanks)
657 result.ranks[resultOutI] = result.ranks[resultI];
658 resultI++;
659 resultOutI++;
660 }
661 }
662
663 // remove unneeded entries
664 result.docs.erase (result.docs.begin()+resultOutI, result.docs.end());
665 if (needRanks)
666 result.ranks.erase (result.ranks.begin()+resultOutI, result.ranks.end());
667 else
668 result.ranks.erase (result.ranks.begin(), result.ranks.end());
669}
Note: See TracBrowser for help on using the repository browser.