source: trunk/gsdl/src/mgpp/text/Terms.cpp@ 855

Last change on this file since 855 was 855, checked in by sjboddie, 24 years ago

Rodgers new C++ mg

  • Property svn:keywords set to Author Date Id Revision
File size: 17.0 KB
Line 
1/**************************************************************************
2 *
3 * Terms.cpp -- Query related functions
4 * Copyright (C) 1999 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: Terms.cpp 855 2000-01-14 02:17:52Z sjboddie $
21 *
22 **************************************************************************/
23
24#include "Terms.h"
25#include "words.h"
26#include "stemmer.h"
27#include "bitio_gen.h"
28#include "bitio_m_stdio.h"
29
30void QueryInfo::Clear () {
31 UCArrayClear (docLevel);
32 maxDocs = 0;
33 sortByRank = true;
34 exactWeights = false;
35 needRankInfo = false;
36 needTermFreqs = false;
37}
38
39
40
41void TermFreqData::Clear () {
42 UCArrayClear (tag);
43 UCArrayClear (term);
44 stemMethod = 0;
45 matchDocs = 0;
46}
47
48ostream &operator<< (ostream &s, const TermFreqData &t) {
49 s << "<" << t.tag << ">\"" << t.term << "\"stem("
50 << t.stemMethod << ")docs(" << t.matchDocs << ")";
51 return s;
52}
53
54bool operator== (const TermFreqData &t1, const TermFreqData &t2) {
55 return ((t1.tag == t2.tag) &&
56 (t1.term == t2.term) &&
57 (t1.stemMethod == t2.stemMethod) &&
58 (t1.matchDocs == t2.matchDocs));
59}
60
61
62void QueryResult::Clear () {
63 docs.erase (docs.begin(), docs.end());
64 ranks.erase (ranks.begin(), ranks.end());
65 termFreqs.erase (termFreqs.begin(), termFreqs.end());
66}
67
68QueryResult::QueryResult () {
69 Clear ();
70}
71
72
73ostream &operator<< (ostream &s, const QueryResult &r) {
74 s << "docs: ";
75 unsigned long i;
76 for (i=0; i<r.docs.size(); i++)
77 s << r.docs[i] << ", ";
78
79 s << "\nranks: ";
80 for (i=0; i<r.ranks.size(); i++)
81 s << r.ranks[i] << ", ";
82
83 s << "\ntermFreqs: ";
84 for (i=0; i<r.termFreqs.size(); i++)
85 s << r.termFreqs[i] << ", ";
86 s << "\n\n";
87
88 return s;
89}
90
91
92bool operator== (const QueryResult &r1, const QueryResult &r2) {
93 return ((r1.docs == r2.docs) &&
94 (r1.ranks == r2.ranks) &&
95 (r1.termFreqs == r2.termFreqs));
96}
97
98
99void FragData::Clear () {
100 matchDocs = 0;
101 fragNums.erase (fragNums.begin(), fragNums.end());
102 fragFreqs.erase (fragFreqs.begin(), fragFreqs.end());
103}
104
105
106
107
// Collect into equivWords the word numbers that should be treated as
// equivalent to `term` under the given stemming method.
//
// stemMethod == 0: exact lookup in the word-level blocked dictionary;
//   equivWords receives at most one entry.
// stemMethod 1-3: the term is stemmed with the stemmer configured for
//   that stem index (sih1/sih2/sih3) and the precomputed equivalence
//   class stored in the corresponding stem dictionary is returned.
void FindWordNumbers (IndexData &indexData,
		      const UCArray &term,
		      unsigned long stemMethod,
		      vector<unsigned long> &equivWords) {
  // start from an empty result list
  equivWords.erase (equivWords.begin(), equivWords.end());
  
  if (stemMethod == 0) {
    // don't need to stem the word,
    // find the word number for this term
    unsigned long wordElNum = 0;
    unsigned long numLevels = indexData.bdh.num_levels;
    word_block_dict_el wordDictEl;
    wordDictEl.SetNumLevels (numLevels);
    if (SearchWordBlockDictEl (indexData.dictFile, indexData.biWords,
			       indexData.bdh.entries_per_wblk,
			       indexData.bdh.word_dict_size,
			       numLevels, term, wordDictEl, wordElNum))
      equivWords.push_back (wordElNum);
    
    return;
    
  }
  
  
  // need to stem this word and find it in the blocked stem index
  
  unsigned char mgWord[MAXSTEMLEN + 1];
  UCArray stemTerm;
  unsigned long stemmerNum = 0;
  
  // each stem index may have been built with a different stemmer
  if (stemMethod == 1) stemmerNum = indexData.sih1.stemmer_num;
  else if (stemMethod == 2) stemmerNum = indexData.sih2.stemmer_num;
  else if (stemMethod == 3) stemmerNum = indexData.sih3.stemmer_num;
  
  
  // convert the word to an "mg word" (length byte followed by the text)
  // NOTE(review): assumes term.size() <= MAXSTEMLEN -- confirm callers
  // guarantee this, otherwise mgWord overflows
  mgWord[0] = term.size();
  bcopy ((char *)term.begin(), (char *)&mgWord[1], term.size());
  
  // stem the word
  stemmer (stemMethod, stemmerNum, mgWord);
  
  // convert the result back to a UCArray
  stemTerm.insert (stemTerm.end(), &mgWord[1], &mgWord[1] + mgWord[0]);
  
  // need to look up this term in the appropriate dictionary
  stem_block_dict_el stemDictEl;
  unsigned long stemElNum;
  if (stemMethod == 1) {
    SearchStemBlockDictEl (indexData.stem1File,
			   indexData.sii1,
			   indexData.sih1.entries_per_block,
			   indexData.sih1.dict_size,
			   stemTerm,
			   stemDictEl,
			   stemElNum);
    
  } else if (stemMethod == 2) {
    SearchStemBlockDictEl (indexData.stem2File,
			   indexData.sii2,
			   indexData.sih2.entries_per_block,
			   indexData.sih2.dict_size,
			   stemTerm,
			   stemDictEl,
			   stemElNum);
    
  } else if (stemMethod == 3) {
    SearchStemBlockDictEl (indexData.stem3File,
			   indexData.sii3,
			   indexData.sih3.entries_per_block,
			   indexData.sih3.dict_size,
			   stemTerm,
			   stemDictEl,
			   stemElNum);
  }
  
  // NOTE(review): the result of SearchStemBlockDictEl is ignored here;
  // if the lookup fails (or stemMethod > 3) this copies whatever a
  // default-constructed stemDictEl holds -- verify that state is empty
  equivWords = stemDictEl.equivWords;
}
186
187
188
// Decode the inverted-file entry for word number termNum into fragData.
//
// needFragFreqs: also record the within-fragment term frequency for
//   each fragment that is kept.
// fragLimits: if non-NULL, only fragments lying inside one of the given
//   ranges are kept (rangeStart excluded, rangeEnd included -- see the
//   test below).
//
// If the dictionary lookup fails, fragData is simply left cleared.
void ReadTermFragData (IndexData &indexData,
		       bool needFragFreqs,
		       unsigned long termNum,
		       FragData &fragData,
		       FragRangeArray *fragLimits) {
  fragData.Clear();

  // look up the word in the dictionary
  unsigned long numLevels = indexData.bdh.num_levels;
  word_block_dict_el wordDictEl;
  wordDictEl.SetNumLevels (numLevels);
  if (!SearchWordBlockDictElNum (indexData.dictFile,
				 indexData.biWords,
				 indexData.bdh.entries_per_wblk,
				 indexData.bdh.word_dict_size,
				 numLevels,
				 termNum, wordDictEl))
    return; // nothing more to do

  // number of matching documents at the current retrieval level
  fragData.matchDocs = wordDictEl.levelFreqs[indexData.curLevelNum];
  
  // seek to the appropriate place in the inverted file
  fseek (indexData.invfFile, wordDictEl.invf_ptr, SEEK_SET);
  stdio_bitio_buffer buffer (indexData.invfFile);

  // Bblock parameter for decoding the gap-coded fragment numbers
  unsigned long B = BIO_Bblock_Init (indexData.bdh.num_frags,
				     wordDictEl.frag_occur);
  unsigned long fragNum = 0;
  unsigned long termFreq = 0;

  unsigned long fragLimitI = 0;
  unsigned long i;
  for (i=0; i<wordDictEl.frag_occur; i++) {
    // fragment numbers are stored as deltas (gaps) from the previous one
    fragNum += buffer.bblock_decode (B, NULL);
    // a word-level index carries no stored frequency: every entry is one hit
    if (!indexData.ifh.word_level_index) termFreq = buffer.gamma_decode (NULL);
    else termFreq = 1;

    // get the right fragment range
    if (fragLimits != NULL) {
      while (fragLimitI+1 < (*fragLimits).size() &&
	     fragNum > (*fragLimits)[fragLimitI+1].rangeStart) {
	fragLimitI++;
      }
    }

    // add the entry if it is within the limits
    if ((fragLimits == NULL) ||
	(fragLimitI < (*fragLimits).size() &&
	 fragNum > (*fragLimits)[fragLimitI].rangeStart &&
	 fragNum <= (*fragLimits)[fragLimitI].rangeEnd)) {
      fragData.fragNums.push_back (fragNum);
      if (needFragFreqs)
	fragData.fragFreqs.push_back (termFreq);
    }
  }
	
  // finished with the bit buffer (note: decoding must consume the entry
  // in exactly the order it was written, which is why all frag_occur
  // entries are decoded even when fragLimits filters most of them out)
  buffer.done();
}
247
248
249void CombineFragData (bool needFragFreqs,
250 const FragData &f1,
251 const FragData &f2,
252 FragData &outFragData) {
253 outFragData.Clear();
254
255 // the new number of matching documents is the maximum
256 // of the two input matching number of documents -- it
257 // is assumed that these are at the same document level
258 outFragData.matchDocs = (f1.matchDocs > f2.matchDocs) ?
259 f1.matchDocs : f2.matchDocs;
260
261 // do or
262 unsigned long f1I = 0, f1Size = f1.fragNums.size();
263 unsigned long f2I = 0, f2Size = f2.fragNums.size();
264 while (f1I < f1Size || f2I < f2Size) {
265 if (f2I < f2Size &&
266 (f1I >= f1Size ||
267 f1.fragNums[f1I] > f2.fragNums[f2I])) {
268 // output f2I
269 outFragData.fragNums.push_back (f2.fragNums[f2I]);
270 if (needFragFreqs)
271 outFragData.fragFreqs.push_back (f2.fragFreqs[f2I]);
272 f2I++;
273
274 } else if (f1I < f1Size &&
275 (f2I >= f2Size ||
276 f1.fragNums[f1I] < f2.fragNums[f2I])) {
277 // output f1I
278 outFragData.fragNums.push_back (f1.fragNums[f1I]);
279 if (needFragFreqs)
280 outFragData.fragFreqs.push_back (f1.fragFreqs[f1I]);
281 f1I++;
282
283 } else {
284 // must be equal combine f1I and f2I
285 outFragData.fragNums.push_back (f1.fragNums[f1I]);
286 if (needFragFreqs)
287 outFragData.fragFreqs.push_back (f1.fragFreqs[f1I]+f2.fragFreqs[f2I]);
288 f1I++;
289 f2I++;
290 }
291 }
292}
293
294
295void AndCombineFragData (bool needFragFreqs,
296 FragData &fragData,
297 const FragData &comFragData,
298 signed long startRange,
299 signed long endRange,
300 const FragRangeArray *fragLimits) {
301 // sanity check on range
302 if (startRange > endRange) {
303 signed long temp = endRange;
304 endRange = startRange;
305 startRange = temp;
306 }
307
308 // get min matchdocs
309 if (comFragData.matchDocs < fragData.matchDocs)
310 fragData.matchDocs = comFragData.matchDocs;
311
312 unsigned long fragDataI = 0;
313 unsigned long fragDataSize = fragData.fragNums.size();
314 unsigned long comFragDataI = 0;
315 unsigned long comFragDataSize = comFragData.fragNums.size();
316 unsigned long fragLimitI = 0;
317 unsigned long fragLimitSize = (fragLimits==NULL) ? 0 : (*fragLimits).size();
318 unsigned long outI = 0;
319
320 // increase comFragDataI until resulting range is positive
321 if (startRange < 0) {
322 while (comFragDataI < comFragDataSize &&
323 ((signed long)comFragData.fragNums[comFragDataI]+startRange < 0)) {
324 comFragDataI++;
325 }
326 }
327
328 while (fragDataI < fragDataSize &&
329 comFragDataI < comFragDataSize) {
330 unsigned long fragNum = fragData.fragNums[fragDataI];
331 unsigned long comFragNum = comFragData.fragNums[comFragDataI];
332
333 // go to the right fragment limit (for the com frag)
334 if (fragLimits != NULL) {
335 while (fragLimitI+1 < fragLimitSize &&
336 comFragNum > (*fragLimits)[fragLimitI+1].rangeStart) {
337 fragLimitI++;
338 }
339 }
340
341 if (fragNum <= comFragNum+startRange ||
342 (fragLimits!=NULL && fragNum<=(*fragLimits)[fragLimitI].rangeStart)) {
343 fragDataI++;
344
345 } else if (fragNum > comFragNum+endRange ||
346 (fragLimits!=NULL && fragNum>(*fragLimits)[fragLimitI].rangeEnd)) {
347 comFragDataI++;
348
349 } else {
350 // equal and within tag
351 fragData.fragNums[outI] = comFragNum;
352 if (needFragFreqs) {
353 fragData.fragFreqs[outI] =
354 (fragData.fragFreqs[fragDataI] < comFragData.fragFreqs[comFragDataI]) ?
355 fragData.fragFreqs[fragDataI] : comFragData.fragFreqs[comFragDataI];
356 }
357 fragDataI++;
358 comFragDataI++;
359 comFragDataI++;
360 }
361 }
362
363 // erase unused part of fragData
364 fragData.fragNums.erase (fragData.fragNums.begin()+outI,
365 fragData.fragNums.end());
366 if (needFragFreqs)
367 fragData.fragFreqs.erase (fragData.fragFreqs.begin()+outI,
368 fragData.fragFreqs.end());
369 else
370 fragData.fragFreqs.erase (fragData.fragFreqs.begin(),
371 fragData.fragFreqs.end());
372}
373
374
// Convert the fragment-level hits for one term into a document-level
// QueryResult.  Runs of fragments mapping to the same document (at the
// current retrieval level) are merged into a single docs[] entry; when
// ranking is required the document's rank is Wqt * Wdt, with
// Wqt = termWeight * log(N/ft) and Wdt = fdt * log(N/ft) -- a TF-IDF
// style weight, fdt being the summed fragment frequencies for the doc.
//
// tag/term/stemMethod are only used to fill in the TermFreqData record
// appended when queryInfo.needTermFreqs is set.
//
// NOTE(review): termData.matchDocs == 0 would make the log() below
// divide by zero -- presumably callers only pass terms that matched;
// confirm.
void FragsToQueryResult (IndexData &indexData,
			 const QueryInfo &queryInfo,
			 const FragData &termData,
			 const UCArray &tag,
			 const UCArray &term,
			 unsigned long stemMethod,
			 unsigned long termWeight,
			 QueryResult &result) {
  bool needRanks = (queryInfo.sortByRank || queryInfo.needRankInfo);
  
  result.Clear();

  // log (N / ft)
  unsigned long N = indexData.levels.levelInfo[indexData.curLevel].numEntries;
  float wordLog = log((double)N / (double)termData.matchDocs);

  // Wqt = fqt * log (N / ft)
  // note: terms are allowed to have a weight of zero so
  // they can be excluded from the ranking
  float Wqt = termWeight * wordLog;

  // Wdt = fdt * log (N / ft)
  float Wdt;
  
  unsigned long termDataI = 0;
  unsigned long termDataSize = termData.fragNums.size();
  unsigned long levelDocNum = 0;
  
  // running fragment-frequency total for the document being accumulated;
  // lastLevelDocNum == 0 means "no document accumulated yet"
  unsigned long termDocFreq = 0;
  unsigned long lastLevelDocNum = 0;
  
  
  while (termDataI < termDataSize) {
    // map this fragment to its enclosing document at the current level;
    // fragments that do not map are skipped (but still consume termDataI)
    if (indexData.levelConverter.FragToLevel (termData.fragNums[termDataI],
					      levelDocNum)) {
      if (levelDocNum != lastLevelDocNum) {
	if (lastLevelDocNum > 0) {
	  // add this doc information
	  if (needRanks) {
	    Wdt = termDocFreq * wordLog;
	    result.ranks.push_back (Wqt * Wdt);
	  }
	  result.docs.push_back (lastLevelDocNum);
	}
	
	lastLevelDocNum = levelDocNum;
	termDocFreq = 0;
      }

      if (needRanks)
	termDocFreq += termData.fragFreqs[termDataI];
    }
    
    termDataI++;
  }

  if (lastLevelDocNum > 0) {
    // add the last document information
    if (needRanks) {
      Wdt = termDocFreq * wordLog;
      result.ranks.push_back (Wqt * Wdt);
    }
    result.docs.push_back (lastLevelDocNum);
  }

  // add the term frequency information
  if (queryInfo.needTermFreqs) {
    TermFreqData termFreqData;
    termFreqData.tag = tag;
    termFreqData.term = term;
    termFreqData.stemMethod = stemMethod;
    termFreqData.matchDocs = termData.matchDocs;
    result.termFreqs.push_back (termFreqData);
  }
}
450
// AND the fragment-level hits for one term into an existing result:
// only documents already in `result` that also contain this term
// survive.  Surviving entries are compacted to the front of
// result.docs/result.ranks (resultOutI is the write cursor) and the
// unused tail is erased; surviving ranks get this term's Wqt * Wdt
// contribution added.  Relies on result.docs and the document numbers
// derived from termData both being in ascending order.
//
// NOTE(review): as in FragsToQueryResult, termData.matchDocs == 0
// would divide by zero in the log() -- confirm callers exclude it.
void AndFragsToQueryResult (IndexData &indexData,
			    const QueryInfo &queryInfo,
			    const FragData &termData,
			    const UCArray &tag,
			    const UCArray &term,
			    unsigned long stemMethod,
			    unsigned long termWeight,
			    QueryResult &result) {
  bool needRanks = (queryInfo.sortByRank || queryInfo.needRankInfo);
  
  // log (N / ft)
  float wordLog =
    log((double)indexData.levels.levelInfo[indexData.curLevel].numEntries/
	(double)termData.matchDocs);

  // Wqt = fqt * log (N / ft)
  // note: terms are allowed to have a weight of zero so
  // they can be excluded from the ranking
  float Wqt = termWeight * wordLog;

  // Wdt = fdt * log (N / ft)
  float Wdt;
  
  unsigned long termDataI = 0;
  unsigned long termDataSize = termData.fragNums.size();
  unsigned long levelDocNum = 0;
  
  // running fragment-frequency total for the document being accumulated;
  // lastLevelDocNum == 0 means "no document accumulated yet"
  unsigned long termDocFreq = 0;
  unsigned long lastLevelDocNum = 0;

  // read cursor, size, and write cursor for the in-place compaction
  unsigned long resultI = 0;
  unsigned long resultSize = result.docs.size();
  unsigned long resultOutI = 0;
  
  
  while (termDataI < termDataSize) {
    // map this fragment to its enclosing document at the current level
    if (indexData.levelConverter.FragToLevel (termData.fragNums[termDataI],
					      levelDocNum)) {
      if (levelDocNum != lastLevelDocNum) {
	if (lastLevelDocNum > 0) {
	  // add this doc information
	  Wdt = termDocFreq * wordLog;
	  
	  // find this document number
	  while (resultI < resultSize &&
		 result.docs[resultI] < lastLevelDocNum)
	    resultI++;
	  
	  // store the result
	  if (resultI < resultSize && result.docs[resultI] == lastLevelDocNum) {
	    result.docs[resultOutI] = lastLevelDocNum;
	    if (needRanks)
	      result.ranks[resultOutI] = result.ranks[resultI] + Wqt * Wdt;
	    resultI++;
	    resultOutI++;
	  }
	}
	
	lastLevelDocNum = levelDocNum;
	termDocFreq = 0;
      }

      if (needRanks)
	termDocFreq += termData.fragFreqs[termDataI];
    }
    
    termDataI++;
  }

  if (lastLevelDocNum > 0) {
    // add the last document information
    Wdt = termDocFreq * wordLog;
    
    // find this document number
    while (resultI < resultSize &&
	   result.docs[resultI] < lastLevelDocNum)
      resultI++;
    
    // store the result
    if (resultI < resultSize && result.docs[resultI] == lastLevelDocNum) {
      result.docs[resultOutI] = lastLevelDocNum;
      if (needRanks)
	result.ranks[resultOutI] = result.ranks[resultI] + Wqt * Wdt;
      resultI++;
      resultOutI++;
    }
  }

  // remove unneeded entries: docs/ranks beyond the compacted region,
  // and all ranks when ranking was not requested
  result.docs.erase (result.docs.begin()+resultOutI, result.docs.end());
  if (needRanks)
    result.ranks.erase (result.ranks.begin()+resultOutI, result.ranks.end());
  else
    result.ranks.erase (result.ranks.begin(), result.ranks.end());
  
  // add the term frequency information
  if (queryInfo.needTermFreqs) {
    TermFreqData termFreqData;
    termFreqData.tag = tag;
    termFreqData.term = term;
    termFreqData.stemMethod = stemMethod;
    termFreqData.matchDocs = termData.matchDocs;
    result.termFreqs.push_back (termFreqData);
  }
}
556
557
// Filter `result` in place, keeping only the documents that contain at
// least one fragment listed in termData.  Ranks are carried through
// unchanged for surviving documents; no new rank contribution is added
// (unlike AndFragsToQueryResult).  Relies on result.docs and the
// document numbers derived from termData both being in ascending order.
void RemoveUnwantedResults (IndexData &indexData,
			    const QueryInfo &queryInfo,
			    const FragData &termData,
			    QueryResult &result) {
  bool needRanks = (queryInfo.sortByRank || queryInfo.needRankInfo);
  
  unsigned long termDataI = 0;
  unsigned long termDataSize = termData.fragNums.size();
  unsigned long levelDocNum = 0;
  
  // lastLevelDocNum == 0 means "no document seen yet"
  unsigned long lastLevelDocNum = 0;

  // read cursor, size, and write cursor for the in-place compaction
  unsigned long resultI = 0;
  unsigned long resultSize = result.docs.size();
  unsigned long resultOutI = 0;
  
  while (termDataI < termDataSize) {
    // map this fragment to its enclosing document at the current level
    if (indexData.levelConverter.FragToLevel (termData.fragNums[termDataI],
					      levelDocNum)) {
      if (levelDocNum != lastLevelDocNum) {
	if (lastLevelDocNum > 0) {
	  // find this document number
	  while (resultI < resultSize &&
		 result.docs[resultI] < lastLevelDocNum)
	    resultI++;
	  
	  // store the result
	  if (resultI < resultSize && result.docs[resultI] == lastLevelDocNum) {
	    result.docs[resultOutI] = lastLevelDocNum;
	    if (needRanks)
	      result.ranks[resultOutI] = result.ranks[resultI];
	    resultI++;
	    resultOutI++;
	  }
	}
	
	lastLevelDocNum = levelDocNum;
      }
    }
    
    termDataI++;
  }

  if (lastLevelDocNum > 0) {
    // process the last document seen
    // find this document number
    while (resultI < resultSize &&
	   result.docs[resultI] < lastLevelDocNum)
      resultI++;
    
    // store the result
    if (resultI < resultSize && result.docs[resultI] == lastLevelDocNum) {
      result.docs[resultOutI] = lastLevelDocNum;
      if (needRanks)
	result.ranks[resultOutI] = result.ranks[resultI];
      resultI++;
      resultOutI++;
    }
  }

  // remove unneeded entries: everything beyond the compacted region,
  // and all ranks when ranking was not requested
  result.docs.erase (result.docs.begin()+resultOutI, result.docs.end());
  if (needRanks)
    result.ranks.erase (result.ranks.begin()+resultOutI, result.ranks.end());
  else
    result.ranks.erase (result.ranks.begin(), result.ranks.end());
}
Note: See TracBrowser for help on using the repository browser.