Changeset 29581 for main/trunk/greenstone2
- Timestamp: 2014-12-11T14:34:58+13:00 (9 years ago)
- Files: 1 edited
Legend:
- Unmodified
- Added
- Removed
main/trunk/greenstone2/common-src/indexers/mgpp/text/Terms.cpp
r26138 r29581 360 360 outFragData.matchDocs = (f1.matchDocs > f2.matchDocs) ? 361 361 f1.matchDocs : f2.matchDocs; 362 363 362 // do or 364 363 mg_u_long f1I = 0, f1Size = f1.fragNums.size(); 365 364 mg_u_long f2I = 0, f2Size = f2.fragNums.size(); 365 366 366 while (f1I < f1Size || f2I < f2Size) { 367 367 if (f2I < f2Size && … … 484 484 // log (N / ft) 485 485 mg_u_long N = indexData.levels.levelInfo[indexData.curLevel].numEntries; 486 float wordLog = log((double)N / (double)termData.matchDocs); 486 // termData.matchDocs is not accurate - its just the largest docfreq out of the list of equiv terms. We'll delay calculating ranks until after we have worked out exactly how many docs we have 487 //float wordLog = log((double)N / (double)termData.matchDocs); 487 488 488 489 // Wqt = fqt * log (N / ft) 489 490 // note: terms are allowed to have a weight of zero so 490 491 // they can be excluded from the ranking 491 float Wqt = termWeight * wordLog;492 //float Wqt = termWeight * wordLog; 492 493 493 494 // Wdt = fdt * log (N / ft) 494 float Wdt; 495 495 //float Wdt; 496 mg_u_long actual_num_match_docs = 0; 497 vector<mg_u_long> docFreqsArray; 498 496 499 mg_u_long termDataI = 0; 497 500 mg_u_long termDataSize = termData.fragNums.size(); … … 509 512 // add this doc information 510 513 if (needRanks) { 511 Wdt = termDocFreq * wordLog; 512 result.ranks.push_back (Wqt * Wdt); 514 //Wdt = termDocFreq * wordLog; 515 //result.ranks.push_back (Wqt * Wdt); 516 docFreqsArray.push_back(termDocFreq); 513 517 } 514 518 result.docs.push_back (lastLevelDocNum); 519 ++actual_num_match_docs; 515 520 } 516 521 … … 530 535 // add the last document information 531 536 if (needRanks) { 532 Wdt = termDocFreq * wordLog; 533 result.ranks.push_back (Wqt * Wdt); 537 //Wdt = termDocFreq * wordLog; 538 //result.ranks.push_back (Wqt * Wdt); 539 docFreqsArray.push_back(termDocFreq); 534 540 } 535 541 result.docs.push_back (lastLevelDocNum); 542 ++actual_num_match_docs; 543 } 544 // Now that we know 
the actual number of docs containing this term, we can calculate ranks 545 float wordLog = log((double)N / (double)actual_num_match_docs); 546 float Wqt = termWeight * wordLog; 547 float factor = wordLog * Wqt; 548 549 mg_u_long docFreqI = 0; 550 mg_u_long docFreqSize = docFreqsArray.size(); 551 552 while (docFreqI < docFreqSize) { 553 result.ranks.push_back(docFreqsArray[docFreqI]*factor); 554 ++docFreqI; 536 555 } 537 556 … … 543 562 termFreqData.stemMethod = stemMethod; 544 563 termFreqData.equivTerms = equivTerms; 545 termFreqData.matchDocs = termData.matchDocs; 564 //termFreqData.matchDocs = termData.matchDocs; 565 termFreqData.matchDocs = actual_num_match_docs; 546 566 termFreqData.termFreq = overallwordfreq; // will be zero if needRankInfo 547 567 //not true … … 585 605 mg_u_long resultOutI = 0; 586 606 607 mg_u_long actual_num_term_match_docs = 0; 587 608 588 609 while (termDataI < termDataSize) { … … 591 612 if (levelDocNum != lastLevelDocNum) { 592 613 if (lastLevelDocNum > 0) { 593 // add this doc information 614 ++actual_num_term_match_docs; 615 594 616 Wdt = termDocFreq * wordLog; 595 617 … … 622 644 623 645 if (lastLevelDocNum > 0) { 646 ++actual_num_term_match_docs; 624 647 // add the last document information 625 648 Wdt = termDocFreq * wordLog; … … 654 677 termFreqData.stemMethod = stemMethod; 655 678 termFreqData.equivTerms = equivTerms; 656 termFreqData.matchDocs = termData.matchDocs; 679 //termFreqData.matchDocs = termData.matchDocs; 680 termFreqData.matchDocs = actual_num_term_match_docs; 657 681 termFreqData.termFreq = overallwordfreq; 658 682 result.termFreqs.push_back (termFreqData);
Note: See TracChangeset for help on using the changeset viewer.