Ignore:
Timestamp:
2014-12-15T09:57:01+13:00 (9 years ago)
Author:
kjdon
Message:

updating AndFragsToQueryResult to use actual num docs in weight generation

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/common-src/indexers/mgpp/text/Terms.cpp

    r29581 r29629  
    582582 
    583583  // log (N / ft)
    584   float wordLog =
    585     log((double)indexData.levels.levelInfo[indexData.curLevel].numEntries/
    586     (double)termData.matchDocs);
     584  //float wordLog =
     585  //    log((double)indexData.levels.levelInfo[indexData.curLevel].numEntries/
     586  //    (double)termData.matchDocs);
    587587
    588588  // Wqt = fqt * log (N / ft)
    589589  // note: terms are allowed to have a weight of zero so
    590590  // they can be excluded from the ranking
    591   float Wqt = termWeight * wordLog;
     591  //float Wqt = termWeight * wordLog;
    592592
    593593  // Wdt = fdt * log (N / ft)
     
    605605  mg_u_long resultOutI = 0;
    606606 
    607   mg_u_long actual_num_term_match_docs = 0;
    608  
     607  mg_u_long actual_num_match_docs = 0;
     608  vector<mg_u_long> docFreqsArray;
    609609  while (termDataI < termDataSize) {
    610610    if (indexData.levelConverter.FragToLevel (termData.fragNums[termDataI],
     
    612612      if (levelDocNum != lastLevelDocNum) {
    613613    if (lastLevelDocNum > 0) {
    614       ++actual_num_term_match_docs;
    615 
    616       Wdt = termDocFreq * wordLog;
     614      ++actual_num_match_docs;
     615
     616      //Wdt = termDocFreq * wordLog;
    617617     
    618618      // find this document number
     
    624624      if (resultI < resultSize && result.docs[resultI] == lastLevelDocNum) {
    625625        result.docs[resultOutI] = lastLevelDocNum;
    626         if (needRanks)
    627           result.ranks[resultOutI] = result.ranks[resultI] + Wqt * Wdt;
     626        if (needRanks) {
     627          // store the doc freq so we can calculate the rank for the new term
     628          // once we know the num docs
     629          docFreqsArray.push_back(termDocFreq);
     630          // just store the old rank for now, and we'll add on the new bit at the end
     631          result.ranks[resultOutI] = result.ranks[resultI]; // + Wqt * Wdt;
     632        }
    628633        ++resultI;
    629634        ++resultOutI;
     
    635640      }
    636641
    637       if (needRanks)
     642      if (needRanks) {
    638643    termDocFreq += termData.fragFreqs[termDataI];
    639      overallwordfreq += termData.fragFreqs[termDataI];
     644      }
     645      overallwordfreq += termData.fragFreqs[termDataI];
    640646    }
    641647   
     
    644650
    645651  if (lastLevelDocNum > 0) {
    646     ++actual_num_term_match_docs;
     652    ++actual_num_match_docs;
    647653    // add the last document information
    648     Wdt = termDocFreq * wordLog;
     654    //Wdt = termDocFreq * wordLog;
    649655
    650656    // find this document number
     
    656662    if (resultI < resultSize && result.docs[resultI] == lastLevelDocNum) {
    657663      result.docs[resultOutI] = lastLevelDocNum;
    658       if (needRanks)
    659     result.ranks[resultOutI] = result.ranks[resultI] + Wqt * Wdt;
     664      if (needRanks) {
     665    // store the doc freq so we can calculate the rank for the new term
     666    // once we know the num docs
     667    docFreqsArray.push_back(termDocFreq);
     668    // just store the old rank for now, and we'll add on the new bit at the end
     669    result.ranks[resultOutI] = result.ranks[resultI]; // + Wqt * Wdt;
     670      }
    660671      ++resultI;
    661672      ++resultOutI;
     
    669680  else
    670681    result.ranks.erase (result.ranks.begin(), result.ranks.end());
     682 
     683  // Calculate correct ranks
     684  float wordLog = log((double)indexData.levels.levelInfo[indexData.curLevel].numEntries / (double)actual_num_match_docs);
     685  float Wqt = termWeight * wordLog;
     686  float factor = wordLog * Wqt;
     687
     688  mg_u_long docFreqI = 0;
     689  mg_u_long docFreqSize = docFreqsArray.size();
     690 
     691  while (docFreqI < docFreqSize) {
     692    result.ranks[docFreqI] = result.ranks[docFreqI] + docFreqsArray[docFreqI]*factor;
     693    ++docFreqI;
     694  }
    671695 
    672696  // add the term frequency information
     
    678702    termFreqData.equivTerms = equivTerms;
    679703    //termFreqData.matchDocs = termData.matchDocs;
    680     termFreqData.matchDocs = actual_num_term_match_docs;
     704    termFreqData.matchDocs = actual_num_match_docs;
    681705    termFreqData.termFreq = overallwordfreq;
    682706    result.termFreqs.push_back (termFreqData);
Note: See TracChangeset for help on using the changeset viewer.