Changeset 29629

Show
Ignore:
Timestamp:
15.12.2014 09:57:01 (5 years ago)
Author:
kjdon
Message:

updating AndFragsToQueryResult to use actual num docs in weight generation

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/common-src/indexers/mgpp/text/Terms.cpp

    r29581 r29629  
    582582   
    583583  // log (N / ft) 
    584   float wordLog = 
    585     log((double)indexData.levels.levelInfo[indexData.curLevel].numEntries/ 
    586     (double)termData.matchDocs); 
     584  //float wordLog = 
     585  //    log((double)indexData.levels.levelInfo[indexData.curLevel].numEntries/ 
     586  //    (double)termData.matchDocs); 
    587587 
    588588  // Wqt = fqt * log (N / ft) 
    589589  // note: terms are allowed to have a weight of zero so 
    590590  // they can be excluded from the ranking 
    591   float Wqt = termWeight * wordLog; 
     591  //float Wqt = termWeight * wordLog; 
    592592 
    593593  // Wdt = fdt * log (N / ft) 
     
    605605  mg_u_long resultOutI = 0; 
    606606   
    607   mg_u_long actual_num_term_match_docs = 0; 
    608    
     607  mg_u_long actual_num_match_docs = 0; 
     608  vector<mg_u_long> docFreqsArray; 
    609609  while (termDataI < termDataSize) { 
    610610    if (indexData.levelConverter.FragToLevel (termData.fragNums[termDataI], 
     
    612612      if (levelDocNum != lastLevelDocNum) { 
    613613    if (lastLevelDocNum > 0) { 
    614       ++actual_num_term_match_docs; 
    615  
    616       Wdt = termDocFreq * wordLog; 
     614      ++actual_num_match_docs; 
     615 
     616      //Wdt = termDocFreq * wordLog; 
    617617       
    618618      // find this document number 
     
    624624      if (resultI < resultSize && result.docs[resultI] == lastLevelDocNum) { 
    625625        result.docs[resultOutI] = lastLevelDocNum; 
    626         if (needRanks) 
    627           result.ranks[resultOutI] = result.ranks[resultI] + Wqt * Wdt; 
     626        if (needRanks) { 
     627          // store the doc freq so we can calculate the rank for the new term 
     628          // once we know the num docs 
     629          docFreqsArray.push_back(termDocFreq); 
     630          // just store the old rank for now, and we'll add on the new bit at the end 
     631          result.ranks[resultOutI] = result.ranks[resultI]; // + Wqt * Wdt; 
     632        } 
    628633        ++resultI; 
    629634        ++resultOutI; 
     
    635640      } 
    636641 
    637       if (needRanks) 
     642      if (needRanks) { 
    638643    termDocFreq += termData.fragFreqs[termDataI]; 
    639      overallwordfreq += termData.fragFreqs[termDataI];  
     644      } 
     645      overallwordfreq += termData.fragFreqs[termDataI];  
    640646    } 
    641647     
     
    644650 
    645651  if (lastLevelDocNum > 0) { 
    646     ++actual_num_term_match_docs; 
     652    ++actual_num_match_docs; 
    647653    // add the last document information 
    648     Wdt = termDocFreq * wordLog; 
     654    //Wdt = termDocFreq * wordLog; 
    649655 
    650656    // find this document number 
     
    656662    if (resultI < resultSize && result.docs[resultI] == lastLevelDocNum) { 
    657663      result.docs[resultOutI] = lastLevelDocNum; 
    658       if (needRanks) 
    659     result.ranks[resultOutI] = result.ranks[resultI] + Wqt * Wdt; 
     664      if (needRanks) { 
     665    // store the doc freq so we can calculate the rank for the new term 
     666    // once we know the num docs 
     667    docFreqsArray.push_back(termDocFreq); 
     668    // just store the old rank for now, and we'll add on the new bit at the end 
     669    result.ranks[resultOutI] = result.ranks[resultI]; // + Wqt * Wdt; 
     670      } 
    660671      ++resultI; 
    661672      ++resultOutI; 
     
    669680  else 
    670681    result.ranks.erase (result.ranks.begin(), result.ranks.end()); 
     682   
     683  // Calculate correct ranks 
     684  float wordLog = log((double)indexData.levels.levelInfo[indexData.curLevel].numEntries / (double)actual_num_match_docs); 
     685  float Wqt = termWeight * wordLog; 
     686  float factor = wordLog * Wqt; 
     687 
     688  mg_u_long docFreqI = 0; 
     689  mg_u_long docFreqSize = docFreqsArray.size(); 
     690   
     691  while (docFreqI < docFreqSize) { 
     692    result.ranks[docFreqI] = result.ranks[docFreqI] + docFreqsArray[docFreqI]*factor; 
     693    ++docFreqI; 
     694  } 
    671695   
    672696  // add the term frequency information 
     
    678702    termFreqData.equivTerms = equivTerms; 
    679703    //termFreqData.matchDocs = termData.matchDocs; 
    680     termFreqData.matchDocs = actual_num_term_match_docs; 
     704    termFreqData.matchDocs = actual_num_match_docs; 
    681705    termFreqData.termFreq = overallwordfreq; 
    682706    result.termFreqs.push_back (termFreqData);