Changeset 29581

Show
Ignore:
Timestamp:
11.12.2014 14:34:58 (5 years ago)
Author:
kjdon
Message:

in gs2mgppdemo, a query for 'government' was coming back with totalMatchDocs 127, but in term info, it said 'government' was found in 108 docs. This is because when generating the list of word nums for 'government', it looks up the equivalent terms (due to casefolding, stemming etc.) and there are 2: 'government' and 'Government'. It gets the list of word positions for each one and merges the lists. When you get the list of word positions, you also get back the number of docs/secs that match the word. 'Government' had 42, and 'government' had 108. The merging code says that for total match docs we'll just take the larger number, i.e. 108. Later on, this figure is used as the total number of matching documents for the ranking calculation, and for the info in the query result.
I have added a new variable, actual_num_match_docs, which we increment as we go through the word position lists and generate doc/sec numbers. This is the point at which we actually know how many matches we have. For FragsToQueryResult, instead of calculating ranks as we generate each doc num, I am just storing the doc term freq; then, once we know the actual number, we can calculate the term weight and query term weight to generate the ranks. I still need to modify AndFragsToQueryResult similarly. AndFragsToQueryResult currently calculates actual_num_match_docs and uses it in the query result, but it doesn't yet use it for the rank generation.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/common-src/indexers/mgpp/text/Terms.cpp

    r26138 r29581  
    360360  outFragData.matchDocs = (f1.matchDocs > f2.matchDocs) ? 
    361361    f1.matchDocs : f2.matchDocs; 
    362  
    363362  // do or 
    364363  mg_u_long f1I = 0, f1Size = f1.fragNums.size(); 
    365364  mg_u_long f2I = 0, f2Size = f2.fragNums.size(); 
     365 
    366366  while (f1I < f1Size || f2I < f2Size) { 
    367367    if (f2I < f2Size && 
     
    484484  // log (N / ft) 
    485485  mg_u_long N = indexData.levels.levelInfo[indexData.curLevel].numEntries; 
    486   float wordLog = log((double)N / (double)termData.matchDocs); 
     486  // termData.matchDocs is not accurate - its just the largest docfreq out of the list of equiv terms. We'll delay calculating ranks until after we have worked out exactly how many docs we have 
     487  //float wordLog = log((double)N / (double)termData.matchDocs); 
    487488 
    488489  // Wqt = fqt * log (N / ft) 
    489490  // note: terms are allowed to have a weight of zero so 
    490491  // they can be excluded from the ranking 
    491   float Wqt = termWeight * wordLog; 
     492  //float Wqt = termWeight * wordLog; 
    492493 
    493494  // Wdt = fdt * log (N / ft) 
    494   float Wdt; 
    495    
     495  //float Wdt; 
     496  mg_u_long actual_num_match_docs = 0; 
     497  vector<mg_u_long> docFreqsArray; 
     498 
    496499  mg_u_long termDataI = 0; 
    497500  mg_u_long termDataSize = termData.fragNums.size(); 
     
    509512      // add this doc information 
    510513      if (needRanks) { 
    511         Wdt = termDocFreq * wordLog; 
    512         result.ranks.push_back (Wqt * Wdt); 
     514        //Wdt = termDocFreq * wordLog; 
     515        //result.ranks.push_back (Wqt * Wdt); 
     516        docFreqsArray.push_back(termDocFreq); 
    513517      } 
    514518      result.docs.push_back (lastLevelDocNum); 
     519      ++actual_num_match_docs; 
    515520    } 
    516521     
     
    530535    // add the last document information 
    531536    if (needRanks) { 
    532       Wdt = termDocFreq * wordLog; 
    533       result.ranks.push_back (Wqt * Wdt); 
     537      //Wdt = termDocFreq * wordLog; 
     538      //result.ranks.push_back (Wqt * Wdt); 
     539      docFreqsArray.push_back(termDocFreq); 
    534540    } 
    535541    result.docs.push_back (lastLevelDocNum); 
     542    ++actual_num_match_docs; 
     543  } 
     544  // Now that we know the actual number of docs containing this term, we can calculate ranks 
     545  float wordLog = log((double)N / (double)actual_num_match_docs); 
     546  float Wqt = termWeight * wordLog; 
     547  float factor = wordLog * Wqt; 
     548 
     549  mg_u_long docFreqI = 0; 
     550  mg_u_long docFreqSize = docFreqsArray.size(); 
     551   
     552  while (docFreqI < docFreqSize) { 
     553    result.ranks.push_back(docFreqsArray[docFreqI]*factor); 
     554    ++docFreqI; 
    536555  } 
    537556 
     
    543562    termFreqData.stemMethod = stemMethod; 
    544563    termFreqData.equivTerms = equivTerms; 
    545     termFreqData.matchDocs = termData.matchDocs; 
     564    //termFreqData.matchDocs = termData.matchDocs; 
     565    termFreqData.matchDocs = actual_num_match_docs; 
    546566    termFreqData.termFreq = overallwordfreq; // will be zero if needRankInfo  
    547567                                              //not true 
     
    585605  mg_u_long resultOutI = 0; 
    586606   
     607  mg_u_long actual_num_term_match_docs = 0; 
    587608   
    588609  while (termDataI < termDataSize) { 
     
    591612      if (levelDocNum != lastLevelDocNum) { 
    592613    if (lastLevelDocNum > 0) { 
    593       // add this doc information 
     614      ++actual_num_term_match_docs; 
     615 
    594616      Wdt = termDocFreq * wordLog; 
    595617       
     
    622644 
    623645  if (lastLevelDocNum > 0) { 
     646    ++actual_num_term_match_docs; 
    624647    // add the last document information 
    625648    Wdt = termDocFreq * wordLog; 
     
    654677    termFreqData.stemMethod = stemMethod; 
    655678    termFreqData.equivTerms = equivTerms; 
    656     termFreqData.matchDocs = termData.matchDocs; 
     679    //termFreqData.matchDocs = termData.matchDocs; 
     680    termFreqData.matchDocs = actual_num_term_match_docs; 
    657681    termFreqData.termFreq = overallwordfreq; 
    658682    result.termFreqs.push_back (termFreqData);