Timestamp: 2014-12-11T14:34:58+13:00
Author: kjdon
Message:

In gs2mgppdemo, a query for "government" was coming back with totalMatchDocs 127, but the term info said 'government' was found in only 108 docs. The cause: when generating the list of word numbers for "government", the code looks up the equivalent terms (arising from casefolding, stemming, etc.), of which there are two, "government" and "Government". It gets the list of word positions for each one and merges the lists. When you fetch the word positions for a term, you also get back the number of docs/secs that match that term: "Government" had 42 and "government" had 108. The merging code simply takes the larger of the two numbers, i.e. 108, as the total match docs. Later on, this figure is used as the total number of matching documents, both in the ranking calculation and in the info reported with the query result.
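
To see why the larger of the two figures is only a lower bound, here is a minimal standalone sketch (toy doc numbers and std::set_union standing in for the actual mgpp merge in Terms.cpp): the union of the two equivalent terms' doc lists can contain more distinct docs than either list alone, and its true size is only known once the merge has run.

    #include <algorithm>
    #include <cstdio>
    #include <iterator>
    #include <vector>

    int main() {
      // Toy sorted doc-number lists for two equivalent terms
      // (hypothetical data, standing in for "Government" / "government").
      std::vector<unsigned long> gov_upper = {3, 8, 21};          // matchDocs = 3
      std::vector<unsigned long> gov_lower = {1, 8, 13, 21, 34};  // matchDocs = 5

      // Merge the two sorted lists into their union of distinct docs.
      std::vector<unsigned long> merged;
      std::set_union(gov_upper.begin(), gov_upper.end(),
                     gov_lower.begin(), gov_lower.end(),
                     std::back_inserter(merged));

      // Taking the larger matchDocs (5) undercounts: the union has 6 docs.
      std::printf("max(matchDocs) = %zu, actual union = %zu\n",
                  std::max(gov_upper.size(), gov_lower.size()), merged.size());
      return 0;
    }
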
I have added a new variable, actual_num_match_docs, which we increment as we go through the word position lists and generate doc/sec numbers; this is the point at which we actually know how many matches we have. In FragsToQueryResult, instead of calculating ranks as we generate each doc num, I now just store the doc term freq; once we know the actual number of matching docs, we can calculate the term weight and query term weight and generate the ranks. I still need to modify AndFragsToQueryResult similarly: it currently calculates actual_num_match_docs and uses it in the query result, but it does not yet use it for the rank generation.
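
The shape of the deferred rank calculation, as a small self-contained sketch (simplified types and made-up numbers; the variable names mirror the patch below, the rest is illustrative): buffer each matching doc's term frequency while walking the merged lists, then compute wordLog, Wqt, and the ranks once actual_num_match_docs is known.

    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
      // Illustrative values (not from the demo collection): N docs/secs at
      // the current level, a query term weight, and the per-doc term
      // frequencies (fdt) buffered while walking the merged position lists.
      double N = 1000.0;
      double termWeight = 1.0;
      std::vector<unsigned long> docFreqsArray = {2, 5, 1};

      // Only after the walk do we know ft, the true number of matching docs.
      unsigned long actual_num_match_docs = docFreqsArray.size();

      // Wqt = fqt * log(N / ft), Wdt = fdt * log(N / ft),
      // so rank = Wqt * Wdt = fdt * (wordLog * Wqt).
      double wordLog = std::log(N / (double)actual_num_match_docs);
      double Wqt = termWeight * wordLog;
      double factor = wordLog * Wqt;

      for (unsigned long fdt : docFreqsArray)
        std::printf("rank = %f\n", (double)fdt * factor);
      return 0;
    }
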

File: 1 edited

Legend (diff shown in unified format below):

  ' ' Unmodified
  '+' Added
  '-' Removed
  • main/trunk/greenstone2/common-src/indexers/mgpp/text/Terms.cpp

main/trunk/greenstone2/common-src/indexers/mgpp/text/Terms.cpp (r26138 → r29581):

@@ -360,8 +360,8 @@
   outFragData.matchDocs = (f1.matchDocs > f2.matchDocs) ?
     f1.matchDocs : f2.matchDocs;
-
   // do or
   mg_u_long f1I = 0, f1Size = f1.fragNums.size();
   mg_u_long f2I = 0, f2Size = f2.fragNums.size();
+
   while (f1I < f1Size || f2I < f2Size) {
     if (f2I < f2Size &&

@@ -484,14 +484,17 @@
   // log (N / ft)
   mg_u_long N = indexData.levels.levelInfo[indexData.curLevel].numEntries;
-  float wordLog = log((double)N / (double)termData.matchDocs);
+  // termData.matchDocs is not accurate - its just the largest docfreq out of the list of equiv terms. We'll delay calculating ranks until after we have worked out exactly how many docs we have
+  //float wordLog = log((double)N / (double)termData.matchDocs);
 
   // Wqt = fqt * log (N / ft)
   // note: terms are allowed to have a weight of zero so
   // they can be excluded from the ranking
-  float Wqt = termWeight * wordLog;
+  //float Wqt = termWeight * wordLog;
 
   // Wdt = fdt * log (N / ft)
-  float Wdt;
-
+  //float Wdt;
+  mg_u_long actual_num_match_docs = 0;
+  vector<mg_u_long> docFreqsArray;
+
   mg_u_long termDataI = 0;
   mg_u_long termDataSize = termData.fragNums.size();

@@ -509,8 +512,10 @@
       // add this doc information
       if (needRanks) {
-        Wdt = termDocFreq * wordLog;
-        result.ranks.push_back (Wqt * Wdt);
+        //Wdt = termDocFreq * wordLog;
+        //result.ranks.push_back (Wqt * Wdt);
+        docFreqsArray.push_back(termDocFreq);
       }
       result.docs.push_back (lastLevelDocNum);
+      ++actual_num_match_docs;
     }
 

@@ -530,8 +535,22 @@
     // add the last document information
     if (needRanks) {
-      Wdt = termDocFreq * wordLog;
-      result.ranks.push_back (Wqt * Wdt);
+      //Wdt = termDocFreq * wordLog;
+      //result.ranks.push_back (Wqt * Wdt);
+      docFreqsArray.push_back(termDocFreq);
     }
     result.docs.push_back (lastLevelDocNum);
+    ++actual_num_match_docs;
+  }
+  // Now that we know the actual number of docs containing this term, we can calculate ranks
+  float wordLog = log((double)N / (double)actual_num_match_docs);
+  float Wqt = termWeight * wordLog;
+  float factor = wordLog * Wqt;
+
+  mg_u_long docFreqI = 0;
+  mg_u_long docFreqSize = docFreqsArray.size();
+
+  while (docFreqI < docFreqSize) {
+    result.ranks.push_back(docFreqsArray[docFreqI]*factor);
+    ++docFreqI;
   }
 

@@ -543,5 +562,6 @@
     termFreqData.stemMethod = stemMethod;
     termFreqData.equivTerms = equivTerms;
-    termFreqData.matchDocs = termData.matchDocs;
+    //termFreqData.matchDocs = termData.matchDocs;
+    termFreqData.matchDocs = actual_num_match_docs;
     termFreqData.termFreq = overallwordfreq; // will be zero if needRankInfo
                                              //not true

@@ -585,4 +605,5 @@
   mg_u_long resultOutI = 0;
 
+  mg_u_long actual_num_term_match_docs = 0;
 
   while (termDataI < termDataSize) {

@@ -591,5 +612,6 @@
       if (levelDocNum != lastLevelDocNum) {
     if (lastLevelDocNum > 0) {
-      // add this doc information
+      ++actual_num_term_match_docs;
+
       Wdt = termDocFreq * wordLog;
 

@@ -622,4 +644,5 @@
 
   if (lastLevelDocNum > 0) {
+    ++actual_num_term_match_docs;
     // add the last document information
     Wdt = termDocFreq * wordLog;

@@ -654,5 +677,6 @@
     termFreqData.stemMethod = stemMethod;
     termFreqData.equivTerms = equivTerms;
-    termFreqData.matchDocs = termData.matchDocs;
+    //termFreqData.matchDocs = termData.matchDocs;
+    termFreqData.matchDocs = actual_num_term_match_docs;
     termFreqData.termFreq = overallwordfreq;
     result.termFreqs.push_back (termFreqData);