Changeset 16947

Show
Ignore:
Timestamp:
21.08.2008 15:13:41 (11 years ago)
Author:
mdewsnip
Message:

Changed the Lucene code to use the Greenstone document OIDs directly, instead of creating its own numeric IDs and then mapping them to the Greenstone OIDs in the GDBM file. As well as being simpler and more space- and speed-efficient (the mapping no longer needs to be stored in the GDBM file, and no lookup needs to be done for each search result), this is another important step along the road to true incremental building.

Files:
5 modified

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/runtime-src/src/colservr/expat_resultset.cpp

    r16915 r16947  
    9191 
    9292    char* id = get_attribute(attributes,"id"); 
    93     if (id!=NULL) { 
    94       int id_num = atoi(id); 
    95      
     93    if (id != NULL) 
     94    { 
    9695      docresultclass doc; 
    9796      doc.clear(); 
    98       doc.docnum = id_num; 
     97      doc.docid = id; 
    9998      doc.docweight = qrpack_ptr->match_count; 
    10099 
     
    105104      } 
    106105 
    107       queryresult_ptr->docs.docset[doc.docnum] = doc; 
    108       queryresult_ptr->docs.docorder.push_back(doc.docnum); 
     106      queryresult_ptr->docs.docset[doc.docid] = doc; 
     107      queryresult_ptr->docs.docorder.push_back(doc.docid); 
    109108      ++qrpack_ptr->match_count; 
    110109 
  • gsdl/trunk/runtime-src/src/colservr/lucenequeryfilter.cpp

    r16915 r16947  
    163163 
    164164  // assemble document results 
    165   if (need_matching_docs (request.filterResultOptions)) { 
    166      
     165  if (need_matching_docs (request.filterResultOptions)) 
     166  { 
     167    // Loop through the query results (ordered by ranking) 
    167168    int resultnum = 1; 
    168     ResultDocInfo_t resultdoc; 
    169     text_t trans_OID; 
    170     vector<text_t>::iterator docorder_here = queryresults.docs.docorder.begin(); 
    171     vector<text_t>::iterator docorder_end = queryresults.docs.docorder.end(); 
    172  
    173     // Now handled by Lucene directly 
    174     //if (endresults == -1) endresults = MAXNUMDOCS; 
    175  
    176     while (docorder_here != docorder_end) 
     169    vector<text_t>::iterator docorder_iterator = queryresults.docs.docorder.begin(); 
     170    while (docorder_iterator != queryresults.docs.docorder.end()) 
     171    { 
     172      text_t doc_OID = (*docorder_iterator); 
     173      // logout << "Matching doc OID: " << doc_OID << endl; 
     174 
     175      // Make sure this result is in the docset, and either in the request set or the request set is empty 
     176      docresultmap::iterator doc_result = queryresults.docs.docset.find (doc_OID); 
     177      if (doc_result != queryresults.docs.docset.end() && (request.docSet.empty() || in_set(request.docSet, doc_OID))) 
    177178      { 
    178         // Now handled by Lucene directly 
    179         //if (resultnum > endresults) break; 
    180        
    181         // translate the document number 
    182         if (!translate(db_ptr, *docorder_here, trans_OID)) 
    183           { 
    184             logout << text_t2ascii 
    185                    << "warning: could not translate lucene document number \"" 
    186                    << *docorder_here << "\" to OID.\n\n"; 
    187              
    188           }  
    189         else  
    190           { 
    191             docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here); 
    192  
    193             // see if there is a result for this number, 
    194             // if it is in the request set (or the request set is empty) 
    195             if (docset_here != queryresults.docs.docset.end() && (request.docSet.empty() || in_set(request.docSet, trans_OID))) 
    196               { 
    197                 // Now handled by Lucene directly 
    198                 //if (resultnum >= startresults) { 
    199  
    200                 // add this document 
    201                 resultdoc.OID = trans_OID; 
    202                 resultdoc.result_num = resultnum; 
    203                 resultdoc.ranking = (int)((*docset_here).second.docweight * 10000.0 + 0.5); 
    204                 resultdoc.num_terms_matched = (*docset_here).second.num_query_terms_matched; 
    205                  
    206                 response.docInfo.push_back (resultdoc); 
    207                 //} 
    208                 ++resultnum; 
    209               } 
    210           } // else 
    211          
    212         ++docorder_here; 
    213       } 
    214   } // if need matching docs 
     179    // Add the matching document 
     180    ResultDocInfo_t resultdoc; 
     181    resultdoc.OID = doc_OID; 
     182    resultdoc.result_num = resultnum; 
     183    resultdoc.ranking = (int)((*doc_result).second.docweight * 10000.0 + 0.5); 
     184    resultdoc.num_terms_matched = (*doc_result).second.num_query_terms_matched; 
     185    response.docInfo.push_back (resultdoc); 
     186 
     187    resultnum++; 
     188      } 
     189 
     190      docorder_iterator++; 
     191    } 
     192  } 
    215193   
    216194  // assemble the term results 
  • gsdl/trunk/runtime-src/src/colservr/queryinfo.cpp

    r12868 r16947  
    203203 
    204204void docresultclass::clear () { 
     205  docid=""; 
    205206  docnum=-1; 
    206207  docweight=0.0; 
     
    219220 
    220221docresultclass &docresultclass::operator=(const docresultclass &d) { 
     222  docid = d.docid; 
    221223  docnum = d.docnum; 
    222224  docweight = d.docweight; 
     
    229231 
    230232bool operator==(const docresultclass &x, const docresultclass &y) { 
    231   return ((x.docnum == y.docnum) && (x.docweight == y.docweight) && 
     233  return ((x.docid == y.docid) && (x.docnum == y.docnum) && (x.docweight == y.docweight) && 
    232234      (x.num_query_terms_matched == y.num_query_terms_matched) && 
    233235      (x.num_phrase_match == y.num_phrase_match)); 
     
    235237 
    236238bool operator<(const docresultclass &x, const docresultclass &y) { 
    237   return ((x.docnum < y.docnum) || 
    238       ((x.docnum == y.docnum) && 
    239        ((x.docweight < y.docweight) || 
    240         ((x.docweight == y.docweight) && 
    241          ((x.num_query_terms_matched < y.num_query_terms_matched) || 
    242           ((x.num_query_terms_matched == y.num_query_terms_matched) && 
    243            ((x.num_phrase_match < y.num_phrase_match)))))))); 
     239  return ((x.docid < y.docid) || 
     240      ((x.docid == y.docid) && 
     241       ((x.docnum < y.docnum) || 
     242        ((x.docnum == y.docnum) && 
     243         ((x.docweight < y.docweight) || 
     244          ((x.docweight == y.docweight) && 
     245           ((x.num_query_terms_matched < y.num_query_terms_matched) || 
     246        ((x.num_query_terms_matched == y.num_query_terms_matched) && 
     247         ((x.num_phrase_match < y.num_phrase_match)))))))))); 
    244248} 
    245249 
  • gsdl/trunk/runtime-src/src/colservr/queryinfo.h

    r16445 r16947  
    115115 
    116116struct docresultclass { 
     117  text_t docid;  // currently used by Lucene only 
    117118  int docnum; 
    118119  float docweight; 
  • indexers/trunk/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneQuery.java

    r16912 r16947  
    203203            Document doc = hits.doc(i - 1); 
    204204            int doc_term_freq = ((Integer) doc_term_freq_map.get(new Integer(lucene_doc_num))).intValue(); 
    205             lucene_query_result.addDoc(doc.get("nodeID").trim(), hits.score(i-1), doc_term_freq); 
     205            lucene_query_result.addDoc(doc.get("docOID").trim(), hits.score(i-1), doc_term_freq); 
    206206        } 
    207207        } 
     
    221221            Document doc = reader.document(lucene_doc_num); 
    222222            int doc_term_freq = ((Integer) doc_term_freq_map.get(new Integer(lucene_doc_num))).intValue(); 
    223             lucene_query_result.addDoc(doc.get("nodeID").trim(), hits.scoreDocs[i-1].score, doc_term_freq); 
     223            lucene_query_result.addDoc(doc.get("docOID").trim(), hits.scoreDocs[i-1].score, doc_term_freq); 
    224224        } 
    225225        }