Changeset 16947


Ignore:
Timestamp:
2008-08-21T15:13:41+12:00 (16 years ago)
Author:
mdewsnip
Message:

Changed the Lucene code to use the Greenstone document OIDs directly, instead of creating its own numeric IDs and then mapping them to the Greenstone OIDs in the GDBM file. As well as being simpler and more space- and speed-efficient (the mapping no longer needs to be stored in the GDBM file, and no lookup needs to be done for each search result), this is another important step along the road to true incremental building.

Files:
5 edited

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/runtime-src/src/colservr/expat_resultset.cpp

    r16915 r16947  
    9191
    9292    char* id = get_attribute(attributes,"id");
    93     if (id!=NULL) {
    94       int id_num = atoi(id);
    95    
     93    if (id != NULL)
     94    {
    9695      docresultclass doc;
    9796      doc.clear();
    98       doc.docnum = id_num;
     97      doc.docid = id;
    9998      doc.docweight = qrpack_ptr->match_count;
    10099
     
    105104      }
    106105
    107       queryresult_ptr->docs.docset[doc.docnum] = doc;
    108       queryresult_ptr->docs.docorder.push_back(doc.docnum);
     106      queryresult_ptr->docs.docset[doc.docid] = doc;
     107      queryresult_ptr->docs.docorder.push_back(doc.docid);
    109108      ++qrpack_ptr->match_count;
    110109
  • gsdl/trunk/runtime-src/src/colservr/lucenequeryfilter.cpp

    r16915 r16947  
    163163
    164164  // assemble document results
    165   if (need_matching_docs (request.filterResultOptions)) {
    166    
     165  if (need_matching_docs (request.filterResultOptions))
     166  {
     167    // Loop through the query results (ordered by ranking)
    167168    int resultnum = 1;
    168     ResultDocInfo_t resultdoc;
    169     text_t trans_OID;
    170     vector<text_t>::iterator docorder_here = queryresults.docs.docorder.begin();
    171     vector<text_t>::iterator docorder_end = queryresults.docs.docorder.end();
    172 
    173     // Now handled by Lucene directly
    174     //if (endresults == -1) endresults = MAXNUMDOCS;
    175 
    176     while (docorder_here != docorder_end)
     169    vector<text_t>::iterator docorder_iterator = queryresults.docs.docorder.begin();
     170    while (docorder_iterator != queryresults.docs.docorder.end())
     171    {
     172      text_t doc_OID = (*docorder_iterator);
     173      // logout << "Matching doc OID: " << doc_OID << endl;
     174
     175      // Make sure this result is in the docset, and either in the request set or the request set is empty
     176      docresultmap::iterator doc_result = queryresults.docs.docset.find (doc_OID);
     177      if (doc_result != queryresults.docs.docset.end() && (request.docSet.empty() || in_set(request.docSet, doc_OID)))
    177178      {
    178         // Now handled by Lucene directly
    179         //if (resultnum > endresults) break;
    180      
    181         // translate the document number
    182         if (!translate(db_ptr, *docorder_here, trans_OID))
    183           {
    184             logout << text_t2ascii
    185                    << "warning: could not translate lucene document number \""
    186                    << *docorder_here << "\" to OID.\n\n";
    187            
    188           }
    189         else
    190           {
    191             docresultmap::iterator docset_here = queryresults.docs.docset.find (*docorder_here);
    192 
    193             // see if there is a result for this number,
    194             // if it is in the request set (or the request set is empty)
    195             if (docset_here != queryresults.docs.docset.end() && (request.docSet.empty() || in_set(request.docSet, trans_OID)))
    196               {
    197                 // Now handled by Lucene directly
    198                 //if (resultnum >= startresults) {
    199 
    200                 // add this document
    201                 resultdoc.OID = trans_OID;
    202                 resultdoc.result_num = resultnum;
    203                 resultdoc.ranking = (int)((*docset_here).second.docweight * 10000.0 + 0.5);
    204                 resultdoc.num_terms_matched = (*docset_here).second.num_query_terms_matched;
    205                
    206                 response.docInfo.push_back (resultdoc);
    207                 //}
    208                 ++resultnum;
    209               }
    210           } // else
    211        
    212         ++docorder_here;
    213       }
    214   } // if need matching docs
     179    // Add the matching document
     180    ResultDocInfo_t resultdoc;
     181    resultdoc.OID = doc_OID;
     182    resultdoc.result_num = resultnum;
     183    resultdoc.ranking = (int)((*doc_result).second.docweight * 10000.0 + 0.5);
     184    resultdoc.num_terms_matched = (*doc_result).second.num_query_terms_matched;
     185    response.docInfo.push_back (resultdoc);
     186
     187    resultnum++;
     188      }
     189
     190      docorder_iterator++;
     191    }
     192  }
    215193 
    216194  // assemble the term results
  • gsdl/trunk/runtime-src/src/colservr/queryinfo.cpp

    r12868 r16947  
    203203
    204204void docresultclass::clear () {
     205  docid="";
    205206  docnum=-1;
    206207  docweight=0.0;
     
    219220
    220221docresultclass &docresultclass::operator=(const docresultclass &d) {
     222  docid = d.docid;
    221223  docnum = d.docnum;
    222224  docweight = d.docweight;
     
    229231
    230232bool operator==(const docresultclass &x, const docresultclass &y) {
    231   return ((x.docnum == y.docnum) && (x.docweight == y.docweight) &&
     233  return ((x.docid == y.docid) && (x.docnum == y.docnum) && (x.docweight == y.docweight) &&
    232234      (x.num_query_terms_matched == y.num_query_terms_matched) &&
    233235      (x.num_phrase_match == y.num_phrase_match));
     
    235237
    236238bool operator<(const docresultclass &x, const docresultclass &y) {
    237   return ((x.docnum < y.docnum) ||
    238       ((x.docnum == y.docnum) &&
    239        ((x.docweight < y.docweight) ||
    240         ((x.docweight == y.docweight) &&
    241          ((x.num_query_terms_matched < y.num_query_terms_matched) ||
    242           ((x.num_query_terms_matched == y.num_query_terms_matched) &&
    243            ((x.num_phrase_match < y.num_phrase_match))))))));
     239  return ((x.docid < y.docid) ||
     240      ((x.docid == y.docid) &&
     241       ((x.docnum < y.docnum) ||
     242        ((x.docnum == y.docnum) &&
     243         ((x.docweight < y.docweight) ||
     244          ((x.docweight == y.docweight) &&
     245           ((x.num_query_terms_matched < y.num_query_terms_matched) ||
     246        ((x.num_query_terms_matched == y.num_query_terms_matched) &&
     247         ((x.num_phrase_match < y.num_phrase_match))))))))));
    244248}
    245249
  • gsdl/trunk/runtime-src/src/colservr/queryinfo.h

    r16445 r16947  
    115115
    116116struct docresultclass {
     117  text_t docid;  // currently used by Lucene only
    117118  int docnum;
    118119  float docweight;
  • indexers/trunk/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneQuery.java

    r16912 r16947  
    203203            Document doc = hits.doc(i - 1);
    204204            int doc_term_freq = ((Integer) doc_term_freq_map.get(new Integer(lucene_doc_num))).intValue();
    205             lucene_query_result.addDoc(doc.get("nodeID").trim(), hits.score(i-1), doc_term_freq);
     205            lucene_query_result.addDoc(doc.get("docOID").trim(), hits.score(i-1), doc_term_freq);
    206206        }
    207207        }
     
    221221            Document doc = reader.document(lucene_doc_num);
    222222            int doc_term_freq = ((Integer) doc_term_freq_map.get(new Integer(lucene_doc_num))).intValue();
    223             lucene_query_result.addDoc(doc.get("nodeID").trim(), hits.scoreDocs[i-1].score, doc_term_freq);
     223            lucene_query_result.addDoc(doc.get("docOID").trim(), hits.scoreDocs[i-1].score, doc_term_freq);
    224224        }
    225225        }
Note: See TracChangeset for help on using the changeset viewer.