/**********************************************************************
 *
 * lucenequeryfilter.cpp --
 * Copyright (C) 1999  The New Zealand Digital Library Project
 *
 * A component of the Greenstone digital library software
 * from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *********************************************************************/

#include "lucenequeryfilter.h"
#include "fileutil.h"
#include "lucenesearch.h"


// Registers the lucene-specific filter options (SortField, SortOrder,
// Fuzziness and FilterString) on top of whatever the fielded query
// filter base class sets up.
lucenequeryfilterclass::lucenequeryfilterclass ()
  : fieldedqueryfilterclass() {

  FilterOption_t filtopt;

  // -- onePerQuery SortField, enumerated, used to list available sorting fields
  // (the valid values are filled in later by configure("indexsortfields"))
  filtopt.clear();
  filtopt.name = "SortField";
  filtopt.type = FilterOption_t::enumeratedt;
  filtopt.repeatable = FilterOption_t::onePerQuery;
  filtopt.defaultValue = "";
  filterOptions["SortField"] = filtopt;

  // -- onePerQuery SortOrder, enumerated ("ascending" or "descending")
  filtopt.clear();
  filtopt.name = "SortOrder";
  filtopt.type = FilterOption_t::enumeratedt;
  filtopt.repeatable = FilterOption_t::onePerQuery;
  filtopt.defaultValue = "ascending";
  filtopt.validValues.push_back("ascending");
  filtopt.validValues.push_back("descending");
  filterOptions["SortOrder"] = filtopt;

  // -- onePerQuery Fuzziness string 0.0-1.0
  filtopt.clear();
  filtopt.name = "Fuzziness";
  filtopt.type = FilterOption_t::stringt;
  filtopt.repeatable = FilterOption_t::onePerQuery;
  filtopt.defaultValue = "";
  filterOptions["Fuzziness"] = filtopt;

  // -- onePerQuery FilterString string
  filtopt.clear();
  filtopt.name = "FilterString";
  filtopt.type = FilterOption_t::stringt;
  filtopt.repeatable = FilterOption_t::onePerQuery;
  filtopt.defaultValue = "";
  filterOptions["FilterString"] = filtopt;
}

lucenequeryfilterclass::~lucenequeryfilterclass () {
}

// Handles one configuration line. The line is always passed on to the
// base class first; the keys handled here in addition are:
//   textlevel         - forwarded to the lucene search object
//   indexsortfieldmap - imported into sortfieldmap
//   indexsortfields   - replaces the valid values of the SortField option
//   defaultsortfield  - mapped through sortfieldmap into the SortField default
void lucenequeryfilterclass::configure (const text_t &key, const text_tarray &cfgline) {
  fieldedqueryfilterclass::configure(key, cfgline);

  if (key == "textlevel") {
    // an empty line carries no level; indexing cfgline[0] would be out of range
    if (!cfgline.empty()) {
      ((lucenesearchclass *)textsearchptr)->set_text_level(cfgline[0]);
    }

  } else if (key == "indexsortfieldmap") {
    sortfieldmap.importmap (cfgline);

  } else if (key == "indexsortfields") {
    FilterOption_t &sortfieldopt = filterOptions["SortField"];
    sortfieldopt.validValues.clear();

    text_tarray::const_iterator here = cfgline.begin();
    text_tarray::const_iterator end = cfgline.end();
    while (here != end) {
      // empty entries are skipped
      if (!(*here).empty()) {
        sortfieldopt.validValues.push_back(*here);
      }
      ++here;
    }

  } else if (key == "defaultsortfield") {
    // an empty line names no field; indexing cfgline[0] would be out of range
    if (!cfgline.empty()) {
      sortfieldmap.from2to (cfgline[0], filterOptions["SortField"].defaultValue);
    }
  }
}

// Initialises the base class and, if no default sort field has been
// configured, falls back to the first valid sort field (when it is non-empty).
// Returns false if the base class initialisation fails.
bool lucenequeryfilterclass::init (ostream &logout) {
  if (!fieldedqueryfilterclass::init(logout)) {
    return false;
  }

  FilterOption_t &sortfieldopt = filterOptions["SortField"];
  if (sortfieldopt.defaultValue.empty() &&
      !sortfieldopt.validValues.empty() &&
      !sortfieldopt.validValues[0].empty()) {
    sortfieldopt.defaultValue = sortfieldopt.validValues[0];
  }

  return true;
}

// Fills in the query parameters from the option defaults: the base class
// defaults first, then the lucene-specific ones.
void lucenequeryfilterclass::set_queryparam_defaults(queryparamclass &query) {
  fieldedqueryfilterclass::set_queryparam_defaults(query);

  query.filterstring = filterOptions["FilterString"].defaultValue;
  query.sortfield = filterOptions["SortField"].defaultValue;
  // sortorder is set when the default is "descending", cleared otherwise
  query.sortorder = (filterOptions["SortOrder"].defaultValue == "descending");
  query.fuzziness = filterOptions["Fuzziness"].defaultValue;
}

// Applies a single request option to the query parameters. Returns true
// if the option is one of the lucene-specific ones handled here; any
// other option is handed to the base class and its result is returned.
bool lucenequeryfilterclass::set_queryparam_field(const OptionValue_t &option, queryparamclass &query) {
  if (option.name == "FilterString") {
    query.filterstring = option.value;
    return true;
  }
  if (option.name == "SortField") {
    query.sortfield = option.value;
    return true;
  }
  if (option.name == "SortOrder") {
    query.sortorder = (option.value == "descending");
    return true;
  }
  if (option.name == "Fuzziness") {
    query.fuzziness = option.value;
    return true;
  }

  return fieldedqueryfilterclass::set_queryparam_field(option, query);
}

// Runs the query described by request and fills in response.
//
// err is set to configurationError if db_ptr or textsearchptr is NULL,
// to systemProblem if the database cannot be opened, or to whatever
// do_multi_query reports; in all of those cases the function returns
// early. Full text browse requests are delegated to browsefilter().
// The database is closed again on every path that opened it.
void lucenequeryfilterclass::filter(const FilterRequest_t &request,
                                    FilterResponse_t &response,
                                    comerror_t &err, ostream &logout) {
  outconvertclass text_t2ascii;

  response.clear ();
  err = noError;

  if (db_ptr == NULL) {
    // most likely a configuration problem
    logout << text_t2ascii
           << "configuration error: queryfilter contains a null dbclass\n\n";
    err = configurationError;
    return;
  }
  if (textsearchptr == NULL) {
    // most likely a configuration problem
    logout << text_t2ascii
           << "configuration error: queryfilter contains a null textsearchclass (lucene)\n\n";
    err = configurationError;
    return;
  }

  if (full_text_browse(request.filterResultOptions)) {
    browsefilter(request, response, err, logout);
    return;
  }

  // open the database
  db_ptr->setlogout(&logout);
  if (!db_ptr->opendatabase (db_filename, DB_READER, 100, false)) {
    // most likely a system problem (we have already checked that the database exists)
    logout << text_t2ascii
           << "system problem: open on database \""
           << db_filename << "\" failed\n\n";
    err = systemProblem;
    return;
  }

  // get the query parameters
  int startresults, endresults;
  vector<queryparamclass> queryfilterparams;
  parse_query_params (request, queryfilterparams, startresults, endresults, logout);

  // do query
  queryresultsclass queryresults;
  do_multi_query (request, queryfilterparams, queryresults, err, logout);
  response.error_message = queryresults.error_message;
  if (err != noError) {
    // the database was opened above; it must not be left open on the
    // error path either
    db_ptr->closedatabase();
    return;
  }

  // assemble document results
  if (need_matching_docs (request.filterResultOptions)) {
    // Loop through the query results (ordered by ranking)
    int resultnum = 1;
    const size_t numordered = queryresults.docs.docorder.size();
    for (size_t i = 0; i < numordered; ++i) {
      text_t doc_OID = queryresults.docs.docorder[i];
      // logout << "Matching doc OID: " << doc_OID << endl;

      // Make sure this result is in the docset, and either in the request
      // set or the request set is empty
      docresultmap::iterator doc_result = queryresults.docs.docset.find (doc_OID);
      if (doc_result != queryresults.docs.docset.end() &&
          (request.docSet.empty() || in_set(request.docSet, doc_OID))) {
        // Add the matching document
        ResultDocInfo_t resultdoc;
        resultdoc.OID = doc_OID;
        resultdoc.result_num = resultnum;
        // the weight is scaled by 10000 and rounded to an integer ranking
        resultdoc.ranking = (int)((*doc_result).second.docweight * 10000.0 + 0.5);
        resultdoc.num_terms_matched = (*doc_result).second.num_query_terms_matched;
        response.docInfo.push_back (resultdoc);

        ++resultnum;
      }
    }
  }

  // assemble the term results
  if (need_term_info(request.filterResultOptions)) {
    // note: the terms have already been sorted and uniqued - ?? have they??

    TermInfo_t terminfo;
    bool terms_first = true;

    termfreqclassarray::iterator terms_here = queryresults.terms.begin();
    termfreqclassarray::iterator terms_end = queryresults.terms.end();

    while (terms_here != terms_end) {
      terminfo.clear();
      terminfo.term = (*terms_here).termstr;
      terminfo.freq = (*terms_here).termfreq;

      // lucene doesn't return any termvariants at this stage,
      // so make sure the original term is set
      terminfo.matchTerms.push_back(terminfo.term);

      // this bit gets the matchTerms ie the equivalent (stem/casefold) terms;
      // the collected variants are attached to the first term only
      if (terms_first) {
        text_tset::iterator termvariants_here = queryresults.termvariants.begin();
        text_tset::iterator termvariants_end = queryresults.termvariants.end();
        while (termvariants_here != termvariants_end) {
          terminfo.matchTerms.push_back (*termvariants_here);
          ++termvariants_here;
        }
      }
      terms_first = false;

      response.termInfo.push_back (terminfo);

      ++terms_here;
    }

    // add the stop words
    text_tset::iterator stopwords_here = queryresults.stopwords.begin();
    text_tset::iterator stopwords_end = queryresults.stopwords.end();
    while (stopwords_here != stopwords_end) {
      response.stopwords.insert(*stopwords_here);
      ++stopwords_here;
    }
  }

  db_ptr->closedatabase();  // Important that local library doesn't leave any files open
  response.numDocs = queryresults.docs_matched;
  response.isApprox = queryresults.is_approx;
}

// Performs a full text browse using the first set of parsed query
// parameters and copies the resulting terms (term string and frequency)
// into response.termInfo. err is set to systemProblem if the browse
// search fails.
//
// NOTE(review): the first element of the parsed parameters is
// dereferenced without an emptiness check - this assumes
// parse_query_params always yields at least one query; confirm.
void lucenequeryfilterclass::browsefilter(const FilterRequest_t &request,
                                          FilterResponse_t &response,
                                          comerror_t &err, ostream &logout) {
  outconvertclass text_t2ascii;

  // get the query parameters
  int startresults, endresults;
  vector<queryparamclass> queryfilterparams;
  parse_query_params (request, queryfilterparams, startresults, endresults, logout);

  vector<queryparamclass>::const_iterator query_here = queryfilterparams.begin();

  // do query
  queryresultsclass queryresults;
  queryresults.clear();

  int numDocs = endresults - startresults;
  textsearchptr->setcollectdir (collectdir);

  if (!((lucenesearchclass*)textsearchptr)->browse_search((*query_here), startresults, numDocs, queryresults)) {
    // most likely a system problem
    logout << text_t2ascii
           << "system problem: could not do full text browse with lucene for index \""
           << (*query_here).index << (*query_here).subcollection
           << (*query_here).language << "\".\n\n";
    err = systemProblem;
    return;
  }

  // assemble the term results
  TermInfo_t terminfo;

  termfreqclassarray::iterator terms_here = queryresults.terms.begin();
  termfreqclassarray::iterator terms_end = queryresults.terms.end();

  while (terms_here != terms_end) {
    terminfo.clear();
    terminfo.term = (*terms_here).termstr;
    terminfo.freq = (*terms_here).termfreq;

    response.termInfo.push_back (terminfo);

    ++terms_here;
  }
}

// Runs every query in query_params and merges the results into
// multiresults. err is set to systemProblem if a search fails and to
// syntaxError if a query string is reported as invalid; the function
// returns immediately in either case.
//
// textsearchptr (the lucene search object) and db_ptr are assumed to be valid
void lucenequeryfilterclass::do_multi_query (const FilterRequest_t &request,
                                             const vector<queryparamclass> &query_params,
                                             queryresultsclass &multiresults,
                                             comerror_t &err, ostream &logout) {
  outconvertclass text_t2ascii;

  err = noError;
  textsearchptr->setcollectdir (collectdir);
  multiresults.clear();

  vector<queryparamclass>::const_iterator query_here = query_params.begin();
  vector<queryparamclass>::const_iterator query_end = query_params.end();
  while (query_here != query_end) {
    queryresultsclass thisqueryresults;

    if (!textsearchptr->search((*query_here), thisqueryresults)) {
      // most likely a system problem
      logout << text_t2ascii
             << "system problem: could not do search with lucene for index \""
             << (*query_here).index << (*query_here).level
             << (*query_here).subcollection
             << (*query_here).language << "\".\n\n";
      err = systemProblem;
      return;
    }

    // check for syntax error
    if (thisqueryresults.syntax_error==true) {
      logout << text_t2ascii
             << "syntax problem: invalid query string \""
             << (*query_here).querystring<<"\".\n";
      err = syntaxError;
      return;
    }

    // combine the results
    if (need_matching_docs (request.filterResultOptions)) {

      if (query_params.size() == 1) {
        // just one set of results: take it over as it is
        multiresults.error_message = thisqueryresults.error_message;
        multiresults.docs = thisqueryresults.docs;
        multiresults.docs_matched = thisqueryresults.docs_matched;
        multiresults.is_approx = thisqueryresults.is_approx;

      } else {
        // several queries: merge according to this query's combine operator
        // (any other operator value leaves the accumulated docs untouched)
        if ((*query_here).combinequery == "and") {
          multiresults.docs.combine_and (thisqueryresults.docs);
        } else if ((*query_here).combinequery == "or") {
          multiresults.docs.combine_or (thisqueryresults.docs);
        } else if ((*query_here).combinequery == "not") {
          multiresults.docs.combine_not (thisqueryresults.docs);
        }
        multiresults.docs_matched = multiresults.docs.docset.size();
        multiresults.is_approx = Exact;
      }
    }

    // combine the term information
    if (need_term_info (request.filterResultOptions)) {
      // append the terms
      multiresults.orgterms.insert(multiresults.orgterms.end(),
                                   thisqueryresults.orgterms.begin(),
                                   thisqueryresults.orgterms.end());

      // add the term variants
      text_tset::iterator termvar_here = thisqueryresults.termvariants.begin();
      text_tset::iterator termvar_end = thisqueryresults.termvariants.end();
      while (termvar_here != termvar_end) {
        multiresults.termvariants.insert(*termvar_here);
        ++termvar_here;
      }

      // add the stop words
      text_tset::iterator stopwords_here = thisqueryresults.stopwords.begin();
      text_tset::iterator stopwords_end = thisqueryresults.stopwords.end();
      while (stopwords_here != stopwords_end) {
        multiresults.stopwords.insert(*stopwords_here);
        ++stopwords_here;
      }
    }

    ++query_here;
  }

  // sort and unique the query terms
  multiresults.sortuniqqueryterms ();
}