/********************************************************************** * * lucenesearch.cpp -- * Copyright (C) 1999-2002 The New Zealand Digital Library Project * * A component of the Greenstone digital library software * from the New Zealand Digital Library Project at the * University of Waikato, New Zealand. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * *********************************************************************/ #if defined(GSDL_USE_OBJECTSPACE) # include #elif defined(GSDL_USE_IOS_H) # include #else # include #endif #include "gsdlconf.h" #include "lucenesearch.h" #include "fileutil.h" #include "queryinfo.h" #include "gsdlunicode.h" #include "sax_resultset.h" static text_t getindexsuffix(const queryparamclass &qp) { text_t indexsuffix = "index"; text_t ind = qp.index; text_t sub = qp.subcollection; text_t lang = qp.language; // collection name not added for Lucene indexsuffix = filename_cat(indexsuffix, ind + sub + lang); return indexsuffix; } //////////////////// // lucenesearch class // //////////////////// lucenesearchclass::lucenesearchclass () : searchclass() { gdbm_level = "Doc"; } lucenesearchclass::~lucenesearchclass () { if (cache != NULL) { delete cache; cache = NULL; } } void lucenesearchclass::set_gdbm_level(const text_t &level) { gdbm_level = level; } bool lucenesearchclass::search(const queryparamclass &queryparams, queryresultsclass &queryresult) { #ifdef __WIN32__ char basepath[]=""; #else char basepath[] = "/"; #endif cerr << "**** in luecen search" << endl; char *indexname = (filename_cat(collectdir, getindexsuffix(queryparams))).getcstr(); // set default stem method from values originally set on prefs page int defaultStemMethod = 0; if (queryparams.casefolding) { defaultStemMethod |= 1; } if (queryparams.stemming) { defaultStemMethod |= 2; } // set default Boolean combiner from all/some setting // if match_mode == 1, ie all, default=1 ie AND // if match_mode == 0, ie some, default=0, ie OR int defaultBoolCombine = 0; if (queryparams.match_mode){ defaultBoolCombine = 1; } char* utf8querystring=to_utf8(queryparams.querystring).getcstr(); cerr << "**** query string = " << utf8querystring << endl; cerr << "***** index name = " << indexname << endl; text_t cmd = "lucene_query.pl "; cmd += indexname + (text_t)" " + to_utf8(queryparams.querystring); FILE *PIN = popen(cmd.getcstr(),"r"); if (PIN==NULL) { cerr << "Error: unable to open pipe to " << cmd << endl; return false; } text_t xml_text = ""; while (!feof(PIN)) { char buffer[256]; int num_bytes = fread(buffer,1,256,PIN); xml_text.appendcarr(buffer,num_bytes); } sax_resultset(xml_text,queryresult); pclose(PIN); return true; /* // use default query info settings - change to reflect user preferences?? QueryInfo queryInfo; SetCStr (queryInfo.docLevel, (queryparams.level.getcstr())); queryInfo.maxDocs = (unsigned long)queryparams.maxdocs; queryInfo.sortByRank = (queryparams.search_type == 1); queryInfo.exactWeights = false; queryInfo.needRankInfo = true; // used for overall term freq as well as ranking queryInfo.needTermFreqs = true; ExtQueryResult queryResult; UCArray queryArray; // greenstone gives us the query encoded in unicode. We want utf8. char* utf8querystring=to_utf8(queryparams.querystring).getcstr(); SetCStr(queryArray, utf8querystring); delete utf8querystring; UCArray level; UCArrayClear(level); //set the level for results SetCStr(level, gdbm_level.getcstr()); // do the query // LuceneQuery(*indexData, queryInfo, queryTree, queryResult, level); // **** // convert ExtQueryResult to queryresultclass queryresult.docs_matched = (int)queryResult.docs.size(); if (queryresult.docs_matched == (int)queryResult.actualNumDocs) { queryresult.is_approx = Exact; } else if (queryresult.docs_matched < (int)queryResult.actualNumDocs) { queryresult.is_approx = MoreThan; } else { queryresult.is_approx = Approximate; } docresultclass doc; for (int i=0; i<(int)queryResult.docs.size(); i++) { doc.clear(); doc.docnum = (int)queryResult.levels[i]; doc.docweight = queryResult.ranks[i]; queryresult.docs.docset[doc.docnum] = doc; queryresult.docs.docorder.push_back(doc.docnum); } // term info termfreqclass term; for (int k=0; k<(int)queryResult.termFreqs.size(); k++) { term.clear(); char* termfreq_cstr=GetCStr(queryResult.termFreqs[k].term); term.termstr = to_uni(termfreq_cstr); delete termfreq_cstr; term.termstemstr = term.termstr; // we don't set term.utf8equivterms ?? - jrm21 term.termfreq = queryResult.termFreqs[k].termFreq; queryresult.terms.push_back(term); queryresult.orgterms.push_back(term); // should this change?? for (int j=0; j<(int)queryResult.termFreqs[k].equivTerms.size(); j++) { char* equivterm_cstr=GetCStr(queryResult.termFreqs[k].equivTerms[j]); queryresult.termvariants.insert(to_uni(equivterm_cstr)); delete equivterm_cstr; } } // clean up delete indexname; return true; */ return false; } bool lucenesearchclass::browse_search(const queryparamclass &queryparams, int start, int numDocs, queryresultsclass &queryresult) { cerr << "**** Not sure what this function does!" << endl; /* #ifdef __WIN32__ char basepath[]=""; #else char basepath[] = "/"; #endif char *indexname = (filename_cat(collectdir, getindexsuffix(queryparams))).getcstr(); UCArray level; UCArrayClear(level); //browse always at top level SetCStr(level, "Doc"); // this name may change. BrowseQueryNode browseNode; browseNode.startPosition = start; browseNode.numTerms = numDocs; BrowseQueryResult browseResult; UCArrayClear(browseNode.term); // greenstone gives us the query encoded in unicode. We want utf8. char* utf8querystring=to_utf8(queryparams.querystring).getcstr(); SetCStr(browseNode.term, utf8querystring); delete utf8querystring; // do the actual query // LuceneBrowseQuery(*indexData, level, browseNode, browseResult); // **** // load results into term info termfreqclass term; for (int i=0; i<(int)browseResult.termFreqs.size(); i++) { term.clear(); char* term_cstr = GetCStr(browseResult.termFreqs[i].term); term.termstr = to_uni(term_cstr); delete term_cstr; term.termstemstr = term.termstr; term.termfreq = browseResult.termFreqs[i].termFreq; queryresult.terms.push_back(term); queryresult.orgterms.push_back(term); } // clean up delete indexname; return true; */ return false; } // the document text for 'docnum' is placed in 'output' // docTargetDocument returns 'true' if it was able to // try to get a document // collection is needed to see if an index from the // collection is loaded. THe default index bits are just there cos // the mg version needs them bool lucenesearchclass::docTargetDocument(const text_t &/*defaultindex*/, const text_t &/*defaultsubcollection*/, const text_t &/*defaultlanguage*/, const text_t &collection, int docnum, text_t &output) { cerr << "**** Should return document text here!" << endl; /* #ifdef __WIN32__ char basepath[]=""; #else char basepath[] = "/"; #endif char *textname = (filename_cat(collectdir, "index", "text", collection)).getcstr();; TextData textdata; if(!textdata.LoadData(basepath, textname)) { cout<<"couldn't load text data\n"<,

, tags //clean up textdata.UnloadData (); delete textname; return true; */ return false; } // used to clear any cached databases for persistent versions of // Greenstone like the Windows local library void lucenesearchclass::unload_database () { }