source: trunk/gsdl/src/colservr/lucenesearch.cpp@ 11162

Last change on this file since 11162 was 11162, checked in by kjdon, 18 years ago

added quotes around the path to lucene_query.pl, in case we are installed under a directory with spaces

  • Property svn:keywords set to Author Date Id Revision
File size: 9.4 KB
Line 
1/**********************************************************************
2 *
3 * lucenesearch.cpp --
4 * Copyright (C) 1999-2002 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26
27#if defined(GSDL_USE_OBJECTSPACE)
28# include <ospace\std\iostream>
29#elif defined(GSDL_USE_IOS_H)
30# include <iostream.h>
31#else
32# include <iostream>
33#endif
34
35#include <stdio.h>
36#include <time.h>
37
38#include "gsdlconf.h"
39#include "gsdltools.h"
40#include "lucenesearch.h"
41#include "fileutil.h"
42#include "queryinfo.h"
43#include "gsdlunicode.h"
44
45#include "expat_resultset.h"
46
47
48text_t lucenesearchclass::getindexsuffix(const queryparamclass &qp) {
49 text_t indexsuffix = "index";
50 // get the first char of the level to be the start of the index name
51 text_t suffix = substr(qp.level.begin(), qp.level.begin()+1);
52 lc(suffix);
53 text_t ind = qp.index;
54 text_t sub = qp.subcollection;
55 text_t lang = qp.language;
56
57 // collection name not added for Lucene
58 indexsuffix = filename_cat(indexsuffix, suffix +ind + sub + lang);
59 return indexsuffix;
60
61}
62
63////////////////////
64// lucenesearch class //
65////////////////////
66
67lucenesearchclass::lucenesearchclass ()
68 : searchclass() {
69
70 gdbm_level = "Doc";
71}
72
73lucenesearchclass::~lucenesearchclass ()
74{
75 if (cache != NULL)
76 {
77 delete cache;
78 cache = NULL;
79 }
80}
81
82void lucenesearchclass::set_gdbm_level(const text_t &level) {
83 gdbm_level = level;
84
85}
86
87
88bool lucenesearchclass::search(const queryparamclass &queryparams,
89 queryresultsclass &queryresult) {
90
91#ifdef __WIN32__
92 char basepath[]="";
93#else
94 char basepath[] = "/";
95#endif
96
97 cerr << "**** in lucene search" << endl;
98
99 char *indexname = (filename_cat(collectdir, getindexsuffix(queryparams))).getcstr();
100
101 // set default stem method from values originally set on prefs page
102 int defaultStemMethod = 0;
103 if (queryparams.casefolding) {
104 defaultStemMethod |= 1;
105 }
106 if (queryparams.stemming) {
107 defaultStemMethod |= 2;
108 }
109
110 // set default Boolean combiner from all/some setting
111 // if match_mode == 1, ie all, default=1 ie AND
112 // if match_mode == 0, ie some, default=0, ie OR
113 int defaultBoolCombine = 0;
114 if (queryparams.match_mode){
115 defaultBoolCombine = 1;
116 }
117
118 text_t utf8querystring = to_utf8(queryparams.querystring);
119 cerr << "**** query string = " << utf8querystring << endl;
120
121 text_t escaped_utf8querystring = "";
122 text_t::const_iterator here = utf8querystring.begin();
123 while (here != utf8querystring.end()) {
124 if (*here == '"') escaped_utf8querystring.push_back('\\');
125 escaped_utf8querystring.push_back(*here);
126 here++;
127 }
128 cerr << "**** escaped query string = " << escaped_utf8querystring << endl;
129 cerr << "***** index name = " << indexname << endl;
130
131 text_t cmd = "\""+filename_cat(getenv("GSDLHOME"), "bin", "script", "lucene_query.pl")+"\"";
132 cmd += (text_t)" \""+indexname + (text_t)"\" \"" + escaped_utf8querystring + (text_t)"\"";
133 cerr << "Lucene command: " << cmd << endl;
134
135 text_t xml_text = "";
136
137#ifdef __WIN32__
138 //FILE *PIN = _popen(cmd.getcstr(), "r"); // didn't seem to work
139 cmd = (text_t)"perl -S "+cmd;
140 // we write the result to a file
141 clock_t this_time = clock();
142 text_t filename = "luc";
143 filename.append(this_time);
144 filename.append(".txt");
145
146 text_t out_file = filename_cat(collectdir, filename);
147 cmd += (text_t)" \""+out_file+ (text_t)"\"";
148 int rv = gsdl_system(cmd, true, cerr);
149 if (rv != 0) {
150 cerr << "tried to run command \""<<cmd<<"\", but it failed\n";
151 } else {
152 read_file(out_file, xml_text);
153 remove(out_file.getcstr()); // now delete it
154 }
155#else
156 FILE *PIN = popen(cmd.getcstr(), "r");
157
158 if (PIN==NULL) {
159 perror("PIPE");
160 cerr << "Error: unable to open pipe to " << cmd << endl;
161
162 return false;
163 }
164 while (!feof(PIN)) {
165 char buffer[256];
166 int num_bytes = fread(buffer,1,256,PIN);
167 xml_text.appendcarr(buffer,num_bytes);
168 }
169
170#endif
171 expat_resultset(xml_text,queryresult);
172
173#ifdef __WIN32__
174 // _pclose(PIN);
175#else
176 pclose(PIN);
177#endif
178
179 return true;
180}
181 /*
182 // use default query info settings - change to reflect user preferences??
183 QueryInfo queryInfo;
184
185 SetCStr (queryInfo.docLevel, (queryparams.level.getcstr()));
186 queryInfo.maxDocs = (unsigned long)queryparams.maxdocs;
187 queryInfo.sortByRank = (queryparams.search_type == 1);
188 queryInfo.exactWeights = false;
189 queryInfo.needRankInfo = true; // used for overall term freq as well as ranking
190 queryInfo.needTermFreqs = true;
191
192 ExtQueryResult queryResult;
193
194 UCArray queryArray;
195 // greenstone gives us the query encoded in unicode. We want utf8.
196 char* utf8querystring=to_utf8(queryparams.querystring).getcstr();
197 SetCStr(queryArray, utf8querystring);
198 delete utf8querystring;
199
200 UCArray level;
201 UCArrayClear(level);
202
203 //set the level for results
204 SetCStr(level, gdbm_level.getcstr());
205
206
207 // do the query
208 // LuceneQuery(*indexData, queryInfo, queryTree, queryResult, level); // ****
209
210
211 // convert ExtQueryResult to queryresultclass
212
213 queryresult.docs_matched = (int)queryResult.docs.size();
214
215 if (queryresult.docs_matched == (int)queryResult.actualNumDocs) {
216 queryresult.is_approx = Exact;
217 }
218 else if (queryresult.docs_matched < (int)queryResult.actualNumDocs) {
219 queryresult.is_approx = MoreThan;
220 }
221 else {
222 queryresult.is_approx = Approximate;
223 }
224
225 docresultclass doc;
226 for (int i=0; i<(int)queryResult.docs.size(); ++i) {
227 doc.clear();
228 doc.docnum = (int)queryResult.levels[i];
229 doc.docweight = queryResult.ranks[i];
230 queryresult.docs.docset[doc.docnum] = doc;
231 queryresult.docs.docorder.push_back(doc.docnum);
232
233 }
234
235 // term info
236 termfreqclass term;
237 for (int k=0; k<(int)queryResult.termFreqs.size(); ++k) {
238 term.clear();
239 char* termfreq_cstr=GetCStr(queryResult.termFreqs[k].term);
240 term.termstr = to_uni(termfreq_cstr);
241 delete termfreq_cstr;
242 term.termstemstr = term.termstr;
243 // we don't set term.utf8equivterms ?? - jrm21
244 term.termfreq = queryResult.termFreqs[k].termFreq;
245 queryresult.terms.push_back(term);
246 queryresult.orgterms.push_back(term); // should this change??
247
248 for (int j=0; j<(int)queryResult.termFreqs[k].equivTerms.size(); ++j) {
249 char* equivterm_cstr=GetCStr(queryResult.termFreqs[k].equivTerms[j]);
250 queryresult.termvariants.insert(to_uni(equivterm_cstr));
251 delete equivterm_cstr;
252 }
253
254 }
255 // clean up
256 delete indexname;
257 return true;
258 */
259
260
261bool lucenesearchclass::browse_search(const queryparamclass &queryparams,
262 int start, int numDocs,
263 queryresultsclass &queryresult) {
264
265 cerr << "**** Not sure what this function does!" << endl;
266
267 /*
268#ifdef __WIN32__
269 char basepath[]="";
270#else
271 char basepath[] = "/";
272#endif
273
274 char *indexname = (filename_cat(collectdir, getindexsuffix(queryparams))).getcstr();
275
276 UCArray level;
277 UCArrayClear(level);
278
279 //browse always at top level
280 SetCStr(level, "Doc"); // this name may change.
281
282
283 BrowseQueryNode browseNode;
284 browseNode.startPosition = start;
285 browseNode.numTerms = numDocs;
286
287 BrowseQueryResult browseResult;
288
289
290 UCArrayClear(browseNode.term);
291 // greenstone gives us the query encoded in unicode. We want utf8.
292 char* utf8querystring=to_utf8(queryparams.querystring).getcstr();
293 SetCStr(browseNode.term, utf8querystring);
294 delete utf8querystring;
295
296 // do the actual query
297 // LuceneBrowseQuery(*indexData, level, browseNode, browseResult); // ****
298
299 // load results into term info
300 termfreqclass term;
301 for (int i=0; i<(int)browseResult.termFreqs.size(); ++i) {
302 term.clear();
303 char* term_cstr = GetCStr(browseResult.termFreqs[i].term);
304 term.termstr = to_uni(term_cstr);
305 delete term_cstr;
306 term.termstemstr = term.termstr;
307 term.termfreq = browseResult.termFreqs[i].termFreq;
308 queryresult.terms.push_back(term);
309 queryresult.orgterms.push_back(term);
310
311 }
312 // clean up
313 delete indexname;
314
315 return true;
316
317 */
318
319 return false;
320}
321
322// the document text for 'docnum' is placed in 'output'
323// docTargetDocument returns 'true' if it was able to
324// try to get a document
325// collection is needed to see if an index from the
326// collection is loaded. THe default index bits are just there cos
327// the mg version needs them
328
329bool lucenesearchclass::docTargetDocument(const text_t &/*defaultindex*/,
330 const text_t &/*defaultsubcollection*/,
331 const text_t &/*defaultlanguage*/,
332 const text_t &collection,
333 int docnum,
334 text_t &output) {
335
336 // we now get the document directly by lucenegdbmsource, so don't use this
337 // method
338 return false;
339}
340
341// used to clear any cached databases for persistent versions of
342// Greenstone like the Windows local library
343void lucenesearchclass::unload_database () {
344}
345
346
347
348
349
Note: See TracBrowser for help on using the repository browser.