Context Navigation

source: trunk/gsdl/src/colservr/lucenesearch.cpp@ 11162

Last change on this file since 11162 was 11162, checked in by kjdon, 18 years ago
added quotes around the path to lucene_query.pl, in case we are installed under a directory with spaces
Property svn:keywords set to `Author Date Id Revision`
File size: 9.4 KB

Line
1	/**********************************************************************
2	*
3	* lucenesearch.cpp --
4	* Copyright (C) 1999-2002 The New Zealand Digital Library Project
5	*
6	* A component of the Greenstone digital library software
7	* from the New Zealand Digital Library Project at the
8	* University of Waikato, New Zealand.
9	*
10	* This program is free software; you can redistribute it and/or modify
11	* it under the terms of the GNU General Public License as published by
12	* the Free Software Foundation; either version 2 of the License, or
13	* (at your option) any later version.
14	*
15	* This program is distributed in the hope that it will be useful,
16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	* GNU General Public License for more details.
19	*
20	* You should have received a copy of the GNU General Public License
21	* along with this program; if not, write to the Free Software
22	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	*
24	*********************************************************************/
25
26
27	#if defined(GSDL_USE_OBJECTSPACE)
28	# include <ospace\std\iostream>
29	#elif defined(GSDL_USE_IOS_H)
30	# include <iostream.h>
31	#else
32	# include <iostream>
33	#endif
34
35	#include <stdio.h>
36	#include <time.h>
37
38	#include "gsdlconf.h"
39	#include "gsdltools.h"
40	#include "lucenesearch.h"
41	#include "fileutil.h"
42	#include "queryinfo.h"
43	#include "gsdlunicode.h"
44
45	#include "expat_resultset.h"
46
47
48	text_t lucenesearchclass::getindexsuffix(const queryparamclass &qp) {
49	text_t indexsuffix = "index";
50	// get the first char of the level to be the start of the index name
51	text_t suffix = substr(qp.level.begin(), qp.level.begin()+1);
52	lc(suffix);
53	text_t ind = qp.index;
54	text_t sub = qp.subcollection;
55	text_t lang = qp.language;
56
57	// collection name not added for Lucene
58	indexsuffix = filename_cat(indexsuffix, suffix +ind + sub + lang);
59	return indexsuffix;
60
61	}
62
63	////////////////////
64	// lucenesearch class //
65	////////////////////
66
67	lucenesearchclass::lucenesearchclass ()
68	: searchclass() {
69
70	gdbm_level = "Doc";
71	}
72
73	lucenesearchclass::~lucenesearchclass ()
74	{
75	if (cache != NULL)
76	{
77	delete cache;
78	cache = NULL;
79	}
80	}
81
82	void lucenesearchclass::set_gdbm_level(const text_t &level) {
83	gdbm_level = level;
84
85	}
86
87
88	bool lucenesearchclass::search(const queryparamclass &queryparams,
89	queryresultsclass &queryresult) {
90
91	#ifdef __WIN32__
92	char basepath[]="";
93	#else
94	char basepath[] = "/";
95	#endif
96
97	cerr << "**** in lucene search" << endl;
98
99	char *indexname = (filename_cat(collectdir, getindexsuffix(queryparams))).getcstr();
100
101	// set default stem method from values originally set on prefs page
102	int defaultStemMethod = 0;
103	if (queryparams.casefolding) {
104	defaultStemMethod \|= 1;
105	}
106	if (queryparams.stemming) {
107	defaultStemMethod \|= 2;
108	}
109
110	// set default Boolean combiner from all/some setting
111	// if match_mode == 1, ie all, default=1 ie AND
112	// if match_mode == 0, ie some, default=0, ie OR
113	int defaultBoolCombine = 0;
114	if (queryparams.match_mode){
115	defaultBoolCombine = 1;
116	}
117
118	text_t utf8querystring = to_utf8(queryparams.querystring);
119	cerr << "**** query string = " << utf8querystring << endl;
120
121	text_t escaped_utf8querystring = "";
122	text_t::const_iterator here = utf8querystring.begin();
123	while (here != utf8querystring.end()) {
124	if (*here == '"') escaped_utf8querystring.push_back('\\');
125	escaped_utf8querystring.push_back(*here);
126	here++;
127	}
128	cerr << "**** escaped query string = " << escaped_utf8querystring << endl;
129	cerr << "***** index name = " << indexname << endl;
130
131	text_t cmd = "\""+filename_cat(getenv("GSDLHOME"), "bin", "script", "lucene_query.pl")+"\"";
132	cmd += (text_t)" \""+indexname + (text_t)"\" \"" + escaped_utf8querystring + (text_t)"\"";
133	cerr << "Lucene command: " << cmd << endl;
134
135	text_t xml_text = "";
136
137	#ifdef __WIN32__
138	//FILE *PIN = _popen(cmd.getcstr(), "r"); // didn't seem to work
139	cmd = (text_t)"perl -S "+cmd;
140	// we write the result to a file
141	clock_t this_time = clock();
142	text_t filename = "luc";
143	filename.append(this_time);
144	filename.append(".txt");
145
146	text_t out_file = filename_cat(collectdir, filename);
147	cmd += (text_t)" \""+out_file+ (text_t)"\"";
148	int rv = gsdl_system(cmd, true, cerr);
149	if (rv != 0) {
150	cerr << "tried to run command \""<<cmd<<"\", but it failed\n";
151	} else {
152	read_file(out_file, xml_text);
153	remove(out_file.getcstr()); // now delete it
154	}
155	#else
156	FILE *PIN = popen(cmd.getcstr(), "r");
157
158	if (PIN==NULL) {
159	perror("PIPE");
160	cerr << "Error: unable to open pipe to " << cmd << endl;
161
162	return false;
163	}
164	while (!feof(PIN)) {
165	char buffer[256];
166	int num_bytes = fread(buffer,1,256,PIN);
167	xml_text.appendcarr(buffer,num_bytes);
168	}
169
170	#endif
171	expat_resultset(xml_text,queryresult);
172
173	#ifdef __WIN32__
174	// _pclose(PIN);
175	#else
176	pclose(PIN);
177	#endif
178
179	return true;
180	}
181	/*
182	// use default query info settings - change to reflect user preferences??
183	QueryInfo queryInfo;
184
185	SetCStr (queryInfo.docLevel, (queryparams.level.getcstr()));
186	queryInfo.maxDocs = (unsigned long)queryparams.maxdocs;
187	queryInfo.sortByRank = (queryparams.search_type == 1);
188	queryInfo.exactWeights = false;
189	queryInfo.needRankInfo = true; // used for overall term freq as well as ranking
190	queryInfo.needTermFreqs = true;
191
192	ExtQueryResult queryResult;
193
194	UCArray queryArray;
195	// greenstone gives us the query encoded in unicode. We want utf8.
196	char* utf8querystring=to_utf8(queryparams.querystring).getcstr();
197	SetCStr(queryArray, utf8querystring);
198	delete utf8querystring;
199
200	UCArray level;
201	UCArrayClear(level);
202
203	//set the level for results
204	SetCStr(level, gdbm_level.getcstr());
205
206
207	// do the query
208	// LuceneQuery(indexData, queryInfo, queryTree, queryResult, level); // ***
209
210
211	// convert ExtQueryResult to queryresultclass
212
213	queryresult.docs_matched = (int)queryResult.docs.size();
214
215	if (queryresult.docs_matched == (int)queryResult.actualNumDocs) {
216	queryresult.is_approx = Exact;
217	}
218	else if (queryresult.docs_matched < (int)queryResult.actualNumDocs) {
219	queryresult.is_approx = MoreThan;
220	}
221	else {
222	queryresult.is_approx = Approximate;
223	}
224
225	docresultclass doc;
226	for (int i=0; i<(int)queryResult.docs.size(); ++i) {
227	doc.clear();
228	doc.docnum = (int)queryResult.levels[i];
229	doc.docweight = queryResult.ranks[i];
230	queryresult.docs.docset[doc.docnum] = doc;
231	queryresult.docs.docorder.push_back(doc.docnum);
232
233	}
234
235	// term info
236	termfreqclass term;
237	for (int k=0; k<(int)queryResult.termFreqs.size(); ++k) {
238	term.clear();
239	char* termfreq_cstr=GetCStr(queryResult.termFreqs[k].term);
240	term.termstr = to_uni(termfreq_cstr);
241	delete termfreq_cstr;
242	term.termstemstr = term.termstr;
243	// we don't set term.utf8equivterms ?? - jrm21
244	term.termfreq = queryResult.termFreqs[k].termFreq;
245	queryresult.terms.push_back(term);
246	queryresult.orgterms.push_back(term); // should this change??
247
248	for (int j=0; j<(int)queryResult.termFreqs[k].equivTerms.size(); ++j) {
249	char* equivterm_cstr=GetCStr(queryResult.termFreqs[k].equivTerms[j]);
250	queryresult.termvariants.insert(to_uni(equivterm_cstr));
251	delete equivterm_cstr;
252	}
253
254	}
255	// clean up
256	delete indexname;
257	return true;
258	*/
259
260
261	bool lucenesearchclass::browse_search(const queryparamclass &queryparams,
262	int start, int numDocs,
263	queryresultsclass &queryresult) {
264
265	cerr << "**** Not sure what this function does!" << endl;
266
267	/*
268	#ifdef __WIN32__
269	char basepath[]="";
270	#else
271	char basepath[] = "/";
272	#endif
273
274	char *indexname = (filename_cat(collectdir, getindexsuffix(queryparams))).getcstr();
275
276	UCArray level;
277	UCArrayClear(level);
278
279	//browse always at top level
280	SetCStr(level, "Doc"); // this name may change.
281
282
283	BrowseQueryNode browseNode;
284	browseNode.startPosition = start;
285	browseNode.numTerms = numDocs;
286
287	BrowseQueryResult browseResult;
288
289
290	UCArrayClear(browseNode.term);
291	// greenstone gives us the query encoded in unicode. We want utf8.
292	char* utf8querystring=to_utf8(queryparams.querystring).getcstr();
293	SetCStr(browseNode.term, utf8querystring);
294	delete utf8querystring;
295
296	// do the actual query
297	// LuceneBrowseQuery(indexData, level, browseNode, browseResult); // ***
298
299	// load results into term info
300	termfreqclass term;
301	for (int i=0; i<(int)browseResult.termFreqs.size(); ++i) {
302	term.clear();
303	char* term_cstr = GetCStr(browseResult.termFreqs[i].term);
304	term.termstr = to_uni(term_cstr);
305	delete term_cstr;
306	term.termstemstr = term.termstr;
307	term.termfreq = browseResult.termFreqs[i].termFreq;
308	queryresult.terms.push_back(term);
309	queryresult.orgterms.push_back(term);
310
311	}
312	// clean up
313	delete indexname;
314
315	return true;
316
317	*/
318
319	return false;
320	}
321
322	// the document text for 'docnum' is placed in 'output'
323	// docTargetDocument returns 'true' if it was able to
324	// try to get a document
325	// collection is needed to see if an index from the
326	// collection is loaded. THe default index bits are just there cos
327	// the mg version needs them
328
329	bool lucenesearchclass::docTargetDocument(const text_t &/defaultindex/,
330	const text_t &/defaultsubcollection/,
331	const text_t &/defaultlanguage/,
332	const text_t &collection,
333	int docnum,
334	text_t &output) {
335
336	// we now get the document directly by lucenegdbmsource, so don't use this
337	// method
338	return false;
339	}
340
341	// used to clear any cached databases for persistent versions of
342	// Greenstone like the Windows local library
343	void lucenesearchclass::unload_database () {
344	}
345
346
347
348
349

Note: See TracBrowser for help on using the repository browser.

Download in other formats: