Context Navigation

source: trunk/gsdl/src/colservr/lucenesearch.cpp@ 9031

Last change on this file since 9031 was 9031, checked in by davidb, 19 years ago
Query to lucene incorrectly constructed (missing double quote). Now corrected.
Property svn:keywords set to `Author Date Id Revision`
File size: 9.0 KB

Line
1	/**********************************************************************
2	*
3	* lucenesearch.cpp --
4	* Copyright (C) 1999-2002 The New Zealand Digital Library Project
5	*
6	* A component of the Greenstone digital library software
7	* from the New Zealand Digital Library Project at the
8	* University of Waikato, New Zealand.
9	*
10	* This program is free software; you can redistribute it and/or modify
11	* it under the terms of the GNU General Public License as published by
12	* the Free Software Foundation; either version 2 of the License, or
13	* (at your option) any later version.
14	*
15	* This program is distributed in the hope that it will be useful,
16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	* GNU General Public License for more details.
19	*
20	* You should have received a copy of the GNU General Public License
21	* along with this program; if not, write to the Free Software
22	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	*
24	*********************************************************************/
25
26
27	#if defined(GSDL_USE_OBJECTSPACE)
28	# include <ospace\std\iostream>
29	#elif defined(GSDL_USE_IOS_H)
30	# include <iostream.h>
31	#else
32	# include <iostream>
33	#endif
34
35	#include <stdio.h>
36
37	#include "gsdlconf.h"
38	#include "lucenesearch.h"
39	#include "fileutil.h"
40	#include "queryinfo.h"
41	#include "gsdlunicode.h"
42
43	#include "sax_resultset.h"
44	#include "expat_resultset.h"
45
46
47	static text_t getindexsuffix(const queryparamclass &qp) {
48	text_t indexsuffix = "index";
49	text_t ind = qp.index;
50	text_t sub = qp.subcollection;
51	text_t lang = qp.language;
52
53	// collection name not added for Lucene
54	indexsuffix = filename_cat(indexsuffix, ind + sub + lang);
55	return indexsuffix;
56
57	}
58
59	////////////////////
60	// lucenesearch class //
61	////////////////////
62
63	lucenesearchclass::lucenesearchclass ()
64	: searchclass() {
65
66	gdbm_level = "Doc";
67	}
68
69	lucenesearchclass::~lucenesearchclass ()
70	{
71	if (cache != NULL)
72	{
73	delete cache;
74	cache = NULL;
75	}
76	}
77
78	void lucenesearchclass::set_gdbm_level(const text_t &level) {
79	gdbm_level = level;
80
81	}
82
83
84	bool lucenesearchclass::search(const queryparamclass &queryparams,
85	queryresultsclass &queryresult) {
86
87	#ifdef __WIN32__
88	char basepath[]="";
89	#else
90	char basepath[] = "/";
91	#endif
92
93	cerr << "**** in luecen search" << endl;
94
95	char *indexname = (filename_cat(collectdir, getindexsuffix(queryparams))).getcstr();
96
97	// set default stem method from values originally set on prefs page
98	int defaultStemMethod = 0;
99	if (queryparams.casefolding) {
100	defaultStemMethod \|= 1;
101	}
102	if (queryparams.stemming) {
103	defaultStemMethod \|= 2;
104	}
105
106	// set default Boolean combiner from all/some setting
107	// if match_mode == 1, ie all, default=1 ie AND
108	// if match_mode == 0, ie some, default=0, ie OR
109	int defaultBoolCombine = 0;
110	if (queryparams.match_mode){
111	defaultBoolCombine = 1;
112	}
113	char* utf8querystring=to_utf8(queryparams.querystring).getcstr();
114	cerr << "**** query string = " << utf8querystring << endl;
115	cerr << "***** index name = " << indexname << endl;
116
117	text_t cmd = "lucene_query.pl ";
118	cmd += indexname + (text_t)" \"" + to_utf8(queryparams.querystring) + (text_t)"\"";
119
120
121	FILE *PIN = popen(cmd.getcstr(),"r");
122	if (PIN==NULL) {
123	cerr << "Error: unable to open pipe to " << cmd << endl;
124	return false;
125	}
126
127	text_t xml_text = "";
128
129	while (!feof(PIN)) {
130	char buffer[256];
131	int num_bytes = fread(buffer,1,256,PIN);
132	xml_text.appendcarr(buffer,num_bytes);
133	}
134
135	expat_resultset(xml_text,queryresult);
136
137	pclose(PIN);
138
139	return true;
140
141	/*
142	// use default query info settings - change to reflect user preferences??
143	QueryInfo queryInfo;
144
145	SetCStr (queryInfo.docLevel, (queryparams.level.getcstr()));
146	queryInfo.maxDocs = (unsigned long)queryparams.maxdocs;
147	queryInfo.sortByRank = (queryparams.search_type == 1);
148	queryInfo.exactWeights = false;
149	queryInfo.needRankInfo = true; // used for overall term freq as well as ranking
150	queryInfo.needTermFreqs = true;
151
152	ExtQueryResult queryResult;
153
154	UCArray queryArray;
155	// greenstone gives us the query encoded in unicode. We want utf8.
156	char* utf8querystring=to_utf8(queryparams.querystring).getcstr();
157	SetCStr(queryArray, utf8querystring);
158	delete utf8querystring;
159
160	UCArray level;
161	UCArrayClear(level);
162
163	//set the level for results
164	SetCStr(level, gdbm_level.getcstr());
165
166
167	// do the query
168	// LuceneQuery(indexData, queryInfo, queryTree, queryResult, level); // ***
169
170
171	// convert ExtQueryResult to queryresultclass
172
173	queryresult.docs_matched = (int)queryResult.docs.size();
174
175	if (queryresult.docs_matched == (int)queryResult.actualNumDocs) {
176	queryresult.is_approx = Exact;
177	}
178	else if (queryresult.docs_matched < (int)queryResult.actualNumDocs) {
179	queryresult.is_approx = MoreThan;
180	}
181	else {
182	queryresult.is_approx = Approximate;
183	}
184
185	docresultclass doc;
186	for (int i=0; i<(int)queryResult.docs.size(); i++) {
187	doc.clear();
188	doc.docnum = (int)queryResult.levels[i];
189	doc.docweight = queryResult.ranks[i];
190	queryresult.docs.docset[doc.docnum] = doc;
191	queryresult.docs.docorder.push_back(doc.docnum);
192
193	}
194
195	// term info
196	termfreqclass term;
197	for (int k=0; k<(int)queryResult.termFreqs.size(); k++) {
198	term.clear();
199	char* termfreq_cstr=GetCStr(queryResult.termFreqs[k].term);
200	term.termstr = to_uni(termfreq_cstr);
201	delete termfreq_cstr;
202	term.termstemstr = term.termstr;
203	// we don't set term.utf8equivterms ?? - jrm21
204	term.termfreq = queryResult.termFreqs[k].termFreq;
205	queryresult.terms.push_back(term);
206	queryresult.orgterms.push_back(term); // should this change??
207
208	for (int j=0; j<(int)queryResult.termFreqs[k].equivTerms.size(); j++) {
209	char* equivterm_cstr=GetCStr(queryResult.termFreqs[k].equivTerms[j]);
210	queryresult.termvariants.insert(to_uni(equivterm_cstr));
211	delete equivterm_cstr;
212	}
213
214	}
215	// clean up
216	delete indexname;
217	return true;
218	*/
219
220	return false;
221
222	}
223
224
225	bool lucenesearchclass::browse_search(const queryparamclass &queryparams,
226	int start, int numDocs,
227	queryresultsclass &queryresult) {
228
229	cerr << "**** Not sure what this function does!" << endl;
230
231	/*
232	#ifdef __WIN32__
233	char basepath[]="";
234	#else
235	char basepath[] = "/";
236	#endif
237
238	char *indexname = (filename_cat(collectdir, getindexsuffix(queryparams))).getcstr();
239
240	UCArray level;
241	UCArrayClear(level);
242
243	//browse always at top level
244	SetCStr(level, "Doc"); // this name may change.
245
246
247	BrowseQueryNode browseNode;
248	browseNode.startPosition = start;
249	browseNode.numTerms = numDocs;
250
251	BrowseQueryResult browseResult;
252
253
254	UCArrayClear(browseNode.term);
255	// greenstone gives us the query encoded in unicode. We want utf8.
256	char* utf8querystring=to_utf8(queryparams.querystring).getcstr();
257	SetCStr(browseNode.term, utf8querystring);
258	delete utf8querystring;
259
260	// do the actual query
261	// LuceneBrowseQuery(indexData, level, browseNode, browseResult); // ***
262
263	// load results into term info
264	termfreqclass term;
265	for (int i=0; i<(int)browseResult.termFreqs.size(); i++) {
266	term.clear();
267	char* term_cstr = GetCStr(browseResult.termFreqs[i].term);
268	term.termstr = to_uni(term_cstr);
269	delete term_cstr;
270	term.termstemstr = term.termstr;
271	term.termfreq = browseResult.termFreqs[i].termFreq;
272	queryresult.terms.push_back(term);
273	queryresult.orgterms.push_back(term);
274
275	}
276	// clean up
277	delete indexname;
278
279	return true;
280
281	*/
282
283	return false;
284	}
285
286	// the document text for 'docnum' is placed in 'output'
287	// docTargetDocument returns 'true' if it was able to
288	// try to get a document
289	// collection is needed to see if an index from the
290	// collection is loaded. THe default index bits are just there cos
291	// the mg version needs them
292
293	bool lucenesearchclass::docTargetDocument(const text_t &/defaultindex/,
294	const text_t &/defaultsubcollection/,
295	const text_t &/defaultlanguage/,
296	const text_t &collection,
297	int docnum,
298	text_t &output) {
299
300	cerr << "**** Should return document text here!" << endl;
301
302	/*
303	#ifdef __WIN32__
304	char basepath[]="";
305	#else
306	char basepath[] = "/";
307	#endif
308	char *textname = (filename_cat(collectdir, "index", "text", collection)).getcstr();;
309
310
311	TextData textdata;
312	if(!textdata.LoadData(basepath, textname)) {
313	cout<<"couldn't load text data\n"<<endl;
314	return false;
315	}
316
317	UCArray doctext;
318	UCArray level;
319	SetCStr(level, gdbm_level.getcstr());
320	if (!GetDocText(textdata, level, (unsigned long)docnum, doctext)) {
321	cout<<"couldn't retrieve document text\n";
322	return false;
323	}
324
325	// convert UCArray to text_t
326	output.clear();
327	char* doctext_cstr = GetCStr(doctext);
328	output = to_uni(doctext_cstr); // convert from utf-8 to unicode
329	delete doctext_cstr;
330
331	// here need to remove the <Document>, <Section>, <Paragraph> tags
332
333
334	//clean up
335	textdata.UnloadData ();
336	delete textname;
337
338	return true;
339
340	*/
341
342	return false;
343	}
344
345	// used to clear any cached databases for persistent versions of
346	// Greenstone like the Windows local library
347	void lucenesearchclass::unload_database () {
348	}
349
350
351
352
353

Note: See TracBrowser for help on using the repository browser.

Download in other formats: