Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: trunk/gsdl/src/colservr/lucenesearch.cpp@ 8027

Last change on this file since 8027 was 8027, checked in by davidb, 20 years ago
Introduction of lucene*.cpp,h classes to support searching with this Java based indexing tool.
Property svn:keywords set to `Author Date Id Revision`
File size: 8.9 KB

Line
1	/**********************************************************************
2	*
3	* lucenesearch.cpp --
4	* Copyright (C) 1999-2002 The New Zealand Digital Library Project
5	*
6	* A component of the Greenstone digital library software
7	* from the New Zealand Digital Library Project at the
8	* University of Waikato, New Zealand.
9	*
10	* This program is free software; you can redistribute it and/or modify
11	* it under the terms of the GNU General Public License as published by
12	* the Free Software Foundation; either version 2 of the License, or
13	* (at your option) any later version.
14	*
15	* This program is distributed in the hope that it will be useful,
16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	* GNU General Public License for more details.
19	*
20	* You should have received a copy of the GNU General Public License
21	* along with this program; if not, write to the Free Software
22	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	*
24	*********************************************************************/
25
26
27	#if defined(GSDL_USE_OBJECTSPACE)
28	# include <ospace\std\iostream>
29	#elif defined(GSDL_USE_IOS_H)
30	# include <iostream.h>
31	#else
32	# include <iostream>
33	#endif
34
35
36	#include "gsdlconf.h"
37	#include "lucenesearch.h"
38	#include "fileutil.h"
39	#include "queryinfo.h"
40	#include "gsdlunicode.h"
41
42	#include "sax_resultset.h"
43
44
45	static text_t getindexsuffix(const queryparamclass &qp) {
46	text_t indexsuffix = "index";
47	text_t ind = qp.index;
48	text_t sub = qp.subcollection;
49	text_t lang = qp.language;
50
51	// collection name not added for Lucene
52	indexsuffix = filename_cat(indexsuffix, ind + sub + lang);
53	return indexsuffix;
54
55	}
56
57	////////////////////
58	// lucenesearch class //
59	////////////////////
60
61	lucenesearchclass::lucenesearchclass ()
62	: searchclass() {
63
64	gdbm_level = "Doc";
65	}
66
67	lucenesearchclass::~lucenesearchclass ()
68	{
69	if (cache != NULL)
70	{
71	delete cache;
72	cache = NULL;
73	}
74	}
75
76	void lucenesearchclass::set_gdbm_level(const text_t &level) {
77	gdbm_level = level;
78
79	}
80
81
82	bool lucenesearchclass::search(const queryparamclass &queryparams,
83	queryresultsclass &queryresult) {
84
85	#ifdef __WIN32__
86	char basepath[]="";
87	#else
88	char basepath[] = "/";
89	#endif
90
91	cerr << "**** in luecen search" << endl;
92
93	char *indexname = (filename_cat(collectdir, getindexsuffix(queryparams))).getcstr();
94
95	// set default stem method from values originally set on prefs page
96	int defaultStemMethod = 0;
97	if (queryparams.casefolding) {
98	defaultStemMethod \|= 1;
99	}
100	if (queryparams.stemming) {
101	defaultStemMethod \|= 2;
102	}
103
104	// set default Boolean combiner from all/some setting
105	// if match_mode == 1, ie all, default=1 ie AND
106	// if match_mode == 0, ie some, default=0, ie OR
107	int defaultBoolCombine = 0;
108	if (queryparams.match_mode){
109	defaultBoolCombine = 1;
110	}
111	char* utf8querystring=to_utf8(queryparams.querystring).getcstr();
112	cerr << "**** query string = " << utf8querystring << endl;
113	cerr << "***** index name = " << indexname << endl;
114
115	text_t cmd = "lucene_query.pl ";
116	cmd += indexname + (text_t)" " + to_utf8(queryparams.querystring);
117
118
119	FILE *PIN = popen(cmd.getcstr(),"r");
120	if (PIN==NULL) {
121	cerr << "Error: unable to open pipe to " << cmd << endl;
122	return false;
123	}
124
125	text_t xml_text = "";
126
127	while (!feof(PIN)) {
128	char buffer[256];
129	int num_bytes = fread(buffer,1,256,PIN);
130	xml_text.appendcarr(buffer,num_bytes);
131	}
132
133	sax_resultset(xml_text,queryresult);
134
135	pclose(PIN);
136
137	return true;
138
139	/*
140	// use default query info settings - change to reflect user preferences??
141	QueryInfo queryInfo;
142
143	SetCStr (queryInfo.docLevel, (queryparams.level.getcstr()));
144	queryInfo.maxDocs = (unsigned long)queryparams.maxdocs;
145	queryInfo.sortByRank = (queryparams.search_type == 1);
146	queryInfo.exactWeights = false;
147	queryInfo.needRankInfo = true; // used for overall term freq as well as ranking
148	queryInfo.needTermFreqs = true;
149
150	ExtQueryResult queryResult;
151
152	UCArray queryArray;
153	// greenstone gives us the query encoded in unicode. We want utf8.
154	char* utf8querystring=to_utf8(queryparams.querystring).getcstr();
155	SetCStr(queryArray, utf8querystring);
156	delete utf8querystring;
157
158	UCArray level;
159	UCArrayClear(level);
160
161	//set the level for results
162	SetCStr(level, gdbm_level.getcstr());
163
164
165	// do the query
166	// LuceneQuery(indexData, queryInfo, queryTree, queryResult, level); // ***
167
168
169	// convert ExtQueryResult to queryresultclass
170
171	queryresult.docs_matched = (int)queryResult.docs.size();
172
173	if (queryresult.docs_matched == (int)queryResult.actualNumDocs) {
174	queryresult.is_approx = Exact;
175	}
176	else if (queryresult.docs_matched < (int)queryResult.actualNumDocs) {
177	queryresult.is_approx = MoreThan;
178	}
179	else {
180	queryresult.is_approx = Approximate;
181	}
182
183	docresultclass doc;
184	for (int i=0; i<(int)queryResult.docs.size(); i++) {
185	doc.clear();
186	doc.docnum = (int)queryResult.levels[i];
187	doc.docweight = queryResult.ranks[i];
188	queryresult.docs.docset[doc.docnum] = doc;
189	queryresult.docs.docorder.push_back(doc.docnum);
190
191	}
192
193	// term info
194	termfreqclass term;
195	for (int k=0; k<(int)queryResult.termFreqs.size(); k++) {
196	term.clear();
197	char* termfreq_cstr=GetCStr(queryResult.termFreqs[k].term);
198	term.termstr = to_uni(termfreq_cstr);
199	delete termfreq_cstr;
200	term.termstemstr = term.termstr;
201	// we don't set term.utf8equivterms ?? - jrm21
202	term.termfreq = queryResult.termFreqs[k].termFreq;
203	queryresult.terms.push_back(term);
204	queryresult.orgterms.push_back(term); // should this change??
205
206	for (int j=0; j<(int)queryResult.termFreqs[k].equivTerms.size(); j++) {
207	char* equivterm_cstr=GetCStr(queryResult.termFreqs[k].equivTerms[j]);
208	queryresult.termvariants.insert(to_uni(equivterm_cstr));
209	delete equivterm_cstr;
210	}
211
212	}
213	// clean up
214	delete indexname;
215	return true;
216	*/
217
218	return false;
219
220	}
221
222
223	bool lucenesearchclass::browse_search(const queryparamclass &queryparams,
224	int start, int numDocs,
225	queryresultsclass &queryresult) {
226
227	cerr << "**** Not sure what this function does!" << endl;
228
229	/*
230	#ifdef __WIN32__
231	char basepath[]="";
232	#else
233	char basepath[] = "/";
234	#endif
235
236	char *indexname = (filename_cat(collectdir, getindexsuffix(queryparams))).getcstr();
237
238	UCArray level;
239	UCArrayClear(level);
240
241	//browse always at top level
242	SetCStr(level, "Doc"); // this name may change.
243
244
245	BrowseQueryNode browseNode;
246	browseNode.startPosition = start;
247	browseNode.numTerms = numDocs;
248
249	BrowseQueryResult browseResult;
250
251
252	UCArrayClear(browseNode.term);
253	// greenstone gives us the query encoded in unicode. We want utf8.
254	char* utf8querystring=to_utf8(queryparams.querystring).getcstr();
255	SetCStr(browseNode.term, utf8querystring);
256	delete utf8querystring;
257
258	// do the actual query
259	// LuceneBrowseQuery(indexData, level, browseNode, browseResult); // ***
260
261	// load results into term info
262	termfreqclass term;
263	for (int i=0; i<(int)browseResult.termFreqs.size(); i++) {
264	term.clear();
265	char* term_cstr = GetCStr(browseResult.termFreqs[i].term);
266	term.termstr = to_uni(term_cstr);
267	delete term_cstr;
268	term.termstemstr = term.termstr;
269	term.termfreq = browseResult.termFreqs[i].termFreq;
270	queryresult.terms.push_back(term);
271	queryresult.orgterms.push_back(term);
272
273	}
274	// clean up
275	delete indexname;
276
277	return true;
278
279	*/
280
281	return false;
282	}
283
284	// the document text for 'docnum' is placed in 'output'
285	// docTargetDocument returns 'true' if it was able to
286	// try to get a document
287	// collection is needed to see if an index from the
288	// collection is loaded. THe default index bits are just there cos
289	// the mg version needs them
290
291	bool lucenesearchclass::docTargetDocument(const text_t &/defaultindex/,
292	const text_t &/defaultsubcollection/,
293	const text_t &/defaultlanguage/,
294	const text_t &collection,
295	int docnum,
296	text_t &output) {
297
298	cerr << "**** Should return document text here!" << endl;
299
300	/*
301	#ifdef __WIN32__
302	char basepath[]="";
303	#else
304	char basepath[] = "/";
305	#endif
306	char *textname = (filename_cat(collectdir, "index", "text", collection)).getcstr();;
307
308
309	TextData textdata;
310	if(!textdata.LoadData(basepath, textname)) {
311	cout<<"couldn't load text data\n"<<endl;
312	return false;
313	}
314
315	UCArray doctext;
316	UCArray level;
317	SetCStr(level, gdbm_level.getcstr());
318	if (!GetDocText(textdata, level, (unsigned long)docnum, doctext)) {
319	cout<<"couldn't retrieve document text\n";
320	return false;
321	}
322
323	// convert UCArray to text_t
324	output.clear();
325	char* doctext_cstr = GetCStr(doctext);
326	output = to_uni(doctext_cstr); // convert from utf-8 to unicode
327	delete doctext_cstr;
328
329	// here need to remove the <Document>, <Section>, <Paragraph> tags
330
331
332	//clean up
333	textdata.UnloadData ();
334	delete textname;
335
336	return true;
337
338	*/
339
340	return false;
341	}
342
343	// used to clear any cached databases for persistent versions of
344	// Greenstone like the Windows local library
345	void lucenesearchclass::unload_database () {
346	}
347
348
349
350
351

Note: See TracBrowser for help on using the repository browser.

Download in other formats: