Context Navigation

source: trunk/gsdl/src/colservr/lucenesearch.cpp@ 9904

Last change on this file since 9904 was 9620, checked in by kjdon, 19 years ago
added some x++ -> ++x changes submitted by Emanuel Dejanu
Property svn:keywords set to `Author Date Id Revision`
File size: 9.8 KB

Line
1	/**********************************************************************
2	*
3	* lucenesearch.cpp --
4	* Copyright (C) 1999-2002 The New Zealand Digital Library Project
5	*
6	* A component of the Greenstone digital library software
7	* from the New Zealand Digital Library Project at the
8	* University of Waikato, New Zealand.
9	*
10	* This program is free software; you can redistribute it and/or modify
11	* it under the terms of the GNU General Public License as published by
12	* the Free Software Foundation; either version 2 of the License, or
13	* (at your option) any later version.
14	*
15	* This program is distributed in the hope that it will be useful,
16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	* GNU General Public License for more details.
19	*
20	* You should have received a copy of the GNU General Public License
21	* along with this program; if not, write to the Free Software
22	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	*
24	*********************************************************************/
25
26
27	#if defined(GSDL_USE_OBJECTSPACE)
28	# include <ospace\std\iostream>
29	#elif defined(GSDL_USE_IOS_H)
30	# include <iostream.h>
31	#else
32	# include <iostream>
33	#endif
34
35	#include <stdio.h>
36	#include <time.h>
37
38	#include "gsdlconf.h"
39	#include "gsdltools.h"
40	#include "lucenesearch.h"
41	#include "fileutil.h"
42	#include "queryinfo.h"
43	#include "gsdlunicode.h"
44
45	#include "expat_resultset.h"
46
47
48	static text_t getindexsuffix(const queryparamclass &qp) {
49	text_t indexsuffix = "index";
50	// get the first char of the level to be the start of the index name
51	text_t suffix = substr(qp.level.begin(), qp.level.begin()+1);
52	lc(suffix);
53	text_t ind = qp.index;
54	text_t sub = qp.subcollection;
55	text_t lang = qp.language;
56
57	// collection name not added for Lucene
58	indexsuffix = filename_cat(indexsuffix, suffix +ind + sub + lang);
59	return indexsuffix;
60
61	}
62
63	////////////////////
64	// lucenesearch class //
65	////////////////////
66
67	lucenesearchclass::lucenesearchclass ()
68	: searchclass() {
69
70	gdbm_level = "Doc";
71	}
72
73	lucenesearchclass::~lucenesearchclass ()
74	{
75	if (cache != NULL)
76	{
77	delete cache;
78	cache = NULL;
79	}
80	}
81
82	void lucenesearchclass::set_gdbm_level(const text_t &level) {
83	gdbm_level = level;
84
85	}
86
87
88	bool lucenesearchclass::search(const queryparamclass &queryparams,
89	queryresultsclass &queryresult) {
90
91	#ifdef __WIN32__
92	char basepath[]="";
93	#else
94	char basepath[] = "/";
95	#endif
96
97	cerr << "**** in lucene search" << endl;
98
99	char *indexname = (filename_cat(collectdir, getindexsuffix(queryparams))).getcstr();
100
101	// set default stem method from values originally set on prefs page
102	int defaultStemMethod = 0;
103	if (queryparams.casefolding) {
104	defaultStemMethod \|= 1;
105	}
106	if (queryparams.stemming) {
107	defaultStemMethod \|= 2;
108	}
109
110	// set default Boolean combiner from all/some setting
111	// if match_mode == 1, ie all, default=1 ie AND
112	// if match_mode == 0, ie some, default=0, ie OR
113	int defaultBoolCombine = 0;
114	if (queryparams.match_mode){
115	defaultBoolCombine = 1;
116	}
117	char* utf8querystring=to_utf8(queryparams.querystring).getcstr();
118	cerr << "**** query string = " << utf8querystring << endl;
119	cerr << "***** index name = " << indexname << endl;
120
121	text_t cmd = "lucene_query.pl ";
122	cmd += (text_t)" \""+indexname + (text_t)"\" \"" + to_utf8(queryparams.querystring) + (text_t)"\"";
123
124	text_t xml_text = "";
125
126	#ifdef __WIN32__
127	//FILE *PIN = _popen(cmd.getcstr(), "r"); // didn't seem to work
128	cmd = (text_t)"perl -S "+cmd;
129	// we write the result to a file
130	clock_t this_time = clock();
131	text_t filename = "luc";
132	filename.append(this_time);
133	filename.append(".txt");
134
135	text_t out_file = filename_cat(collectdir, filename);
136	cmd += (text_t)" \""+out_file+ (text_t)"\"";
137	int rv = gsdl_system(cmd, true, cerr);
138	if (rv != 0) {
139	cerr << "tried to run command \""<<cmd<<"\", but it failed\n";
140	} else {
141	read_file(out_file, xml_text);
142	remove(out_file.getcstr()); // now delete it
143	}
144	#else
145	FILE *PIN = popen(cmd.getcstr(), "r");
146
147	if (PIN==NULL) {
148	perror("PIPE");
149	cerr << "Error: unable to open pipe to " << cmd << endl;
150
151	return false;
152	}
153	while (!feof(PIN)) {
154	char buffer[256];
155	int num_bytes = fread(buffer,1,256,PIN);
156	xml_text.appendcarr(buffer,num_bytes);
157	}
158
159	#endif
160	expat_resultset(xml_text,queryresult);
161
162	#ifdef __WIN32__
163	// _pclose(PIN);
164	#else
165	pclose(PIN);
166	#endif
167
168	return true;
169	}
170	/*
171	// use default query info settings - change to reflect user preferences??
172	QueryInfo queryInfo;
173
174	SetCStr (queryInfo.docLevel, (queryparams.level.getcstr()));
175	queryInfo.maxDocs = (unsigned long)queryparams.maxdocs;
176	queryInfo.sortByRank = (queryparams.search_type == 1);
177	queryInfo.exactWeights = false;
178	queryInfo.needRankInfo = true; // used for overall term freq as well as ranking
179	queryInfo.needTermFreqs = true;
180
181	ExtQueryResult queryResult;
182
183	UCArray queryArray;
184	// greenstone gives us the query encoded in unicode. We want utf8.
185	char* utf8querystring=to_utf8(queryparams.querystring).getcstr();
186	SetCStr(queryArray, utf8querystring);
187	delete utf8querystring;
188
189	UCArray level;
190	UCArrayClear(level);
191
192	//set the level for results
193	SetCStr(level, gdbm_level.getcstr());
194
195
196	// do the query
197	// LuceneQuery(indexData, queryInfo, queryTree, queryResult, level); // ***
198
199
200	// convert ExtQueryResult to queryresultclass
201
202	queryresult.docs_matched = (int)queryResult.docs.size();
203
204	if (queryresult.docs_matched == (int)queryResult.actualNumDocs) {
205	queryresult.is_approx = Exact;
206	}
207	else if (queryresult.docs_matched < (int)queryResult.actualNumDocs) {
208	queryresult.is_approx = MoreThan;
209	}
210	else {
211	queryresult.is_approx = Approximate;
212	}
213
214	docresultclass doc;
215	for (int i=0; i<(int)queryResult.docs.size(); ++i) {
216	doc.clear();
217	doc.docnum = (int)queryResult.levels[i];
218	doc.docweight = queryResult.ranks[i];
219	queryresult.docs.docset[doc.docnum] = doc;
220	queryresult.docs.docorder.push_back(doc.docnum);
221
222	}
223
224	// term info
225	termfreqclass term;
226	for (int k=0; k<(int)queryResult.termFreqs.size(); ++k) {
227	term.clear();
228	char* termfreq_cstr=GetCStr(queryResult.termFreqs[k].term);
229	term.termstr = to_uni(termfreq_cstr);
230	delete termfreq_cstr;
231	term.termstemstr = term.termstr;
232	// we don't set term.utf8equivterms ?? - jrm21
233	term.termfreq = queryResult.termFreqs[k].termFreq;
234	queryresult.terms.push_back(term);
235	queryresult.orgterms.push_back(term); // should this change??
236
237	for (int j=0; j<(int)queryResult.termFreqs[k].equivTerms.size(); ++j) {
238	char* equivterm_cstr=GetCStr(queryResult.termFreqs[k].equivTerms[j]);
239	queryresult.termvariants.insert(to_uni(equivterm_cstr));
240	delete equivterm_cstr;
241	}
242
243	}
244	// clean up
245	delete indexname;
246	return true;
247	*/
248
249
250	bool lucenesearchclass::browse_search(const queryparamclass &queryparams,
251	int start, int numDocs,
252	queryresultsclass &queryresult) {
253
254	cerr << "**** Not sure what this function does!" << endl;
255
256	/*
257	#ifdef __WIN32__
258	char basepath[]="";
259	#else
260	char basepath[] = "/";
261	#endif
262
263	char *indexname = (filename_cat(collectdir, getindexsuffix(queryparams))).getcstr();
264
265	UCArray level;
266	UCArrayClear(level);
267
268	//browse always at top level
269	SetCStr(level, "Doc"); // this name may change.
270
271
272	BrowseQueryNode browseNode;
273	browseNode.startPosition = start;
274	browseNode.numTerms = numDocs;
275
276	BrowseQueryResult browseResult;
277
278
279	UCArrayClear(browseNode.term);
280	// greenstone gives us the query encoded in unicode. We want utf8.
281	char* utf8querystring=to_utf8(queryparams.querystring).getcstr();
282	SetCStr(browseNode.term, utf8querystring);
283	delete utf8querystring;
284
285	// do the actual query
286	// LuceneBrowseQuery(indexData, level, browseNode, browseResult); // ***
287
288	// load results into term info
289	termfreqclass term;
290	for (int i=0; i<(int)browseResult.termFreqs.size(); ++i) {
291	term.clear();
292	char* term_cstr = GetCStr(browseResult.termFreqs[i].term);
293	term.termstr = to_uni(term_cstr);
294	delete term_cstr;
295	term.termstemstr = term.termstr;
296	term.termfreq = browseResult.termFreqs[i].termFreq;
297	queryresult.terms.push_back(term);
298	queryresult.orgterms.push_back(term);
299
300	}
301	// clean up
302	delete indexname;
303
304	return true;
305
306	*/
307
308	return false;
309	}
310
311	// the document text for 'docnum' is placed in 'output'
312	// docTargetDocument returns 'true' if it was able to
313	// try to get a document
314	// collection is needed to see if an index from the
315	// collection is loaded. THe default index bits are just there cos
316	// the mg version needs them
317
318	bool lucenesearchclass::docTargetDocument(const text_t &/defaultindex/,
319	const text_t &/defaultsubcollection/,
320	const text_t &/defaultlanguage/,
321	const text_t &collection,
322	int docnum,
323	text_t &output) {
324
325	cerr << "**** Should return document text here!" << endl;
326
327	/*
328	#ifdef __WIN32__
329	char basepath[]="";
330	#else
331	char basepath[] = "/";
332	#endif
333	char *textname = (filename_cat(collectdir, "index", "text", collection)).getcstr();;
334
335
336	TextData textdata;
337	if(!textdata.LoadData(basepath, textname)) {
338	cout<<"couldn't load text data\n"<<endl;
339	return false;
340	}
341
342	UCArray doctext;
343	UCArray level;
344	SetCStr(level, gdbm_level.getcstr());
345	if (!GetDocText(textdata, level, (unsigned long)docnum, doctext)) {
346	cout<<"couldn't retrieve document text\n";
347	return false;
348	}
349
350	// convert UCArray to text_t
351	output.clear();
352	char* doctext_cstr = GetCStr(doctext);
353	output = to_uni(doctext_cstr); // convert from utf-8 to unicode
354	delete doctext_cstr;
355
356	// here need to remove the <Document>, <Section>, <Paragraph> tags
357
358
359	//clean up
360	textdata.UnloadData ();
361	delete textname;
362
363	return true;
364
365	*/
366
367	return false;
368	}
369
370	// used to clear any cached databases for persistent versions of
371	// Greenstone like the Windows local library
372	void lucenesearchclass::unload_database () {
373	}
374
375
376
377
378

Note: See TracBrowser for help on using the repository browser.

Download in other formats: