source: trunk/gsdl/src/colservr/lucenesearch.cpp@ 9031

Last change on this file since 9031 was 9031, checked in by davidb, 19 years ago

Query to lucene incorrectly constructed (missing double quote). Now corrected.

  • Property svn:keywords set to Author Date Id Revision
File size: 9.0 KB
Line 
1/**********************************************************************
2 *
3 * lucenesearch.cpp --
4 * Copyright (C) 1999-2002 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26
27#if defined(GSDL_USE_OBJECTSPACE)
28# include <ospace\std\iostream>
29#elif defined(GSDL_USE_IOS_H)
30# include <iostream.h>
31#else
32# include <iostream>
33#endif
34
35#include <stdio.h>
36
37#include "gsdlconf.h"
38#include "lucenesearch.h"
39#include "fileutil.h"
40#include "queryinfo.h"
41#include "gsdlunicode.h"
42
43#include "sax_resultset.h"
44#include "expat_resultset.h"
45
46
47static text_t getindexsuffix(const queryparamclass &qp) {
48 text_t indexsuffix = "index";
49 text_t ind = qp.index;
50 text_t sub = qp.subcollection;
51 text_t lang = qp.language;
52
53 // collection name not added for Lucene
54 indexsuffix = filename_cat(indexsuffix, ind + sub + lang);
55 return indexsuffix;
56
57}
58
59////////////////////
60// lucenesearch class //
61////////////////////
62
63lucenesearchclass::lucenesearchclass ()
64 : searchclass() {
65
66 gdbm_level = "Doc";
67}
68
69lucenesearchclass::~lucenesearchclass ()
70{
71 if (cache != NULL)
72 {
73 delete cache;
74 cache = NULL;
75 }
76}
77
78void lucenesearchclass::set_gdbm_level(const text_t &level) {
79 gdbm_level = level;
80
81}
82
83
84bool lucenesearchclass::search(const queryparamclass &queryparams,
85 queryresultsclass &queryresult) {
86
87#ifdef __WIN32__
88 char basepath[]="";
89#else
90 char basepath[] = "/";
91#endif
92
93 cerr << "**** in luecen search" << endl;
94
95 char *indexname = (filename_cat(collectdir, getindexsuffix(queryparams))).getcstr();
96
97 // set default stem method from values originally set on prefs page
98 int defaultStemMethod = 0;
99 if (queryparams.casefolding) {
100 defaultStemMethod |= 1;
101 }
102 if (queryparams.stemming) {
103 defaultStemMethod |= 2;
104 }
105
106 // set default Boolean combiner from all/some setting
107 // if match_mode == 1, ie all, default=1 ie AND
108 // if match_mode == 0, ie some, default=0, ie OR
109 int defaultBoolCombine = 0;
110 if (queryparams.match_mode){
111 defaultBoolCombine = 1;
112 }
113 char* utf8querystring=to_utf8(queryparams.querystring).getcstr();
114 cerr << "**** query string = " << utf8querystring << endl;
115 cerr << "***** index name = " << indexname << endl;
116
117 text_t cmd = "lucene_query.pl ";
118 cmd += indexname + (text_t)" \"" + to_utf8(queryparams.querystring) + (text_t)"\"";
119
120
121 FILE *PIN = popen(cmd.getcstr(),"r");
122 if (PIN==NULL) {
123 cerr << "Error: unable to open pipe to " << cmd << endl;
124 return false;
125 }
126
127 text_t xml_text = "";
128
129 while (!feof(PIN)) {
130 char buffer[256];
131 int num_bytes = fread(buffer,1,256,PIN);
132 xml_text.appendcarr(buffer,num_bytes);
133 }
134
135 expat_resultset(xml_text,queryresult);
136
137 pclose(PIN);
138
139 return true;
140
141 /*
142 // use default query info settings - change to reflect user preferences??
143 QueryInfo queryInfo;
144
145 SetCStr (queryInfo.docLevel, (queryparams.level.getcstr()));
146 queryInfo.maxDocs = (unsigned long)queryparams.maxdocs;
147 queryInfo.sortByRank = (queryparams.search_type == 1);
148 queryInfo.exactWeights = false;
149 queryInfo.needRankInfo = true; // used for overall term freq as well as ranking
150 queryInfo.needTermFreqs = true;
151
152 ExtQueryResult queryResult;
153
154 UCArray queryArray;
155 // greenstone gives us the query encoded in unicode. We want utf8.
156 char* utf8querystring=to_utf8(queryparams.querystring).getcstr();
157 SetCStr(queryArray, utf8querystring);
158 delete utf8querystring;
159
160 UCArray level;
161 UCArrayClear(level);
162
163 //set the level for results
164 SetCStr(level, gdbm_level.getcstr());
165
166
167 // do the query
168 // LuceneQuery(*indexData, queryInfo, queryTree, queryResult, level); // ****
169
170
171 // convert ExtQueryResult to queryresultclass
172
173 queryresult.docs_matched = (int)queryResult.docs.size();
174
175 if (queryresult.docs_matched == (int)queryResult.actualNumDocs) {
176 queryresult.is_approx = Exact;
177 }
178 else if (queryresult.docs_matched < (int)queryResult.actualNumDocs) {
179 queryresult.is_approx = MoreThan;
180 }
181 else {
182 queryresult.is_approx = Approximate;
183 }
184
185 docresultclass doc;
186 for (int i=0; i<(int)queryResult.docs.size(); i++) {
187 doc.clear();
188 doc.docnum = (int)queryResult.levels[i];
189 doc.docweight = queryResult.ranks[i];
190 queryresult.docs.docset[doc.docnum] = doc;
191 queryresult.docs.docorder.push_back(doc.docnum);
192
193 }
194
195 // term info
196 termfreqclass term;
197 for (int k=0; k<(int)queryResult.termFreqs.size(); k++) {
198 term.clear();
199 char* termfreq_cstr=GetCStr(queryResult.termFreqs[k].term);
200 term.termstr = to_uni(termfreq_cstr);
201 delete termfreq_cstr;
202 term.termstemstr = term.termstr;
203 // we don't set term.utf8equivterms ?? - jrm21
204 term.termfreq = queryResult.termFreqs[k].termFreq;
205 queryresult.terms.push_back(term);
206 queryresult.orgterms.push_back(term); // should this change??
207
208 for (int j=0; j<(int)queryResult.termFreqs[k].equivTerms.size(); j++) {
209 char* equivterm_cstr=GetCStr(queryResult.termFreqs[k].equivTerms[j]);
210 queryresult.termvariants.insert(to_uni(equivterm_cstr));
211 delete equivterm_cstr;
212 }
213
214 }
215 // clean up
216 delete indexname;
217 return true;
218 */
219
220 return false;
221
222}
223
224
225bool lucenesearchclass::browse_search(const queryparamclass &queryparams,
226 int start, int numDocs,
227 queryresultsclass &queryresult) {
228
229 cerr << "**** Not sure what this function does!" << endl;
230
231 /*
232#ifdef __WIN32__
233 char basepath[]="";
234#else
235 char basepath[] = "/";
236#endif
237
238 char *indexname = (filename_cat(collectdir, getindexsuffix(queryparams))).getcstr();
239
240 UCArray level;
241 UCArrayClear(level);
242
243 //browse always at top level
244 SetCStr(level, "Doc"); // this name may change.
245
246
247 BrowseQueryNode browseNode;
248 browseNode.startPosition = start;
249 browseNode.numTerms = numDocs;
250
251 BrowseQueryResult browseResult;
252
253
254 UCArrayClear(browseNode.term);
255 // greenstone gives us the query encoded in unicode. We want utf8.
256 char* utf8querystring=to_utf8(queryparams.querystring).getcstr();
257 SetCStr(browseNode.term, utf8querystring);
258 delete utf8querystring;
259
260 // do the actual query
261 // LuceneBrowseQuery(*indexData, level, browseNode, browseResult); // ****
262
263 // load results into term info
264 termfreqclass term;
265 for (int i=0; i<(int)browseResult.termFreqs.size(); i++) {
266 term.clear();
267 char* term_cstr = GetCStr(browseResult.termFreqs[i].term);
268 term.termstr = to_uni(term_cstr);
269 delete term_cstr;
270 term.termstemstr = term.termstr;
271 term.termfreq = browseResult.termFreqs[i].termFreq;
272 queryresult.terms.push_back(term);
273 queryresult.orgterms.push_back(term);
274
275 }
276 // clean up
277 delete indexname;
278
279 return true;
280
281 */
282
283 return false;
284}
285
286// the document text for 'docnum' is placed in 'output'
287// docTargetDocument returns 'true' if it was able to
288// try to get a document
289// collection is needed to see if an index from the
290// collection is loaded. THe default index bits are just there cos
291// the mg version needs them
292
293bool lucenesearchclass::docTargetDocument(const text_t &/*defaultindex*/,
294 const text_t &/*defaultsubcollection*/,
295 const text_t &/*defaultlanguage*/,
296 const text_t &collection,
297 int docnum,
298 text_t &output) {
299
300 cerr << "**** Should return document text here!" << endl;
301
302 /*
303#ifdef __WIN32__
304 char basepath[]="";
305#else
306 char basepath[] = "/";
307#endif
308 char *textname = (filename_cat(collectdir, "index", "text", collection)).getcstr();;
309
310
311 TextData textdata;
312 if(!textdata.LoadData(basepath, textname)) {
313 cout<<"couldn't load text data\n"<<endl;
314 return false;
315 }
316
317 UCArray doctext;
318 UCArray level;
319 SetCStr(level, gdbm_level.getcstr());
320 if (!GetDocText(textdata, level, (unsigned long)docnum, doctext)) {
321 cout<<"couldn't retrieve document text\n";
322 return false;
323 }
324
325 // convert UCArray to text_t
326 output.clear();
327 char* doctext_cstr = GetCStr(doctext);
328 output = to_uni(doctext_cstr); // convert from utf-8 to unicode
329 delete doctext_cstr;
330
331 // here need to remove the <Document>, <Section>, <Paragraph> tags
332
333
334 //clean up
335 textdata.UnloadData ();
336 delete textname;
337
338 return true;
339
340 */
341
342 return false;
343}
344
345// used to clear any cached databases for persistent versions of
346// Greenstone like the Windows local library
347void lucenesearchclass::unload_database () {
348}
349
350
351
352
353
Note: See TracBrowser for help on using the repository browser.