source: trunk/gsdl/src/colservr/lucenesearch.cpp@ 9174

Last change on this file since 9174 was 9115, checked in by mdewsnip, 19 years ago

Added #ifdefs for slightly different popen and pclose on Windows.

  • Property svn:keywords set to Author Date Id Revision
File size: 9.2 KB
Line 
1/**********************************************************************
2 *
3 * lucenesearch.cpp --
4 * Copyright (C) 1999-2002 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26
27#if defined(GSDL_USE_OBJECTSPACE)
28# include <ospace\std\iostream>
29#elif defined(GSDL_USE_IOS_H)
30# include <iostream.h>
31#else
32# include <iostream>
33#endif
34
35#include <stdio.h>
36
37#include "gsdlconf.h"
38#include "lucenesearch.h"
39#include "fileutil.h"
40#include "queryinfo.h"
41#include "gsdlunicode.h"
42
43#include "expat_resultset.h"
44
45
46static text_t getindexsuffix(const queryparamclass &qp) {
47 text_t indexsuffix = "index";
48 // get the first char of the level to be the start of the index name
49 text_t suffix = substr(qp.level.begin(), qp.level.begin()+1);
50 lc(suffix);
51 text_t ind = qp.index;
52 text_t sub = qp.subcollection;
53 text_t lang = qp.language;
54
55 // collection name not added for Lucene
56 indexsuffix = filename_cat(indexsuffix, suffix +ind + sub + lang);
57 return indexsuffix;
58
59}
60
61////////////////////
62// lucenesearch class //
63////////////////////
64
65lucenesearchclass::lucenesearchclass ()
66 : searchclass() {
67
68 gdbm_level = "Doc";
69}
70
71lucenesearchclass::~lucenesearchclass ()
72{
73 if (cache != NULL)
74 {
75 delete cache;
76 cache = NULL;
77 }
78}
79
80void lucenesearchclass::set_gdbm_level(const text_t &level) {
81 gdbm_level = level;
82
83}
84
85
86bool lucenesearchclass::search(const queryparamclass &queryparams,
87 queryresultsclass &queryresult) {
88
89#ifdef __WIN32__
90 char basepath[]="";
91#else
92 char basepath[] = "/";
93#endif
94
95 cerr << "**** in luecen search" << endl;
96
97 char *indexname = (filename_cat(collectdir, getindexsuffix(queryparams))).getcstr();
98
99 // set default stem method from values originally set on prefs page
100 int defaultStemMethod = 0;
101 if (queryparams.casefolding) {
102 defaultStemMethod |= 1;
103 }
104 if (queryparams.stemming) {
105 defaultStemMethod |= 2;
106 }
107
108 // set default Boolean combiner from all/some setting
109 // if match_mode == 1, ie all, default=1 ie AND
110 // if match_mode == 0, ie some, default=0, ie OR
111 int defaultBoolCombine = 0;
112 if (queryparams.match_mode){
113 defaultBoolCombine = 1;
114 }
115 char* utf8querystring=to_utf8(queryparams.querystring).getcstr();
116 cerr << "**** query string = " << utf8querystring << endl;
117 cerr << "***** index name = " << indexname << endl;
118
119 text_t cmd = "lucene_query.pl ";
120 cmd += indexname + (text_t)" \"" + to_utf8(queryparams.querystring) + (text_t)"\"";
121
122#ifdef __WIN32__
123 FILE *PIN = _popen(cmd.getcstr(), "r");
124#else
125 FILE *PIN = popen(cmd.getcstr(), "r");
126#endif
127 if (PIN==NULL) {
128 cerr << "Error: unable to open pipe to " << cmd << endl;
129 return false;
130 }
131
132 text_t xml_text = "";
133
134 while (!feof(PIN)) {
135 char buffer[256];
136 int num_bytes = fread(buffer,1,256,PIN);
137 xml_text.appendcarr(buffer,num_bytes);
138 }
139
140 expat_resultset(xml_text,queryresult);
141
142#ifdef __WIN32__
143 _pclose(PIN);
144#else
145 pclose(PIN);
146#endif
147
148 return true;
149
150 /*
151 // use default query info settings - change to reflect user preferences??
152 QueryInfo queryInfo;
153
154 SetCStr (queryInfo.docLevel, (queryparams.level.getcstr()));
155 queryInfo.maxDocs = (unsigned long)queryparams.maxdocs;
156 queryInfo.sortByRank = (queryparams.search_type == 1);
157 queryInfo.exactWeights = false;
158 queryInfo.needRankInfo = true; // used for overall term freq as well as ranking
159 queryInfo.needTermFreqs = true;
160
161 ExtQueryResult queryResult;
162
163 UCArray queryArray;
164 // greenstone gives us the query encoded in unicode. We want utf8.
165 char* utf8querystring=to_utf8(queryparams.querystring).getcstr();
166 SetCStr(queryArray, utf8querystring);
167 delete utf8querystring;
168
169 UCArray level;
170 UCArrayClear(level);
171
172 //set the level for results
173 SetCStr(level, gdbm_level.getcstr());
174
175
176 // do the query
177 // LuceneQuery(*indexData, queryInfo, queryTree, queryResult, level); // ****
178
179
180 // convert ExtQueryResult to queryresultclass
181
182 queryresult.docs_matched = (int)queryResult.docs.size();
183
184 if (queryresult.docs_matched == (int)queryResult.actualNumDocs) {
185 queryresult.is_approx = Exact;
186 }
187 else if (queryresult.docs_matched < (int)queryResult.actualNumDocs) {
188 queryresult.is_approx = MoreThan;
189 }
190 else {
191 queryresult.is_approx = Approximate;
192 }
193
194 docresultclass doc;
195 for (int i=0; i<(int)queryResult.docs.size(); i++) {
196 doc.clear();
197 doc.docnum = (int)queryResult.levels[i];
198 doc.docweight = queryResult.ranks[i];
199 queryresult.docs.docset[doc.docnum] = doc;
200 queryresult.docs.docorder.push_back(doc.docnum);
201
202 }
203
204 // term info
205 termfreqclass term;
206 for (int k=0; k<(int)queryResult.termFreqs.size(); k++) {
207 term.clear();
208 char* termfreq_cstr=GetCStr(queryResult.termFreqs[k].term);
209 term.termstr = to_uni(termfreq_cstr);
210 delete termfreq_cstr;
211 term.termstemstr = term.termstr;
212 // we don't set term.utf8equivterms ?? - jrm21
213 term.termfreq = queryResult.termFreqs[k].termFreq;
214 queryresult.terms.push_back(term);
215 queryresult.orgterms.push_back(term); // should this change??
216
217 for (int j=0; j<(int)queryResult.termFreqs[k].equivTerms.size(); j++) {
218 char* equivterm_cstr=GetCStr(queryResult.termFreqs[k].equivTerms[j]);
219 queryresult.termvariants.insert(to_uni(equivterm_cstr));
220 delete equivterm_cstr;
221 }
222
223 }
224 // clean up
225 delete indexname;
226 return true;
227 */
228
229 return false;
230
231}
232
233
234bool lucenesearchclass::browse_search(const queryparamclass &queryparams,
235 int start, int numDocs,
236 queryresultsclass &queryresult) {
237
238 cerr << "**** Not sure what this function does!" << endl;
239
240 /*
241#ifdef __WIN32__
242 char basepath[]="";
243#else
244 char basepath[] = "/";
245#endif
246
247 char *indexname = (filename_cat(collectdir, getindexsuffix(queryparams))).getcstr();
248
249 UCArray level;
250 UCArrayClear(level);
251
252 //browse always at top level
253 SetCStr(level, "Doc"); // this name may change.
254
255
256 BrowseQueryNode browseNode;
257 browseNode.startPosition = start;
258 browseNode.numTerms = numDocs;
259
260 BrowseQueryResult browseResult;
261
262
263 UCArrayClear(browseNode.term);
264 // greenstone gives us the query encoded in unicode. We want utf8.
265 char* utf8querystring=to_utf8(queryparams.querystring).getcstr();
266 SetCStr(browseNode.term, utf8querystring);
267 delete utf8querystring;
268
269 // do the actual query
270 // LuceneBrowseQuery(*indexData, level, browseNode, browseResult); // ****
271
272 // load results into term info
273 termfreqclass term;
274 for (int i=0; i<(int)browseResult.termFreqs.size(); i++) {
275 term.clear();
276 char* term_cstr = GetCStr(browseResult.termFreqs[i].term);
277 term.termstr = to_uni(term_cstr);
278 delete term_cstr;
279 term.termstemstr = term.termstr;
280 term.termfreq = browseResult.termFreqs[i].termFreq;
281 queryresult.terms.push_back(term);
282 queryresult.orgterms.push_back(term);
283
284 }
285 // clean up
286 delete indexname;
287
288 return true;
289
290 */
291
292 return false;
293}
294
295// the document text for 'docnum' is placed in 'output'
296// docTargetDocument returns 'true' if it was able to
297// try to get a document
298// collection is needed to see if an index from the
299// collection is loaded. THe default index bits are just there cos
300// the mg version needs them
301
302bool lucenesearchclass::docTargetDocument(const text_t &/*defaultindex*/,
303 const text_t &/*defaultsubcollection*/,
304 const text_t &/*defaultlanguage*/,
305 const text_t &collection,
306 int docnum,
307 text_t &output) {
308
309 cerr << "**** Should return document text here!" << endl;
310
311 /*
312#ifdef __WIN32__
313 char basepath[]="";
314#else
315 char basepath[] = "/";
316#endif
317 char *textname = (filename_cat(collectdir, "index", "text", collection)).getcstr();;
318
319
320 TextData textdata;
321 if(!textdata.LoadData(basepath, textname)) {
322 cout<<"couldn't load text data\n"<<endl;
323 return false;
324 }
325
326 UCArray doctext;
327 UCArray level;
328 SetCStr(level, gdbm_level.getcstr());
329 if (!GetDocText(textdata, level, (unsigned long)docnum, doctext)) {
330 cout<<"couldn't retrieve document text\n";
331 return false;
332 }
333
334 // convert UCArray to text_t
335 output.clear();
336 char* doctext_cstr = GetCStr(doctext);
337 output = to_uni(doctext_cstr); // convert from utf-8 to unicode
338 delete doctext_cstr;
339
340 // here need to remove the <Document>, <Section>, <Paragraph> tags
341
342
343 //clean up
344 textdata.UnloadData ();
345 delete textname;
346
347 return true;
348
349 */
350
351 return false;
352}
353
354// used to clear any cached databases for persistent versions of
355// Greenstone like the Windows local library
356void lucenesearchclass::unload_database () {
357}
358
359
360
361
362
Note: See TracBrowser for help on using the repository browser.