source: trunk/gsdl/src/colservr/lucenesearch.cpp@ 8027

Last change on this file since 8027 was 8027, checked in by davidb, 20 years ago

Introduction of lucene*.cpp,h classes to support searching with
this Java based indexing tool.

  • Property svn:keywords set to Author Date Id Revision
File size: 8.9 KB
Line 
1/**********************************************************************
2 *
3 * lucenesearch.cpp --
4 * Copyright (C) 1999-2002 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26
27#if defined(GSDL_USE_OBJECTSPACE)
28# include <ospace\std\iostream>
29#elif defined(GSDL_USE_IOS_H)
30# include <iostream.h>
31#else
32# include <iostream>
33#endif
34
35
36#include "gsdlconf.h"
37#include "lucenesearch.h"
38#include "fileutil.h"
39#include "queryinfo.h"
40#include "gsdlunicode.h"
41
42#include "sax_resultset.h"
43
44
45static text_t getindexsuffix(const queryparamclass &qp) {
46 text_t indexsuffix = "index";
47 text_t ind = qp.index;
48 text_t sub = qp.subcollection;
49 text_t lang = qp.language;
50
51 // collection name not added for Lucene
52 indexsuffix = filename_cat(indexsuffix, ind + sub + lang);
53 return indexsuffix;
54
55}
56
57////////////////////
58// lucenesearch class //
59////////////////////
60
61lucenesearchclass::lucenesearchclass ()
62 : searchclass() {
63
64 gdbm_level = "Doc";
65}
66
67lucenesearchclass::~lucenesearchclass ()
68{
69 if (cache != NULL)
70 {
71 delete cache;
72 cache = NULL;
73 }
74}
75
76void lucenesearchclass::set_gdbm_level(const text_t &level) {
77 gdbm_level = level;
78
79}
80
81
82bool lucenesearchclass::search(const queryparamclass &queryparams,
83 queryresultsclass &queryresult) {
84
85#ifdef __WIN32__
86 char basepath[]="";
87#else
88 char basepath[] = "/";
89#endif
90
91 cerr << "**** in luecen search" << endl;
92
93 char *indexname = (filename_cat(collectdir, getindexsuffix(queryparams))).getcstr();
94
95 // set default stem method from values originally set on prefs page
96 int defaultStemMethod = 0;
97 if (queryparams.casefolding) {
98 defaultStemMethod |= 1;
99 }
100 if (queryparams.stemming) {
101 defaultStemMethod |= 2;
102 }
103
104 // set default Boolean combiner from all/some setting
105 // if match_mode == 1, ie all, default=1 ie AND
106 // if match_mode == 0, ie some, default=0, ie OR
107 int defaultBoolCombine = 0;
108 if (queryparams.match_mode){
109 defaultBoolCombine = 1;
110 }
111 char* utf8querystring=to_utf8(queryparams.querystring).getcstr();
112 cerr << "**** query string = " << utf8querystring << endl;
113 cerr << "***** index name = " << indexname << endl;
114
115 text_t cmd = "lucene_query.pl ";
116 cmd += indexname + (text_t)" " + to_utf8(queryparams.querystring);
117
118
119 FILE *PIN = popen(cmd.getcstr(),"r");
120 if (PIN==NULL) {
121 cerr << "Error: unable to open pipe to " << cmd << endl;
122 return false;
123 }
124
125 text_t xml_text = "";
126
127 while (!feof(PIN)) {
128 char buffer[256];
129 int num_bytes = fread(buffer,1,256,PIN);
130 xml_text.appendcarr(buffer,num_bytes);
131 }
132
133 sax_resultset(xml_text,queryresult);
134
135 pclose(PIN);
136
137 return true;
138
139 /*
140 // use default query info settings - change to reflect user preferences??
141 QueryInfo queryInfo;
142
143 SetCStr (queryInfo.docLevel, (queryparams.level.getcstr()));
144 queryInfo.maxDocs = (unsigned long)queryparams.maxdocs;
145 queryInfo.sortByRank = (queryparams.search_type == 1);
146 queryInfo.exactWeights = false;
147 queryInfo.needRankInfo = true; // used for overall term freq as well as ranking
148 queryInfo.needTermFreqs = true;
149
150 ExtQueryResult queryResult;
151
152 UCArray queryArray;
153 // greenstone gives us the query encoded in unicode. We want utf8.
154 char* utf8querystring=to_utf8(queryparams.querystring).getcstr();
155 SetCStr(queryArray, utf8querystring);
156 delete utf8querystring;
157
158 UCArray level;
159 UCArrayClear(level);
160
161 //set the level for results
162 SetCStr(level, gdbm_level.getcstr());
163
164
165 // do the query
166 // LuceneQuery(*indexData, queryInfo, queryTree, queryResult, level); // ****
167
168
169 // convert ExtQueryResult to queryresultclass
170
171 queryresult.docs_matched = (int)queryResult.docs.size();
172
173 if (queryresult.docs_matched == (int)queryResult.actualNumDocs) {
174 queryresult.is_approx = Exact;
175 }
176 else if (queryresult.docs_matched < (int)queryResult.actualNumDocs) {
177 queryresult.is_approx = MoreThan;
178 }
179 else {
180 queryresult.is_approx = Approximate;
181 }
182
183 docresultclass doc;
184 for (int i=0; i<(int)queryResult.docs.size(); i++) {
185 doc.clear();
186 doc.docnum = (int)queryResult.levels[i];
187 doc.docweight = queryResult.ranks[i];
188 queryresult.docs.docset[doc.docnum] = doc;
189 queryresult.docs.docorder.push_back(doc.docnum);
190
191 }
192
193 // term info
194 termfreqclass term;
195 for (int k=0; k<(int)queryResult.termFreqs.size(); k++) {
196 term.clear();
197 char* termfreq_cstr=GetCStr(queryResult.termFreqs[k].term);
198 term.termstr = to_uni(termfreq_cstr);
199 delete termfreq_cstr;
200 term.termstemstr = term.termstr;
201 // we don't set term.utf8equivterms ?? - jrm21
202 term.termfreq = queryResult.termFreqs[k].termFreq;
203 queryresult.terms.push_back(term);
204 queryresult.orgterms.push_back(term); // should this change??
205
206 for (int j=0; j<(int)queryResult.termFreqs[k].equivTerms.size(); j++) {
207 char* equivterm_cstr=GetCStr(queryResult.termFreqs[k].equivTerms[j]);
208 queryresult.termvariants.insert(to_uni(equivterm_cstr));
209 delete equivterm_cstr;
210 }
211
212 }
213 // clean up
214 delete indexname;
215 return true;
216 */
217
218 return false;
219
220}
221
222
223bool lucenesearchclass::browse_search(const queryparamclass &queryparams,
224 int start, int numDocs,
225 queryresultsclass &queryresult) {
226
227 cerr << "**** Not sure what this function does!" << endl;
228
229 /*
230#ifdef __WIN32__
231 char basepath[]="";
232#else
233 char basepath[] = "/";
234#endif
235
236 char *indexname = (filename_cat(collectdir, getindexsuffix(queryparams))).getcstr();
237
238 UCArray level;
239 UCArrayClear(level);
240
241 //browse always at top level
242 SetCStr(level, "Doc"); // this name may change.
243
244
245 BrowseQueryNode browseNode;
246 browseNode.startPosition = start;
247 browseNode.numTerms = numDocs;
248
249 BrowseQueryResult browseResult;
250
251
252 UCArrayClear(browseNode.term);
253 // greenstone gives us the query encoded in unicode. We want utf8.
254 char* utf8querystring=to_utf8(queryparams.querystring).getcstr();
255 SetCStr(browseNode.term, utf8querystring);
256 delete utf8querystring;
257
258 // do the actual query
259 // LuceneBrowseQuery(*indexData, level, browseNode, browseResult); // ****
260
261 // load results into term info
262 termfreqclass term;
263 for (int i=0; i<(int)browseResult.termFreqs.size(); i++) {
264 term.clear();
265 char* term_cstr = GetCStr(browseResult.termFreqs[i].term);
266 term.termstr = to_uni(term_cstr);
267 delete term_cstr;
268 term.termstemstr = term.termstr;
269 term.termfreq = browseResult.termFreqs[i].termFreq;
270 queryresult.terms.push_back(term);
271 queryresult.orgterms.push_back(term);
272
273 }
274 // clean up
275 delete indexname;
276
277 return true;
278
279 */
280
281 return false;
282}
283
284// the document text for 'docnum' is placed in 'output'
285// docTargetDocument returns 'true' if it was able to
286// try to get a document
287// collection is needed to see if an index from the
288// collection is loaded. THe default index bits are just there cos
289// the mg version needs them
290
291bool lucenesearchclass::docTargetDocument(const text_t &/*defaultindex*/,
292 const text_t &/*defaultsubcollection*/,
293 const text_t &/*defaultlanguage*/,
294 const text_t &collection,
295 int docnum,
296 text_t &output) {
297
298 cerr << "**** Should return document text here!" << endl;
299
300 /*
301#ifdef __WIN32__
302 char basepath[]="";
303#else
304 char basepath[] = "/";
305#endif
306 char *textname = (filename_cat(collectdir, "index", "text", collection)).getcstr();;
307
308
309 TextData textdata;
310 if(!textdata.LoadData(basepath, textname)) {
311 cout<<"couldn't load text data\n"<<endl;
312 return false;
313 }
314
315 UCArray doctext;
316 UCArray level;
317 SetCStr(level, gdbm_level.getcstr());
318 if (!GetDocText(textdata, level, (unsigned long)docnum, doctext)) {
319 cout<<"couldn't retrieve document text\n";
320 return false;
321 }
322
323 // convert UCArray to text_t
324 output.clear();
325 char* doctext_cstr = GetCStr(doctext);
326 output = to_uni(doctext_cstr); // convert from utf-8 to unicode
327 delete doctext_cstr;
328
329 // here need to remove the <Document>, <Section>, <Paragraph> tags
330
331
332 //clean up
333 textdata.UnloadData ();
334 delete textname;
335
336 return true;
337
338 */
339
340 return false;
341}
342
343// used to clear any cached databases for persistent versions of
344// Greenstone like the Windows local library
345void lucenesearchclass::unload_database () {
346}
347
348
349
350
351
Note: See TracBrowser for help on using the repository browser.