source: trunk/gsdl/src/colservr/lucenesearch.cpp@ 9904

Last change on this file since 9904 was 9620, checked in by kjdon, 19 years ago

added some x++ -> ++x changes submitted by Emanuel Dejanu

  • Property svn:keywords set to Author Date Id Revision
File size: 9.8 KB
Line 
1/**********************************************************************
2 *
3 * lucenesearch.cpp --
4 * Copyright (C) 1999-2002 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26
27#if defined(GSDL_USE_OBJECTSPACE)
28# include <ospace\std\iostream>
29#elif defined(GSDL_USE_IOS_H)
30# include <iostream.h>
31#else
32# include <iostream>
33#endif
34
35#include <stdio.h>
36#include <time.h>
37
38#include "gsdlconf.h"
39#include "gsdltools.h"
40#include "lucenesearch.h"
41#include "fileutil.h"
42#include "queryinfo.h"
43#include "gsdlunicode.h"
44
45#include "expat_resultset.h"
46
47
48static text_t getindexsuffix(const queryparamclass &qp) {
49 text_t indexsuffix = "index";
50 // get the first char of the level to be the start of the index name
51 text_t suffix = substr(qp.level.begin(), qp.level.begin()+1);
52 lc(suffix);
53 text_t ind = qp.index;
54 text_t sub = qp.subcollection;
55 text_t lang = qp.language;
56
57 // collection name not added for Lucene
58 indexsuffix = filename_cat(indexsuffix, suffix +ind + sub + lang);
59 return indexsuffix;
60
61}
62
63////////////////////
64// lucenesearch class //
65////////////////////
66
67lucenesearchclass::lucenesearchclass ()
68 : searchclass() {
69
70 gdbm_level = "Doc";
71}
72
73lucenesearchclass::~lucenesearchclass ()
74{
75 if (cache != NULL)
76 {
77 delete cache;
78 cache = NULL;
79 }
80}
81
82void lucenesearchclass::set_gdbm_level(const text_t &level) {
83 gdbm_level = level;
84
85}
86
87
88bool lucenesearchclass::search(const queryparamclass &queryparams,
89 queryresultsclass &queryresult) {
90
91#ifdef __WIN32__
92 char basepath[]="";
93#else
94 char basepath[] = "/";
95#endif
96
97 cerr << "**** in lucene search" << endl;
98
99 char *indexname = (filename_cat(collectdir, getindexsuffix(queryparams))).getcstr();
100
101 // set default stem method from values originally set on prefs page
102 int defaultStemMethod = 0;
103 if (queryparams.casefolding) {
104 defaultStemMethod |= 1;
105 }
106 if (queryparams.stemming) {
107 defaultStemMethod |= 2;
108 }
109
110 // set default Boolean combiner from all/some setting
111 // if match_mode == 1, ie all, default=1 ie AND
112 // if match_mode == 0, ie some, default=0, ie OR
113 int defaultBoolCombine = 0;
114 if (queryparams.match_mode){
115 defaultBoolCombine = 1;
116 }
117 char* utf8querystring=to_utf8(queryparams.querystring).getcstr();
118 cerr << "**** query string = " << utf8querystring << endl;
119 cerr << "***** index name = " << indexname << endl;
120
121 text_t cmd = "lucene_query.pl ";
122 cmd += (text_t)" \""+indexname + (text_t)"\" \"" + to_utf8(queryparams.querystring) + (text_t)"\"";
123
124 text_t xml_text = "";
125
126#ifdef __WIN32__
127 //FILE *PIN = _popen(cmd.getcstr(), "r"); // didn't seem to work
128 cmd = (text_t)"perl -S "+cmd;
129 // we write the result to a file
130 clock_t this_time = clock();
131 text_t filename = "luc";
132 filename.append(this_time);
133 filename.append(".txt");
134
135 text_t out_file = filename_cat(collectdir, filename);
136 cmd += (text_t)" \""+out_file+ (text_t)"\"";
137 int rv = gsdl_system(cmd, true, cerr);
138 if (rv != 0) {
139 cerr << "tried to run command \""<<cmd<<"\", but it failed\n";
140 } else {
141 read_file(out_file, xml_text);
142 remove(out_file.getcstr()); // now delete it
143 }
144#else
145 FILE *PIN = popen(cmd.getcstr(), "r");
146
147 if (PIN==NULL) {
148 perror("PIPE");
149 cerr << "Error: unable to open pipe to " << cmd << endl;
150
151 return false;
152 }
153 while (!feof(PIN)) {
154 char buffer[256];
155 int num_bytes = fread(buffer,1,256,PIN);
156 xml_text.appendcarr(buffer,num_bytes);
157 }
158
159#endif
160 expat_resultset(xml_text,queryresult);
161
162#ifdef __WIN32__
163 // _pclose(PIN);
164#else
165 pclose(PIN);
166#endif
167
168 return true;
169}
170 /*
171 // use default query info settings - change to reflect user preferences??
172 QueryInfo queryInfo;
173
174 SetCStr (queryInfo.docLevel, (queryparams.level.getcstr()));
175 queryInfo.maxDocs = (unsigned long)queryparams.maxdocs;
176 queryInfo.sortByRank = (queryparams.search_type == 1);
177 queryInfo.exactWeights = false;
178 queryInfo.needRankInfo = true; // used for overall term freq as well as ranking
179 queryInfo.needTermFreqs = true;
180
181 ExtQueryResult queryResult;
182
183 UCArray queryArray;
184 // greenstone gives us the query encoded in unicode. We want utf8.
185 char* utf8querystring=to_utf8(queryparams.querystring).getcstr();
186 SetCStr(queryArray, utf8querystring);
187 delete utf8querystring;
188
189 UCArray level;
190 UCArrayClear(level);
191
192 //set the level for results
193 SetCStr(level, gdbm_level.getcstr());
194
195
196 // do the query
197 // LuceneQuery(*indexData, queryInfo, queryTree, queryResult, level); // ****
198
199
200 // convert ExtQueryResult to queryresultclass
201
202 queryresult.docs_matched = (int)queryResult.docs.size();
203
204 if (queryresult.docs_matched == (int)queryResult.actualNumDocs) {
205 queryresult.is_approx = Exact;
206 }
207 else if (queryresult.docs_matched < (int)queryResult.actualNumDocs) {
208 queryresult.is_approx = MoreThan;
209 }
210 else {
211 queryresult.is_approx = Approximate;
212 }
213
214 docresultclass doc;
215 for (int i=0; i<(int)queryResult.docs.size(); ++i) {
216 doc.clear();
217 doc.docnum = (int)queryResult.levels[i];
218 doc.docweight = queryResult.ranks[i];
219 queryresult.docs.docset[doc.docnum] = doc;
220 queryresult.docs.docorder.push_back(doc.docnum);
221
222 }
223
224 // term info
225 termfreqclass term;
226 for (int k=0; k<(int)queryResult.termFreqs.size(); ++k) {
227 term.clear();
228 char* termfreq_cstr=GetCStr(queryResult.termFreqs[k].term);
229 term.termstr = to_uni(termfreq_cstr);
230 delete termfreq_cstr;
231 term.termstemstr = term.termstr;
232 // we don't set term.utf8equivterms ?? - jrm21
233 term.termfreq = queryResult.termFreqs[k].termFreq;
234 queryresult.terms.push_back(term);
235 queryresult.orgterms.push_back(term); // should this change??
236
237 for (int j=0; j<(int)queryResult.termFreqs[k].equivTerms.size(); ++j) {
238 char* equivterm_cstr=GetCStr(queryResult.termFreqs[k].equivTerms[j]);
239 queryresult.termvariants.insert(to_uni(equivterm_cstr));
240 delete equivterm_cstr;
241 }
242
243 }
244 // clean up
245 delete indexname;
246 return true;
247 */
248
249
250bool lucenesearchclass::browse_search(const queryparamclass &queryparams,
251 int start, int numDocs,
252 queryresultsclass &queryresult) {
253
254 cerr << "**** Not sure what this function does!" << endl;
255
256 /*
257#ifdef __WIN32__
258 char basepath[]="";
259#else
260 char basepath[] = "/";
261#endif
262
263 char *indexname = (filename_cat(collectdir, getindexsuffix(queryparams))).getcstr();
264
265 UCArray level;
266 UCArrayClear(level);
267
268 //browse always at top level
269 SetCStr(level, "Doc"); // this name may change.
270
271
272 BrowseQueryNode browseNode;
273 browseNode.startPosition = start;
274 browseNode.numTerms = numDocs;
275
276 BrowseQueryResult browseResult;
277
278
279 UCArrayClear(browseNode.term);
280 // greenstone gives us the query encoded in unicode. We want utf8.
281 char* utf8querystring=to_utf8(queryparams.querystring).getcstr();
282 SetCStr(browseNode.term, utf8querystring);
283 delete utf8querystring;
284
285 // do the actual query
286 // LuceneBrowseQuery(*indexData, level, browseNode, browseResult); // ****
287
288 // load results into term info
289 termfreqclass term;
290 for (int i=0; i<(int)browseResult.termFreqs.size(); ++i) {
291 term.clear();
292 char* term_cstr = GetCStr(browseResult.termFreqs[i].term);
293 term.termstr = to_uni(term_cstr);
294 delete term_cstr;
295 term.termstemstr = term.termstr;
296 term.termfreq = browseResult.termFreqs[i].termFreq;
297 queryresult.terms.push_back(term);
298 queryresult.orgterms.push_back(term);
299
300 }
301 // clean up
302 delete indexname;
303
304 return true;
305
306 */
307
308 return false;
309}
310
// the document text for 'docnum' is placed in 'output'
// docTargetDocument returns 'true' if it was able to
// try to get a document
// collection is needed to see if an index from the
// collection is loaded. The default index bits are just there because
// the mg version needs them
317
318bool lucenesearchclass::docTargetDocument(const text_t &/*defaultindex*/,
319 const text_t &/*defaultsubcollection*/,
320 const text_t &/*defaultlanguage*/,
321 const text_t &collection,
322 int docnum,
323 text_t &output) {
324
325 cerr << "**** Should return document text here!" << endl;
326
327 /*
328#ifdef __WIN32__
329 char basepath[]="";
330#else
331 char basepath[] = "/";
332#endif
333 char *textname = (filename_cat(collectdir, "index", "text", collection)).getcstr();;
334
335
336 TextData textdata;
337 if(!textdata.LoadData(basepath, textname)) {
338 cout<<"couldn't load text data\n"<<endl;
339 return false;
340 }
341
342 UCArray doctext;
343 UCArray level;
344 SetCStr(level, gdbm_level.getcstr());
345 if (!GetDocText(textdata, level, (unsigned long)docnum, doctext)) {
346 cout<<"couldn't retrieve document text\n";
347 return false;
348 }
349
350 // convert UCArray to text_t
351 output.clear();
352 char* doctext_cstr = GetCStr(doctext);
353 output = to_uni(doctext_cstr); // convert from utf-8 to unicode
354 delete doctext_cstr;
355
356 // here need to remove the <Document>, <Section>, <Paragraph> tags
357
358
359 //clean up
360 textdata.UnloadData ();
361 delete textname;
362
363 return true;
364
365 */
366
367 return false;
368}
369
370// used to clear any cached databases for persistent versions of
371// Greenstone like the Windows local library
372void lucenesearchclass::unload_database () {
373}
374
375
376
377
378
Note: See TracBrowser for help on using the repository browser.