source: trunk/gsdl/src/colservr/lucenesearch.cpp@ 9089

Last change on this file since 9089 was 9089, checked in by kjdon, 19 years ago

the index name is actually the first char of the level (d, s, p) plus the index name (idx) - in getindexsuffix

  • Property svn:keywords set to Author Date Id Revision
File size: 9.1 KB
Line 
1/**********************************************************************
2 *
3 * lucenesearch.cpp --
4 * Copyright (C) 1999-2002 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26
27#if defined(GSDL_USE_OBJECTSPACE)
28# include <ospace\std\iostream>
29#elif defined(GSDL_USE_IOS_H)
30# include <iostream.h>
31#else
32# include <iostream>
33#endif
34
35#include <stdio.h>
36
37#include "gsdlconf.h"
38#include "lucenesearch.h"
39#include "fileutil.h"
40#include "queryinfo.h"
41#include "gsdlunicode.h"
42
43#include "expat_resultset.h"
44
45
46static text_t getindexsuffix(const queryparamclass &qp) {
47 text_t indexsuffix = "index";
48 // get the first char of the level to be the start of the index name
49 text_t suffix = substr(qp.level.begin(), qp.level.begin()+1);
50 lc(suffix);
51 text_t ind = qp.index;
52 text_t sub = qp.subcollection;
53 text_t lang = qp.language;
54
55 // collection name not added for Lucene
56 indexsuffix = filename_cat(indexsuffix, suffix +ind + sub + lang);
57 return indexsuffix;
58
59}
60
61////////////////////
62// lucenesearch class //
63////////////////////
64
65lucenesearchclass::lucenesearchclass ()
66 : searchclass() {
67
68 gdbm_level = "Doc";
69}
70
71lucenesearchclass::~lucenesearchclass ()
72{
73 if (cache != NULL)
74 {
75 delete cache;
76 cache = NULL;
77 }
78}
79
80void lucenesearchclass::set_gdbm_level(const text_t &level) {
81 gdbm_level = level;
82
83}
84
85
86bool lucenesearchclass::search(const queryparamclass &queryparams,
87 queryresultsclass &queryresult) {
88
89#ifdef __WIN32__
90 char basepath[]="";
91#else
92 char basepath[] = "/";
93#endif
94
95 cerr << "**** in luecen search" << endl;
96
97 char *indexname = (filename_cat(collectdir, getindexsuffix(queryparams))).getcstr();
98
99 // set default stem method from values originally set on prefs page
100 int defaultStemMethod = 0;
101 if (queryparams.casefolding) {
102 defaultStemMethod |= 1;
103 }
104 if (queryparams.stemming) {
105 defaultStemMethod |= 2;
106 }
107
108 // set default Boolean combiner from all/some setting
109 // if match_mode == 1, ie all, default=1 ie AND
110 // if match_mode == 0, ie some, default=0, ie OR
111 int defaultBoolCombine = 0;
112 if (queryparams.match_mode){
113 defaultBoolCombine = 1;
114 }
115 char* utf8querystring=to_utf8(queryparams.querystring).getcstr();
116 cerr << "**** query string = " << utf8querystring << endl;
117 cerr << "***** index name = " << indexname << endl;
118
119 text_t cmd = "lucene_query.pl ";
120 cmd += indexname + (text_t)" \"" + to_utf8(queryparams.querystring) + (text_t)"\"";
121
122
123 FILE *PIN = popen(cmd.getcstr(),"r");
124 if (PIN==NULL) {
125 cerr << "Error: unable to open pipe to " << cmd << endl;
126 return false;
127 }
128
129 text_t xml_text = "";
130
131 while (!feof(PIN)) {
132 char buffer[256];
133 int num_bytes = fread(buffer,1,256,PIN);
134 xml_text.appendcarr(buffer,num_bytes);
135 }
136
137 expat_resultset(xml_text,queryresult);
138
139 pclose(PIN);
140
141 return true;
142
143 /*
144 // use default query info settings - change to reflect user preferences??
145 QueryInfo queryInfo;
146
147 SetCStr (queryInfo.docLevel, (queryparams.level.getcstr()));
148 queryInfo.maxDocs = (unsigned long)queryparams.maxdocs;
149 queryInfo.sortByRank = (queryparams.search_type == 1);
150 queryInfo.exactWeights = false;
151 queryInfo.needRankInfo = true; // used for overall term freq as well as ranking
152 queryInfo.needTermFreqs = true;
153
154 ExtQueryResult queryResult;
155
156 UCArray queryArray;
157 // greenstone gives us the query encoded in unicode. We want utf8.
158 char* utf8querystring=to_utf8(queryparams.querystring).getcstr();
159 SetCStr(queryArray, utf8querystring);
160 delete utf8querystring;
161
162 UCArray level;
163 UCArrayClear(level);
164
165 //set the level for results
166 SetCStr(level, gdbm_level.getcstr());
167
168
169 // do the query
170 // LuceneQuery(*indexData, queryInfo, queryTree, queryResult, level); // ****
171
172
173 // convert ExtQueryResult to queryresultclass
174
175 queryresult.docs_matched = (int)queryResult.docs.size();
176
177 if (queryresult.docs_matched == (int)queryResult.actualNumDocs) {
178 queryresult.is_approx = Exact;
179 }
180 else if (queryresult.docs_matched < (int)queryResult.actualNumDocs) {
181 queryresult.is_approx = MoreThan;
182 }
183 else {
184 queryresult.is_approx = Approximate;
185 }
186
187 docresultclass doc;
188 for (int i=0; i<(int)queryResult.docs.size(); i++) {
189 doc.clear();
190 doc.docnum = (int)queryResult.levels[i];
191 doc.docweight = queryResult.ranks[i];
192 queryresult.docs.docset[doc.docnum] = doc;
193 queryresult.docs.docorder.push_back(doc.docnum);
194
195 }
196
197 // term info
198 termfreqclass term;
199 for (int k=0; k<(int)queryResult.termFreqs.size(); k++) {
200 term.clear();
201 char* termfreq_cstr=GetCStr(queryResult.termFreqs[k].term);
202 term.termstr = to_uni(termfreq_cstr);
203 delete termfreq_cstr;
204 term.termstemstr = term.termstr;
205 // we don't set term.utf8equivterms ?? - jrm21
206 term.termfreq = queryResult.termFreqs[k].termFreq;
207 queryresult.terms.push_back(term);
208 queryresult.orgterms.push_back(term); // should this change??
209
210 for (int j=0; j<(int)queryResult.termFreqs[k].equivTerms.size(); j++) {
211 char* equivterm_cstr=GetCStr(queryResult.termFreqs[k].equivTerms[j]);
212 queryresult.termvariants.insert(to_uni(equivterm_cstr));
213 delete equivterm_cstr;
214 }
215
216 }
217 // clean up
218 delete indexname;
219 return true;
220 */
221
222 return false;
223
224}
225
226
227bool lucenesearchclass::browse_search(const queryparamclass &queryparams,
228 int start, int numDocs,
229 queryresultsclass &queryresult) {
230
231 cerr << "**** Not sure what this function does!" << endl;
232
233 /*
234#ifdef __WIN32__
235 char basepath[]="";
236#else
237 char basepath[] = "/";
238#endif
239
240 char *indexname = (filename_cat(collectdir, getindexsuffix(queryparams))).getcstr();
241
242 UCArray level;
243 UCArrayClear(level);
244
245 //browse always at top level
246 SetCStr(level, "Doc"); // this name may change.
247
248
249 BrowseQueryNode browseNode;
250 browseNode.startPosition = start;
251 browseNode.numTerms = numDocs;
252
253 BrowseQueryResult browseResult;
254
255
256 UCArrayClear(browseNode.term);
257 // greenstone gives us the query encoded in unicode. We want utf8.
258 char* utf8querystring=to_utf8(queryparams.querystring).getcstr();
259 SetCStr(browseNode.term, utf8querystring);
260 delete utf8querystring;
261
262 // do the actual query
263 // LuceneBrowseQuery(*indexData, level, browseNode, browseResult); // ****
264
265 // load results into term info
266 termfreqclass term;
267 for (int i=0; i<(int)browseResult.termFreqs.size(); i++) {
268 term.clear();
269 char* term_cstr = GetCStr(browseResult.termFreqs[i].term);
270 term.termstr = to_uni(term_cstr);
271 delete term_cstr;
272 term.termstemstr = term.termstr;
273 term.termfreq = browseResult.termFreqs[i].termFreq;
274 queryresult.terms.push_back(term);
275 queryresult.orgterms.push_back(term);
276
277 }
278 // clean up
279 delete indexname;
280
281 return true;
282
283 */
284
285 return false;
286}
287
288// the document text for 'docnum' is placed in 'output'
289// docTargetDocument returns 'true' if it was able to
290// try to get a document
291// collection is needed to see if an index from the
292// collection is loaded. THe default index bits are just there cos
293// the mg version needs them
294
295bool lucenesearchclass::docTargetDocument(const text_t &/*defaultindex*/,
296 const text_t &/*defaultsubcollection*/,
297 const text_t &/*defaultlanguage*/,
298 const text_t &collection,
299 int docnum,
300 text_t &output) {
301
302 cerr << "**** Should return document text here!" << endl;
303
304 /*
305#ifdef __WIN32__
306 char basepath[]="";
307#else
308 char basepath[] = "/";
309#endif
310 char *textname = (filename_cat(collectdir, "index", "text", collection)).getcstr();;
311
312
313 TextData textdata;
314 if(!textdata.LoadData(basepath, textname)) {
315 cout<<"couldn't load text data\n"<<endl;
316 return false;
317 }
318
319 UCArray doctext;
320 UCArray level;
321 SetCStr(level, gdbm_level.getcstr());
322 if (!GetDocText(textdata, level, (unsigned long)docnum, doctext)) {
323 cout<<"couldn't retrieve document text\n";
324 return false;
325 }
326
327 // convert UCArray to text_t
328 output.clear();
329 char* doctext_cstr = GetCStr(doctext);
330 output = to_uni(doctext_cstr); // convert from utf-8 to unicode
331 delete doctext_cstr;
332
333 // here need to remove the <Document>, <Section>, <Paragraph> tags
334
335
336 //clean up
337 textdata.UnloadData ();
338 delete textname;
339
340 return true;
341
342 */
343
344 return false;
345}
346
347// used to clear any cached databases for persistent versions of
348// Greenstone like the Windows local library
349void lucenesearchclass::unload_database () {
350}
351
352
353
354
355
Note: See TracBrowser for help on using the repository browser.