source: gsdl/trunk/src/colservr/lucenesearch.cpp@ 15049

Last change on this file since 15049 was 15049, checked in by mdewsnip, 14 years ago

Fixed the Lucene jar file name and class name in the commented out code for using Lucene without going through lucene_query.pl.

  • Property svn:keywords set to Author Date Id Revision
File size: 6.8 KB
Line 
1/**********************************************************************
2 *
3 * lucenesearch.cpp --
4 * Copyright (C) 1999-2002 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26
27#if defined(GSDL_USE_OBJECTSPACE)
28# include <ospace\std\iostream>
29#elif defined(GSDL_USE_IOS_H)
30# include <iostream.h>
31#else
32# include <iostream>
33#endif
34
35#include <stdio.h>
36#include <time.h>
37
38#include "gsdlconf.h"
39#include "gsdltools.h"
40#include "lucenesearch.h"
41#include "fileutil.h"
42#include "queryinfo.h"
43#include "gsdlunicode.h"
44
45#include "expat_resultset.h"
46
47text_t lucenesearchclass::getindexsuffix(const queryparamclass &qp) {
48 text_t indexsuffix = "index";
49 // get the first char of the level to be the start of the index name
50 text_t suffix = substr(qp.level.begin(), qp.level.begin()+1);
51 lc(suffix);
52 text_t ind = qp.index;
53 text_t sub = qp.subcollection;
54 text_t lang = qp.language;
55
56 // collection name not added for Lucene
57 indexsuffix = filename_cat(indexsuffix, suffix +ind + sub + lang);
58 return indexsuffix;
59
60}
61
62////////////////////
63// lucenesearch class //
64////////////////////
65
66lucenesearchclass::lucenesearchclass ()
67 : searchclass() {
68
69 gdbm_level = "Doc";
70}
71
72lucenesearchclass::~lucenesearchclass ()
73{
74 if (cache != NULL)
75 {
76 delete cache;
77 cache = NULL;
78 }
79}
80
81void lucenesearchclass::set_gdbm_level(const text_t &level) {
82 gdbm_level = level;
83
84}
85
86bool lucenesearchclass::search(const queryparamclass &queryparams,
87 queryresultsclass &queryresult) {
88
89#ifdef __WIN32__
90 char basepath[]="";
91#else
92 char basepath[] = "/";
93#endif
94
95 char *indexname = (filename_cat(collectdir, getindexsuffix(queryparams))).getcstr();
96
97 // set default stem method from values originally set on prefs page
98 int defaultStemMethod = 0;
99 if (queryparams.casefolding) {
100 defaultStemMethod |= 1;
101 }
102 if (queryparams.stemming) {
103 defaultStemMethod |= 2;
104 }
105
106 text_t utf8querystring = to_utf8(queryparams.querystring);
107
108 text_t escaped_utf8querystring = "";
109 text_t::const_iterator here = utf8querystring.begin();
110 while (here != utf8querystring.end()) {
111 if (*here == '"') escaped_utf8querystring.push_back('\\');
112 escaped_utf8querystring.push_back(*here);
113 ++here;
114 }
115
116 // Use this to skip lucene_query.pl and access GS2LuceneQuery directly (Java must be on path)
117 // text_t cmd = "java -classpath \"" + filename_cat(gsdlhome, "bin", "java", "LuceneWrapper.jar") + "\" org.greenstone.LuceneWrapper.GS2LuceneQuery ";
118 text_t cmd = "\"" + filename_cat(gsdlhome, "bin", "script", "lucene_query.pl") + "\"";
119 cmd += (text_t)" \""+indexname + (text_t)"\" \"" + escaped_utf8querystring + (text_t)"\"";
120
121 if (!queryparams.filterstring.empty()) {
122 cmd += " -filter \"" + queryparams.filterstring + "\"";
123 }
124 if (!queryparams.sortfield.empty()) {
125 cmd += " -sort \"" + queryparams.sortfield + "\"";
126 }
127 if (!queryparams.fuzziness.empty()) {
128 cmd += " -fuzziness " + queryparams.fuzziness;
129 }
130
131 // New code to support configuration of the default conjuction operator
132 // set default Boolean combiner from all/some setting
133 // if match_mode == 1, ie all, default=1 ie AND
134 // if match_mode == 0, ie some, default=0, ie OR
135 if (queryparams.match_mode)
136 {
137 cmd += " -dco AND";
138 }
139
140 // New code to allow Lucene to do paging of search results. This should
141 // substantially improve performance as we don't have to return all 12000
142 // hits if we only need the first 20!
143 if (queryparams.startresults && queryparams.endresults)
144 {
145 cmd += (text_t)" -startresults " + queryparams.startresults;
146 cmd += (text_t)" -endresults " + queryparams.endresults;
147 }
148
149 cerr << "Lucene command: " << cmd << endl;
150
151 text_t xml_text = "";
152
153 // I don't want to do this, but I have to.
154 text_t gsdlhome_env = "GSDLHOME=" + gsdlhome;
155 putenv(gsdlhome_env.getcstr());
156
157#ifdef __WIN32__
158 putenv("GSDLOS=windows");
159
160 //FILE *PIN = _popen(cmd.getcstr(), "r"); // didn't seem to work
161 cmd = (text_t)"perl -S "+cmd;
162 // we write the result to a file
163 clock_t this_time = clock();
164 text_t filename = "luc";
165 filename.append(this_time);
166 filename.append(".txt");
167
168 text_t out_file = filename_cat(collectdir, filename);
169 cmd += " -out \"" + out_file + "\"";
170 int rv = gsdl_system(cmd, true, cerr);
171 if (rv != 0) {
172 cerr << "tried to run command \""<<cmd<<"\", but it failed\n";
173 } else {
174 read_file(out_file, xml_text);
175 remove(out_file.getcstr()); // now delete it
176 }
177#else
178 putenv("GSDLOS=linux");
179
180 FILE *PIN = popen(cmd.getcstr(), "r");
181
182 if (PIN==NULL) {
183 perror("PIPE");
184 cerr << "Error: unable to open pipe to " << cmd << endl;
185
186 return false;
187 }
188 while (!feof(PIN)) {
189 char buffer[256];
190 int num_bytes = fread(buffer,1,256,PIN);
191 xml_text.appendcarr(buffer,num_bytes);
192 }
193
194#endif
195
196 expat_resultset(xml_text,queryresult);
197
198#ifdef __WIN32__
199 // _pclose(PIN);
200#else
201 pclose(PIN);
202#endif
203
204 return true;
205}
206
207
208bool lucenesearchclass::browse_search(const queryparamclass &queryparams,
209 int start, int numDocs,
210 queryresultsclass &queryresult) {
211
212 cerr << "**** Not sure what this function does!" << endl;
213 return false;
214}
215
216// the document text for 'docnum' is placed in 'output'
217// docTargetDocument returns 'true' if it was able to
218// try to get a document
219// collection is needed to see if an index from the
220// collection is loaded. THe default index bits are just there cos
221// the mg version needs them
222
223bool lucenesearchclass::docTargetDocument(const text_t &/*defaultindex*/,
224 const text_t &/*defaultsubcollection*/,
225 const text_t &/*defaultlanguage*/,
226 const text_t &collection,
227 int docnum,
228 text_t &output) {
229
230 // we now get the document directly by lucenegdbmsource, so don't use this
231 // method
232 return false;
233}
234
235// used to clear any cached databases for persistent versions of
236// Greenstone like the Windows local library
237void lucenesearchclass::unload_database () {
238}
239
240void lucenesearchclass::set_gsdlhome (const text_t &gh)
241{
242 gsdlhome = gh;
243}
Note: See TracBrowser for help on using the repository browser.