source: trunk/gsdl/src/colservr/mgppsearch.cpp@ 12018

Last change on this file since 12018 was 9937, checked in by kjdon, 19 years ago

Modified the filters/sources etc. so that if an indexstem is specified in the build.cfg file, it will be used as the root of the index/gdbm filenames instead of the collection name. The collection name is still used by default. This means that we can rename a collection directory without rebuilding.

  • Property svn:keywords set to Author Date Id Revision
File size: 9.0 KB
Line 
1/**********************************************************************
2 *
3 * mgppsearch.cpp --
4 * Copyright (C) 1999-2002 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26
27#include "gsdlconf.h"
28#include "mgppsearch.h"
29#include "fileutil.h"
30#include "GSDLQueryParser.h"
31#include "MGQuery.h"
32#include "TextGet.h"
33#include "queryinfo.h"
34#include "gsdlunicode.h"
35
36
37text_t mgppsearchclass::getindexsuffix(const queryparamclass &qp) {
38 return getindexsuffix(qp.collection, qp.index+qp.subcollection+qp.language);
39
40}
41
42text_t mgppsearchclass::getindexsuffix (const text_t &collection,
43 const text_t &index) {
44
45 text_t indexsuffix = "index";
46 indexsuffix = filename_cat (indexsuffix, index);
47 if (indexstem.empty()) {
48 // no index stem, use the coll name
49 indexsuffix = filename_cat (indexsuffix, collection);
50 } else {
51 indexsuffix = filename_cat (indexsuffix, indexstem);
52 }
53 return indexsuffix;
54}
55
56////////////////////
57// mgppsearch class //
58////////////////////
59
60mgppsearchclass::mgppsearchclass ()
61 : searchclass() {
62
63 gdbm_level = "Doc";
64 indexData = NULL;
65}
66
67mgppsearchclass::~mgppsearchclass ()
68{
69 if (cache != NULL)
70 {
71 delete cache;
72 cache = NULL;
73 }
74
75 if (indexData !=NULL) {
76 indexData->UnloadData();
77 delete indexData;
78 indexData = NULL;
79 }
80
81}
82
83void mgppsearchclass::set_gdbm_level(const text_t &level) {
84 gdbm_level = level;
85
86}
87void mgppsearchclass::set_indexstem(const text_t &stem) {
88 indexstem = stem;
89
90}
91
92
93bool mgppsearchclass::search(const queryparamclass &queryparams,
94 queryresultsclass &queryresult) {
95
96#ifdef __WIN32__
97 char basepath[]="";
98#else
99 char basepath[] = "/";
100#endif
101
102 char *indexname = (filename_cat(collectdir, getindexsuffix(queryparams))).getcstr();
103
104 // load index data
105 if (indexData == NULL) {
106 indexData = new IndexData();
107 }
108 if (!indexData->LoadData (basepath, indexname)) {
109 cerr<<"couldn't load index data\n"<<endl;
110 return false;
111 }
112
113 // set default stem method from values originally set on prefs page
114 int defaultStemMethod = 0;
115 if (queryparams.casefolding) {
116 defaultStemMethod |= 1;
117 }
118 if (queryparams.stemming) {
119 defaultStemMethod |= 2;
120 }
121
122 // set default Boolean combiner from all/some setting
123 // if match_mode == 1, ie all, default=1 ie AND
124 // if match_mode == 0, ie some, default=0, ie OR
125 int defaultBoolCombine = 0;
126 if (queryparams.match_mode){
127 defaultBoolCombine = 1;
128 }
129
130 // use default query info settings - change to reflect user preferences??
131 QueryInfo queryInfo;
132
133 SetCStr (queryInfo.docLevel, (queryparams.level.getcstr()));
134 queryInfo.maxDocs = (unsigned long)queryparams.maxdocs;
135 queryInfo.sortByRank = (queryparams.search_type == 1);
136 queryInfo.exactWeights = false;
137 queryInfo.needRankInfo = true; // used for overall term freq as well as ranking
138 queryInfo.needTermFreqs = true;
139
140 ExtQueryResult queryResult;
141
142 UCArray queryArray;
143 // greenstone gives us the query encoded in unicode. We want utf8.
144 char* utf8querystring=to_utf8(queryparams.querystring).getcstr();
145 SetCStr(queryArray, utf8querystring);
146 delete []utf8querystring;
147
148 // create the mgpp query tree
149 QueryNode *queryTree = NULL;
150 queryTree = ParseQuery(queryArray, defaultBoolCombine, defaultStemMethod);
151 if (queryTree == NULL) { // syntax error
152 queryresult.syntax_error = true;
153 return true; // should we return true or false?
154 }
155 UCArray level;
156 UCArrayClear(level);
157
158 //set the level for results
159 SetCStr(level, gdbm_level.getcstr());
160
161
162 // do the query
163 MGQuery(*indexData, queryInfo, queryTree, queryResult, level);
164
165
166 // convert ExtQueryResult to queryresultclass
167
168 queryresult.docs_matched = (int)queryResult.docs.size();
169
170 if (queryresult.docs_matched == (int)queryResult.actualNumDocs) {
171 queryresult.is_approx = Exact;
172 }
173 else if (queryresult.docs_matched < (int)queryResult.actualNumDocs) {
174 queryresult.is_approx = MoreThan;
175 }
176 else {
177 queryresult.is_approx = Approximate;
178 }
179
180 docresultclass doc;
181 for (int i=0; i<(int)queryResult.docs.size(); ++i) {
182 doc.clear();
183 doc.docnum = (int)queryResult.levels[i];
184 doc.docweight = queryResult.ranks[i];
185 queryresult.docs.docset[doc.docnum] = doc;
186 queryresult.docs.docorder.push_back(doc.docnum);
187
188 }
189
190 // term info
191 termfreqclass term;
192 for (int k=0; k<(int)queryResult.termFreqs.size(); ++k) {
193 term.clear();
194 char* termfreq_cstr=GetCStr(queryResult.termFreqs[k].term);
195 term.termstr = to_uni(termfreq_cstr);
196 delete []termfreq_cstr;
197 term.termstemstr = term.termstr;
198 // we don't set term.utf8equivterms ?? - jrm21
199 term.termfreq = queryResult.termFreqs[k].termFreq;
200 queryresult.terms.push_back(term);
201 queryresult.orgterms.push_back(term); // should this change??
202
203 for (int j=0; j<(int)queryResult.termFreqs[k].equivTerms.size(); ++j) {
204 char* equivterm_cstr=GetCStr(queryResult.termFreqs[k].equivTerms[j]);
205 queryresult.termvariants.insert(to_uni(equivterm_cstr));
206 delete []equivterm_cstr;
207 }
208
209 }
210 // clean up
211 delete []indexname;
212 return true;
213
214}
215
216
217bool mgppsearchclass::browse_search(const queryparamclass &queryparams, int start, int numDocs,
218 queryresultsclass &queryresult) {
219
220#ifdef __WIN32__
221 char basepath[]="";
222#else
223 char basepath[] = "/";
224#endif
225
226 char *indexname = (filename_cat(collectdir, getindexsuffix(queryparams))).getcstr();
227
228 if (indexData == NULL) {
229 indexData = new IndexData();
230 }
231 if (!indexData->LoadData (basepath, indexname)) {
232 cerr<<"couldn't load index data\n"<<endl;
233 return false;
234 }
235
236 UCArray level;
237 UCArrayClear(level);
238
239 //browse always at top level
240 SetCStr(level, "Doc"); // this name may change.
241
242
243 BrowseQueryNode browseNode;
244 browseNode.startPosition = start;
245 browseNode.numTerms = numDocs;
246
247 BrowseQueryResult browseResult;
248
249 UCArrayClear(browseNode.term);
250 // greenstone gives us the query encoded in unicode. We want utf8.
251 char* utf8querystring=to_utf8(queryparams.querystring).getcstr();
252 SetCStr(browseNode.term, utf8querystring);
253 delete []utf8querystring;
254
255 // do the actual query
256 MGBrowseQuery(*indexData, level, browseNode, browseResult);
257
258 // load results into term info
259 termfreqclass term;
260 for (int i=0; i<(int)browseResult.termFreqs.size(); ++i) {
261 term.clear();
262 char* term_cstr = GetCStr(browseResult.termFreqs[i].term);
263 term.termstr = to_uni(term_cstr);
264 delete []term_cstr;
265 term.termstemstr = term.termstr;
266 term.termfreq = browseResult.termFreqs[i].termFreq;
267 queryresult.terms.push_back(term);
268 queryresult.orgterms.push_back(term);
269
270 }
271 // clean up
272 delete []indexname;
273
274 return true;
275}
276
277// the document text for 'docnum' is placed in 'output'
278// docTargetDocument returns 'true' if it was able to
279// try to get a document
280// collection is needed to see if an index from the
281// collection is loaded. THe default index bits are just there cos
282// the mg version needs them
283
284bool mgppsearchclass::docTargetDocument(const text_t &/*defaultindex*/,
285 const text_t &/*defaultsubcollection*/,
286 const text_t &/*defaultlanguage*/,
287 const text_t &collection,
288 int docnum,
289 text_t &output) {
290
291#ifdef __WIN32__
292 char basepath[]="";
293#else
294 char basepath[] = "/";
295#endif
296 char *textname = (filename_cat(collectdir, getindexsuffix(collection, "text"))).getcstr();
297
298 TextData textdata;
299 if(!textdata.LoadData(basepath, textname)) {
300 cout<<"couldn't load text data\n"<<endl;
301 return false;
302 }
303 UCArray doctext;
304 UCArray level;
305 SetCStr(level, gdbm_level.getcstr());
306 if (!GetDocText(textdata, level, (unsigned long)docnum, doctext)) {
307 cout<<"couldn't retrieve document text\n";
308 return false;
309 }
310
311 // convert UCArray to text_t
312 output.clear();
313 char* doctext_cstr = GetCStr(doctext);
314 output = to_uni(doctext_cstr); // convert from utf-8 to unicode
315 delete []doctext_cstr;
316
317 // here need to remove the <Document>, <Section>, <Paragraph> tags
318
319
320 //clean up
321 textdata.UnloadData ();
322 delete []textname;
323
324 return true;
325
326}
327
328// used to clear any cached databases for persistent versions of
329// Greenstone like the Windows local library
330void mgppsearchclass::unload_database () {
331
332 if (indexData !=NULL) {
333 indexData->UnloadData();
334 }
335}
336
337
338
339
340
Note: See TracBrowser for help on using the repository browser.