source: main/trunk/greenstone2/runtime-src/src/colservr/mgppsearch.cpp@ 27064

Last change on this file since 27064 was 15594, checked in by mdewsnip, 16 years ago

(Adding new DB support) Replaced all "gdbm_level" with "text_level".

  • Property svn:keywords set to Author Date Id Revision
File size: 9.3 KB
Line 
1/**********************************************************************
2 *
3 * mgppsearch.cpp --
4 * Copyright (C) 1999-2002 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26
27#include "gsdlconf.h"
28#include "mgppsearch.h"
29#include "fileutil.h"
30#include "GSDLQueryParser.h"
31#include "MGQuery.h"
32#include "TextGet.h"
33#include "queryinfo.h"
34#include "gsdlunicode.h"
35
36
37text_t mgppsearchclass::getindexsuffix(const queryparamclass &qp) {
38 return getindexsuffix(qp.collection, qp.index+qp.subcollection+qp.language);
39
40}
41
42text_t mgppsearchclass::getindexsuffix (const text_t &collection,
43 const text_t &index) {
44
45 text_t indexsuffix = "index";
46 indexsuffix = filename_cat (indexsuffix, index);
47 if (indexstem.empty()) {
48 // no index stem, use the coll name
49 indexsuffix = filename_cat (indexsuffix, collection);
50 } else {
51 indexsuffix = filename_cat (indexsuffix, indexstem);
52 }
53 return indexsuffix;
54}
55
56////////////////////
57// mgppsearch class //
58////////////////////
59
60mgppsearchclass::mgppsearchclass ()
61 : searchclass() {
62
63 textlevel = "Doc";
64 indexData = NULL;
65}
66
67mgppsearchclass::~mgppsearchclass ()
68{
69 if (cache != NULL)
70 {
71 delete cache;
72 cache = NULL;
73 }
74
75 if (indexData !=NULL) {
76 indexData->UnloadData();
77 delete indexData;
78 indexData = NULL;
79 }
80
81}
82
83void mgppsearchclass::set_text_level(const text_t &textlevel_arg)
84{
85 textlevel = textlevel_arg;
86}
87
88void mgppsearchclass::set_indexstem(const text_t &stem)
89{
90 indexstem = stem;
91}
92
93
94bool mgppsearchclass::search(const queryparamclass &queryparams,
95 queryresultsclass &queryresult) {
96
97#ifdef __WIN32__
98 char basepath[]="";
99#else
100 char basepath[] = "/";
101#endif
102
103 char *indexname = (filename_cat(collectdir, getindexsuffix(queryparams))).getcstr();
104
105 // load index data
106 if (indexData == NULL) {
107 indexData = new IndexData();
108 }
109 if (!indexData->LoadData (basepath, indexname)) {
110 cerr<<"couldn't load index data\n"<<endl;
111 return false;
112 }
113
114 // set default stem method from values originally set on prefs page
115 int defaultStemMethod = 0;
116 if (queryparams.casefolding) {
117 defaultStemMethod |= STEM_CaseFolding;
118 }
119 if (queryparams.stemming) {
120 defaultStemMethod |= STEM_Stemming;
121 }
122 if (queryparams.accentfolding) {
123 defaultStemMethod |= STEM_AccentFolding;
124 }
125
126 // set default Boolean combiner from all/some setting
127 // if match_mode == 1, ie all, default=1 ie AND
128 // if match_mode == 0, ie some, default=0, ie OR
129 int defaultBoolCombine = 0;
130 if (queryparams.match_mode){
131 defaultBoolCombine = 1;
132 }
133
134 // use default query info settings - change to reflect user preferences??
135 QueryInfo queryInfo;
136
137 SetCStr (queryInfo.docLevel, (queryparams.level.getcstr()));
138 queryInfo.maxDocs = (unsigned long)queryparams.maxdocs;
139 queryInfo.sortByRank = (queryparams.search_type == 1);
140 queryInfo.exactWeights = false;
141 queryInfo.needRankInfo = true; // used for overall term freq as well as ranking
142 queryInfo.needTermFreqs = true;
143
144 ExtQueryResult queryResult;
145
146 UCArray queryArray;
147 // greenstone gives us the query encoded in unicode. We want utf8.
148 char* utf8querystring=to_utf8(queryparams.querystring).getcstr();
149 SetCStr(queryArray, utf8querystring);
150 delete []utf8querystring;
151
152 // create the mgpp query tree
153 QueryNode *queryTree = NULL;
154 queryTree = ParseQuery(queryArray, defaultBoolCombine, defaultStemMethod, queryparams.maxnumeric);
155 if (queryTree == NULL) { // syntax error
156 queryresult.syntax_error = true;
157 return true; // should we return true or false?
158 }
159 UCArray level;
160 UCArrayClear(level);
161
162 //set the level for results
163 SetCStr(level, textlevel.getcstr());
164
165
166 // do the query
167 MGQuery(*indexData, queryInfo, queryTree, queryResult, level);
168
169
170 // convert ExtQueryResult to queryresultclass
171
172 queryresult.docs_matched = (int)queryResult.docs.size();
173
174 if (queryresult.docs_matched == (int)queryResult.actualNumDocs) {
175 queryresult.is_approx = Exact;
176 }
177 else if (queryresult.docs_matched < (int)queryResult.actualNumDocs) {
178 queryresult.is_approx = MoreThan;
179 }
180 else {
181 queryresult.is_approx = Approximate;
182 }
183
184 docresultclass doc;
185 for (int i=0; i<(int)queryResult.docs.size(); ++i) {
186 doc.clear();
187 doc.docnum = (int)queryResult.levels[i];
188 doc.docweight = queryResult.ranks[i];
189 queryresult.docs.docset[doc.docnum] = doc;
190 queryresult.docs.docorder.push_back(doc.docnum);
191
192 }
193
194 // term info
195 termfreqclass term;
196 for (int k=0; k<(int)queryResult.termFreqs.size(); ++k) {
197 term.clear();
198 char* termfreq_cstr=GetCStr(queryResult.termFreqs[k].term);
199 term.termstr = to_uni(termfreq_cstr);
200 delete []termfreq_cstr;
201 term.termstemstr = term.termstr;
202 // we don't set term.utf8equivterms ?? - jrm21
203 term.termfreq = queryResult.termFreqs[k].termFreq;
204 queryresult.terms.push_back(term);
205 queryresult.orgterms.push_back(term); // should this change??
206
207 for (int j=0; j<(int)queryResult.termFreqs[k].equivTerms.size(); ++j) {
208 char* equivterm_cstr=GetCStr(queryResult.termFreqs[k].equivTerms[j]);
209 queryresult.termvariants.insert(to_uni(equivterm_cstr));
210 delete []equivterm_cstr;
211 }
212
213 }
214 // clean up
215 unload_database(); // Important that local library doesn't leave any files open
216 delete []indexname;
217 return true;
218
219}
220
221
222bool mgppsearchclass::browse_search(const queryparamclass &queryparams, int start, int numDocs,
223 queryresultsclass &queryresult) {
224
225#ifdef __WIN32__
226 char basepath[]="";
227#else
228 char basepath[] = "/";
229#endif
230
231 char *indexname = (filename_cat(collectdir, getindexsuffix(queryparams))).getcstr();
232
233 if (indexData == NULL) {
234 indexData = new IndexData();
235 }
236 if (!indexData->LoadData (basepath, indexname)) {
237 cerr<<"couldn't load index data\n"<<endl;
238 return false;
239 }
240
241 UCArray level;
242 UCArrayClear(level);
243
244 //browse always at top level
245 SetCStr(level, "Doc"); // this name may change.
246
247
248 BrowseQueryNode browseNode;
249 browseNode.startPosition = start;
250 browseNode.numTerms = numDocs;
251
252 BrowseQueryResult browseResult;
253
254 UCArrayClear(browseNode.term);
255 // greenstone gives us the query encoded in unicode. We want utf8.
256 char* utf8querystring=to_utf8(queryparams.querystring).getcstr();
257 SetCStr(browseNode.term, utf8querystring);
258 delete []utf8querystring;
259
260 // do the actual query
261 MGBrowseQuery(*indexData, level, browseNode, browseResult);
262
263 // load results into term info
264 termfreqclass term;
265 for (int i=0; i<(int)browseResult.termFreqs.size(); ++i) {
266 term.clear();
267 char* term_cstr = GetCStr(browseResult.termFreqs[i].term);
268 term.termstr = to_uni(term_cstr);
269 delete []term_cstr;
270 term.termstemstr = term.termstr;
271 term.termfreq = browseResult.termFreqs[i].termFreq;
272 queryresult.terms.push_back(term);
273 queryresult.orgterms.push_back(term);
274 }
275
276 // clean up
277 unload_database(); // Important that local library doesn't leave any files open
278 delete []indexname;
279
280 return true;
281}
282
283// the document text for 'docnum' is placed in 'output'
284// docTargetDocument returns 'true' if it was able to
285// try to get a document
286// collection is needed to see if an index from the
287// collection is loaded. THe default index bits are just there cos
288// the mg version needs them
289
290bool mgppsearchclass::docTargetDocument(const text_t &/*defaultindex*/,
291 const text_t &/*defaultsubcollection*/,
292 const text_t &/*defaultlanguage*/,
293 const text_t &collection,
294 int docnum,
295 text_t &output) {
296
297#ifdef __WIN32__
298 char basepath[]="";
299#else
300 char basepath[] = "/";
301#endif
302 char *textname = (filename_cat(collectdir, getindexsuffix(collection, "text"))).getcstr();
303
304 TextData textdata;
305 if(!textdata.LoadData(basepath, textname)) {
306 cout<<"couldn't load text data\n"<<endl;
307 return false;
308 }
309 UCArray doctext;
310 UCArray level;
311 SetCStr(level, textlevel.getcstr());
312 if (!GetDocText(textdata, level, (unsigned long)docnum, doctext)) {
313 cout<<"couldn't retrieve document text\n";
314 return false;
315 }
316
317 // convert UCArray to text_t
318 output.clear();
319 char* doctext_cstr = GetCStr(doctext);
320 output = to_uni(doctext_cstr); // convert from utf-8 to unicode
321 delete []doctext_cstr;
322
323 // here need to remove the <Document>, <Section>, <Paragraph> tags
324
325
326 //clean up
327 textdata.UnloadData ();
328 delete []textname;
329
330 return true;
331
332}
333
334// used to clear any cached databases for persistent versions of
335// Greenstone like the Windows local library
336void mgppsearchclass::unload_database () {
337
338 if (indexData !=NULL) {
339 indexData->UnloadData();
340 }
341}
342
343
344
345
346
Note: See TracBrowser for help on using the repository browser.