source: trunk/gsdl/src/colservr/mgppsearch.cpp@ 12785

Last change on this file since 12785 was 12314, checked in by kjdon, 18 years ago

maxnumeric moved from mgqueryfilterclass to queryfilterclass, because mgpp now uses it too. It's passed in as an arg to ParseQuery

  • Property svn:keywords set to Author Date Id Revision
File size: 9.0 KB
Line 
1/**********************************************************************
2 *
3 * mgppsearch.cpp --
4 * Copyright (C) 1999-2002 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26
27#include "gsdlconf.h"
28#include "mgppsearch.h"
29#include "fileutil.h"
30#include "GSDLQueryParser.h"
31#include "MGQuery.h"
32#include "TextGet.h"
33#include "queryinfo.h"
34#include "gsdlunicode.h"
35
36
37text_t mgppsearchclass::getindexsuffix(const queryparamclass &qp) {
38 return getindexsuffix(qp.collection, qp.index+qp.subcollection+qp.language);
39
40}
41
42text_t mgppsearchclass::getindexsuffix (const text_t &collection,
43 const text_t &index) {
44
45 text_t indexsuffix = "index";
46 indexsuffix = filename_cat (indexsuffix, index);
47 if (indexstem.empty()) {
48 // no index stem, use the coll name
49 indexsuffix = filename_cat (indexsuffix, collection);
50 } else {
51 indexsuffix = filename_cat (indexsuffix, indexstem);
52 }
53 return indexsuffix;
54}
55
56////////////////////
57// mgppsearch class //
58////////////////////
59
60mgppsearchclass::mgppsearchclass ()
61 : searchclass() {
62
63 gdbm_level = "Doc";
64 indexData = NULL;
65}
66
67mgppsearchclass::~mgppsearchclass ()
68{
69 if (cache != NULL)
70 {
71 delete cache;
72 cache = NULL;
73 }
74
75 if (indexData !=NULL) {
76 indexData->UnloadData();
77 delete indexData;
78 indexData = NULL;
79 }
80
81}
82
83void mgppsearchclass::set_gdbm_level(const text_t &level) {
84 gdbm_level = level;
85
86}
87void mgppsearchclass::set_indexstem(const text_t &stem) {
88 indexstem = stem;
89
90}
91
92
93bool mgppsearchclass::search(const queryparamclass &queryparams,
94 queryresultsclass &queryresult) {
95
96#ifdef __WIN32__
97 char basepath[]="";
98#else
99 char basepath[] = "/";
100#endif
101
102 char *indexname = (filename_cat(collectdir, getindexsuffix(queryparams))).getcstr();
103
104 // load index data
105 if (indexData == NULL) {
106 indexData = new IndexData();
107 }
108 if (!indexData->LoadData (basepath, indexname)) {
109 cerr<<"couldn't load index data\n"<<endl;
110 return false;
111 }
112
113 // set default stem method from values originally set on prefs page
114 int defaultStemMethod = 0;
115 if (queryparams.casefolding) {
116 defaultStemMethod |= 1;
117 }
118 if (queryparams.stemming) {
119 defaultStemMethod |= 2;
120 }
121
122 // set default Boolean combiner from all/some setting
123 // if match_mode == 1, ie all, default=1 ie AND
124 // if match_mode == 0, ie some, default=0, ie OR
125 int defaultBoolCombine = 0;
126 if (queryparams.match_mode){
127 defaultBoolCombine = 1;
128 }
129
130 // use default query info settings - change to reflect user preferences??
131 QueryInfo queryInfo;
132
133 SetCStr (queryInfo.docLevel, (queryparams.level.getcstr()));
134 queryInfo.maxDocs = (unsigned long)queryparams.maxdocs;
135 queryInfo.sortByRank = (queryparams.search_type == 1);
136 queryInfo.exactWeights = false;
137 queryInfo.needRankInfo = true; // used for overall term freq as well as ranking
138 queryInfo.needTermFreqs = true;
139
140 ExtQueryResult queryResult;
141
142 UCArray queryArray;
143 // greenstone gives us the query encoded in unicode. We want utf8.
144 char* utf8querystring=to_utf8(queryparams.querystring).getcstr();
145 SetCStr(queryArray, utf8querystring);
146 delete []utf8querystring;
147
148 // create the mgpp query tree
149 QueryNode *queryTree = NULL;
150 queryTree = ParseQuery(queryArray, defaultBoolCombine, defaultStemMethod, queryparams.maxnumeric);
151 if (queryTree == NULL) { // syntax error
152 queryresult.syntax_error = true;
153 return true; // should we return true or false?
154 }
155 UCArray level;
156 UCArrayClear(level);
157
158 //set the level for results
159 SetCStr(level, gdbm_level.getcstr());
160
161
162 // do the query
163 MGQuery(*indexData, queryInfo, queryTree, queryResult, level);
164
165
166 // convert ExtQueryResult to queryresultclass
167
168 queryresult.docs_matched = (int)queryResult.docs.size();
169
170 if (queryresult.docs_matched == (int)queryResult.actualNumDocs) {
171 queryresult.is_approx = Exact;
172 }
173 else if (queryresult.docs_matched < (int)queryResult.actualNumDocs) {
174 queryresult.is_approx = MoreThan;
175 }
176 else {
177 queryresult.is_approx = Approximate;
178 }
179
180 docresultclass doc;
181 for (int i=0; i<(int)queryResult.docs.size(); ++i) {
182 doc.clear();
183 doc.docnum = (int)queryResult.levels[i];
184 doc.docweight = queryResult.ranks[i];
185 queryresult.docs.docset[doc.docnum] = doc;
186 queryresult.docs.docorder.push_back(doc.docnum);
187
188 }
189
190 // term info
191 termfreqclass term;
192 for (int k=0; k<(int)queryResult.termFreqs.size(); ++k) {
193 term.clear();
194 char* termfreq_cstr=GetCStr(queryResult.termFreqs[k].term);
195 term.termstr = to_uni(termfreq_cstr);
196 delete []termfreq_cstr;
197 term.termstemstr = term.termstr;
198 // we don't set term.utf8equivterms ?? - jrm21
199 term.termfreq = queryResult.termFreqs[k].termFreq;
200 queryresult.terms.push_back(term);
201 queryresult.orgterms.push_back(term); // should this change??
202
203 for (int j=0; j<(int)queryResult.termFreqs[k].equivTerms.size(); ++j) {
204 char* equivterm_cstr=GetCStr(queryResult.termFreqs[k].equivTerms[j]);
205 queryresult.termvariants.insert(to_uni(equivterm_cstr));
206 delete []equivterm_cstr;
207 }
208
209 }
210 // clean up
211 delete []indexname;
212 return true;
213
214}
215
216
217bool mgppsearchclass::browse_search(const queryparamclass &queryparams, int start, int numDocs,
218 queryresultsclass &queryresult) {
219
220#ifdef __WIN32__
221 char basepath[]="";
222#else
223 char basepath[] = "/";
224#endif
225
226 char *indexname = (filename_cat(collectdir, getindexsuffix(queryparams))).getcstr();
227
228 if (indexData == NULL) {
229 indexData = new IndexData();
230 }
231 if (!indexData->LoadData (basepath, indexname)) {
232 cerr<<"couldn't load index data\n"<<endl;
233 return false;
234 }
235
236 UCArray level;
237 UCArrayClear(level);
238
239 //browse always at top level
240 SetCStr(level, "Doc"); // this name may change.
241
242
243 BrowseQueryNode browseNode;
244 browseNode.startPosition = start;
245 browseNode.numTerms = numDocs;
246
247 BrowseQueryResult browseResult;
248
249 UCArrayClear(browseNode.term);
250 // greenstone gives us the query encoded in unicode. We want utf8.
251 char* utf8querystring=to_utf8(queryparams.querystring).getcstr();
252 SetCStr(browseNode.term, utf8querystring);
253 delete []utf8querystring;
254
255 // do the actual query
256 MGBrowseQuery(*indexData, level, browseNode, browseResult);
257
258 // load results into term info
259 termfreqclass term;
260 for (int i=0; i<(int)browseResult.termFreqs.size(); ++i) {
261 term.clear();
262 char* term_cstr = GetCStr(browseResult.termFreqs[i].term);
263 term.termstr = to_uni(term_cstr);
264 delete []term_cstr;
265 term.termstemstr = term.termstr;
266 term.termfreq = browseResult.termFreqs[i].termFreq;
267 queryresult.terms.push_back(term);
268 queryresult.orgterms.push_back(term);
269
270 }
271 // clean up
272 delete []indexname;
273
274 return true;
275}
276
277// the document text for 'docnum' is placed in 'output'
278// docTargetDocument returns 'true' if it was able to
279// try to get a document
280// collection is needed to see if an index from the
281// collection is loaded. THe default index bits are just there cos
282// the mg version needs them
283
284bool mgppsearchclass::docTargetDocument(const text_t &/*defaultindex*/,
285 const text_t &/*defaultsubcollection*/,
286 const text_t &/*defaultlanguage*/,
287 const text_t &collection,
288 int docnum,
289 text_t &output) {
290
291#ifdef __WIN32__
292 char basepath[]="";
293#else
294 char basepath[] = "/";
295#endif
296 char *textname = (filename_cat(collectdir, getindexsuffix(collection, "text"))).getcstr();
297
298 TextData textdata;
299 if(!textdata.LoadData(basepath, textname)) {
300 cout<<"couldn't load text data\n"<<endl;
301 return false;
302 }
303 UCArray doctext;
304 UCArray level;
305 SetCStr(level, gdbm_level.getcstr());
306 if (!GetDocText(textdata, level, (unsigned long)docnum, doctext)) {
307 cout<<"couldn't retrieve document text\n";
308 return false;
309 }
310
311 // convert UCArray to text_t
312 output.clear();
313 char* doctext_cstr = GetCStr(doctext);
314 output = to_uni(doctext_cstr); // convert from utf-8 to unicode
315 delete []doctext_cstr;
316
317 // here need to remove the <Document>, <Section>, <Paragraph> tags
318
319
320 //clean up
321 textdata.UnloadData ();
322 delete []textname;
323
324 return true;
325
326}
327
328// used to clear any cached databases for persistent versions of
329// Greenstone like the Windows local library
330void mgppsearchclass::unload_database () {
331
332 if (indexData !=NULL) {
333 indexData->UnloadData();
334 }
335}
336
337
338
339
340
Note: See TracBrowser for help on using the repository browser.