source: trunk/gsdl/src/colservr/mgppsearch.cpp@ 12869

Last change on this file since 12869 was 12869, checked in by kjdon, 18 years ago

Accent Folding patch, thanks to Juan Grigera. defaultStemMethod uses constants, and includes accent folding

  • Property svn:keywords set to Author Date Id Revision
File size: 9.1 KB
Line 
1/**********************************************************************
2 *
3 * mgppsearch.cpp --
4 * Copyright (C) 1999-2002 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26
27#include "gsdlconf.h"
28#include "mgppsearch.h"
29#include "fileutil.h"
30#include "GSDLQueryParser.h"
31#include "MGQuery.h"
32#include "TextGet.h"
33#include "queryinfo.h"
34#include "gsdlunicode.h"
35
36
37text_t mgppsearchclass::getindexsuffix(const queryparamclass &qp) {
38 return getindexsuffix(qp.collection, qp.index+qp.subcollection+qp.language);
39
40}
41
42text_t mgppsearchclass::getindexsuffix (const text_t &collection,
43 const text_t &index) {
44
45 text_t indexsuffix = "index";
46 indexsuffix = filename_cat (indexsuffix, index);
47 if (indexstem.empty()) {
48 // no index stem, use the coll name
49 indexsuffix = filename_cat (indexsuffix, collection);
50 } else {
51 indexsuffix = filename_cat (indexsuffix, indexstem);
52 }
53 return indexsuffix;
54}
55
56////////////////////
57// mgppsearch class //
58////////////////////
59
60mgppsearchclass::mgppsearchclass ()
61 : searchclass() {
62
63 gdbm_level = "Doc";
64 indexData = NULL;
65}
66
67mgppsearchclass::~mgppsearchclass ()
68{
69 if (cache != NULL)
70 {
71 delete cache;
72 cache = NULL;
73 }
74
75 if (indexData !=NULL) {
76 indexData->UnloadData();
77 delete indexData;
78 indexData = NULL;
79 }
80
81}
82
83void mgppsearchclass::set_gdbm_level(const text_t &level) {
84 gdbm_level = level;
85
86}
87void mgppsearchclass::set_indexstem(const text_t &stem) {
88 indexstem = stem;
89
90}
91
92
93bool mgppsearchclass::search(const queryparamclass &queryparams,
94 queryresultsclass &queryresult) {
95
96#ifdef __WIN32__
97 char basepath[]="";
98#else
99 char basepath[] = "/";
100#endif
101
102 char *indexname = (filename_cat(collectdir, getindexsuffix(queryparams))).getcstr();
103
104 // load index data
105 if (indexData == NULL) {
106 indexData = new IndexData();
107 }
108 if (!indexData->LoadData (basepath, indexname)) {
109 cerr<<"couldn't load index data\n"<<endl;
110 return false;
111 }
112
113 // set default stem method from values originally set on prefs page
114 int defaultStemMethod = 0;
115 if (queryparams.casefolding) {
116 defaultStemMethod |= STEM_CaseFolding;
117 }
118 if (queryparams.stemming) {
119 defaultStemMethod |= STEM_Stemming;
120 }
121 if (queryparams.accentfolding) {
122 defaultStemMethod |= STEM_AccentFolding;
123 }
124
125 // set default Boolean combiner from all/some setting
126 // if match_mode == 1, ie all, default=1 ie AND
127 // if match_mode == 0, ie some, default=0, ie OR
128 int defaultBoolCombine = 0;
129 if (queryparams.match_mode){
130 defaultBoolCombine = 1;
131 }
132
133 // use default query info settings - change to reflect user preferences??
134 QueryInfo queryInfo;
135
136 SetCStr (queryInfo.docLevel, (queryparams.level.getcstr()));
137 queryInfo.maxDocs = (unsigned long)queryparams.maxdocs;
138 queryInfo.sortByRank = (queryparams.search_type == 1);
139 queryInfo.exactWeights = false;
140 queryInfo.needRankInfo = true; // used for overall term freq as well as ranking
141 queryInfo.needTermFreqs = true;
142
143 ExtQueryResult queryResult;
144
145 UCArray queryArray;
146 // greenstone gives us the query encoded in unicode. We want utf8.
147 char* utf8querystring=to_utf8(queryparams.querystring).getcstr();
148 SetCStr(queryArray, utf8querystring);
149 delete []utf8querystring;
150
151 // create the mgpp query tree
152 QueryNode *queryTree = NULL;
153 queryTree = ParseQuery(queryArray, defaultBoolCombine, defaultStemMethod, queryparams.maxnumeric);
154 if (queryTree == NULL) { // syntax error
155 queryresult.syntax_error = true;
156 return true; // should we return true or false?
157 }
158 UCArray level;
159 UCArrayClear(level);
160
161 //set the level for results
162 SetCStr(level, gdbm_level.getcstr());
163
164
165 // do the query
166 MGQuery(*indexData, queryInfo, queryTree, queryResult, level);
167
168
169 // convert ExtQueryResult to queryresultclass
170
171 queryresult.docs_matched = (int)queryResult.docs.size();
172
173 if (queryresult.docs_matched == (int)queryResult.actualNumDocs) {
174 queryresult.is_approx = Exact;
175 }
176 else if (queryresult.docs_matched < (int)queryResult.actualNumDocs) {
177 queryresult.is_approx = MoreThan;
178 }
179 else {
180 queryresult.is_approx = Approximate;
181 }
182
183 docresultclass doc;
184 for (int i=0; i<(int)queryResult.docs.size(); ++i) {
185 doc.clear();
186 doc.docnum = (int)queryResult.levels[i];
187 doc.docweight = queryResult.ranks[i];
188 queryresult.docs.docset[doc.docnum] = doc;
189 queryresult.docs.docorder.push_back(doc.docnum);
190
191 }
192
193 // term info
194 termfreqclass term;
195 for (int k=0; k<(int)queryResult.termFreqs.size(); ++k) {
196 term.clear();
197 char* termfreq_cstr=GetCStr(queryResult.termFreqs[k].term);
198 term.termstr = to_uni(termfreq_cstr);
199 delete []termfreq_cstr;
200 term.termstemstr = term.termstr;
201 // we don't set term.utf8equivterms ?? - jrm21
202 term.termfreq = queryResult.termFreqs[k].termFreq;
203 queryresult.terms.push_back(term);
204 queryresult.orgterms.push_back(term); // should this change??
205
206 for (int j=0; j<(int)queryResult.termFreqs[k].equivTerms.size(); ++j) {
207 char* equivterm_cstr=GetCStr(queryResult.termFreqs[k].equivTerms[j]);
208 queryresult.termvariants.insert(to_uni(equivterm_cstr));
209 delete []equivterm_cstr;
210 }
211
212 }
213 // clean up
214 delete []indexname;
215 return true;
216
217}
218
219
220bool mgppsearchclass::browse_search(const queryparamclass &queryparams, int start, int numDocs,
221 queryresultsclass &queryresult) {
222
223#ifdef __WIN32__
224 char basepath[]="";
225#else
226 char basepath[] = "/";
227#endif
228
229 char *indexname = (filename_cat(collectdir, getindexsuffix(queryparams))).getcstr();
230
231 if (indexData == NULL) {
232 indexData = new IndexData();
233 }
234 if (!indexData->LoadData (basepath, indexname)) {
235 cerr<<"couldn't load index data\n"<<endl;
236 return false;
237 }
238
239 UCArray level;
240 UCArrayClear(level);
241
242 //browse always at top level
243 SetCStr(level, "Doc"); // this name may change.
244
245
246 BrowseQueryNode browseNode;
247 browseNode.startPosition = start;
248 browseNode.numTerms = numDocs;
249
250 BrowseQueryResult browseResult;
251
252 UCArrayClear(browseNode.term);
253 // greenstone gives us the query encoded in unicode. We want utf8.
254 char* utf8querystring=to_utf8(queryparams.querystring).getcstr();
255 SetCStr(browseNode.term, utf8querystring);
256 delete []utf8querystring;
257
258 // do the actual query
259 MGBrowseQuery(*indexData, level, browseNode, browseResult);
260
261 // load results into term info
262 termfreqclass term;
263 for (int i=0; i<(int)browseResult.termFreqs.size(); ++i) {
264 term.clear();
265 char* term_cstr = GetCStr(browseResult.termFreqs[i].term);
266 term.termstr = to_uni(term_cstr);
267 delete []term_cstr;
268 term.termstemstr = term.termstr;
269 term.termfreq = browseResult.termFreqs[i].termFreq;
270 queryresult.terms.push_back(term);
271 queryresult.orgterms.push_back(term);
272
273 }
274 // clean up
275 delete []indexname;
276
277 return true;
278}
279
280// the document text for 'docnum' is placed in 'output'
281// docTargetDocument returns 'true' if it was able to
282// try to get a document
283// collection is needed to see if an index from the
284// collection is loaded. THe default index bits are just there cos
285// the mg version needs them
286
287bool mgppsearchclass::docTargetDocument(const text_t &/*defaultindex*/,
288 const text_t &/*defaultsubcollection*/,
289 const text_t &/*defaultlanguage*/,
290 const text_t &collection,
291 int docnum,
292 text_t &output) {
293
294#ifdef __WIN32__
295 char basepath[]="";
296#else
297 char basepath[] = "/";
298#endif
299 char *textname = (filename_cat(collectdir, getindexsuffix(collection, "text"))).getcstr();
300
301 TextData textdata;
302 if(!textdata.LoadData(basepath, textname)) {
303 cout<<"couldn't load text data\n"<<endl;
304 return false;
305 }
306 UCArray doctext;
307 UCArray level;
308 SetCStr(level, gdbm_level.getcstr());
309 if (!GetDocText(textdata, level, (unsigned long)docnum, doctext)) {
310 cout<<"couldn't retrieve document text\n";
311 return false;
312 }
313
314 // convert UCArray to text_t
315 output.clear();
316 char* doctext_cstr = GetCStr(doctext);
317 output = to_uni(doctext_cstr); // convert from utf-8 to unicode
318 delete []doctext_cstr;
319
320 // here need to remove the <Document>, <Section>, <Paragraph> tags
321
322
323 //clean up
324 textdata.UnloadData ();
325 delete []textname;
326
327 return true;
328
329}
330
331// used to clear any cached databases for persistent versions of
332// Greenstone like the Windows local library
333void mgppsearchclass::unload_database () {
334
335 if (indexData !=NULL) {
336 indexData->UnloadData();
337 }
338}
339
340
341
342
343
Note: See TracBrowser for help on using the repository browser.