source: trunk/gsdl/src/colservr/mgppsearch.cpp@ 13780

Last change on this file since 13780 was 13780, checked in by mdewsnip, 17 years ago

GLI/LOCAL LIBRARY: To prevent the problems with the GLI being unable to install newly built collections because the local library is holding files open, much more care needs to be taken to close files (typically the GDBM database and the MG/MGPP index files) after use. Fixed a lot of places where files were being left open.

  • Property svn:keywords set to Author Date Id Revision
File size: 9.3 KB
Line 
1/**********************************************************************
2 *
3 * mgppsearch.cpp --
4 * Copyright (C) 1999-2002 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26
27#include "gsdlconf.h"
28#include "mgppsearch.h"
29#include "fileutil.h"
30#include "GSDLQueryParser.h"
31#include "MGQuery.h"
32#include "TextGet.h"
33#include "queryinfo.h"
34#include "gsdlunicode.h"
35
36
37text_t mgppsearchclass::getindexsuffix(const queryparamclass &qp) {
38 return getindexsuffix(qp.collection, qp.index+qp.subcollection+qp.language);
39
40}
41
42text_t mgppsearchclass::getindexsuffix (const text_t &collection,
43 const text_t &index) {
44
45 text_t indexsuffix = "index";
46 indexsuffix = filename_cat (indexsuffix, index);
47 if (indexstem.empty()) {
48 // no index stem, use the coll name
49 indexsuffix = filename_cat (indexsuffix, collection);
50 } else {
51 indexsuffix = filename_cat (indexsuffix, indexstem);
52 }
53 return indexsuffix;
54}
55
56////////////////////
57// mgppsearch class //
58////////////////////
59
60mgppsearchclass::mgppsearchclass ()
61 : searchclass() {
62
63 gdbm_level = "Doc";
64 indexData = NULL;
65}
66
67mgppsearchclass::~mgppsearchclass ()
68{
69 if (cache != NULL)
70 {
71 delete cache;
72 cache = NULL;
73 }
74
75 if (indexData !=NULL) {
76 indexData->UnloadData();
77 delete indexData;
78 indexData = NULL;
79 }
80
81}
82
83void mgppsearchclass::set_gdbm_level(const text_t &level) {
84 gdbm_level = level;
85
86}
87void mgppsearchclass::set_indexstem(const text_t &stem) {
88 indexstem = stem;
89
90}
91
92
93bool mgppsearchclass::search(const queryparamclass &queryparams,
94 queryresultsclass &queryresult) {
95
96#ifdef __WIN32__
97 char basepath[]="";
98#else
99 char basepath[] = "/";
100#endif
101
102 char *indexname = (filename_cat(collectdir, getindexsuffix(queryparams))).getcstr();
103
104 // load index data
105 if (indexData == NULL) {
106 indexData = new IndexData();
107 }
108 if (!indexData->LoadData (basepath, indexname)) {
109 cerr<<"couldn't load index data\n"<<endl;
110 return false;
111 }
112
113 // set default stem method from values originally set on prefs page
114 int defaultStemMethod = 0;
115 if (queryparams.casefolding) {
116 defaultStemMethod |= STEM_CaseFolding;
117 }
118 if (queryparams.stemming) {
119 defaultStemMethod |= STEM_Stemming;
120 }
121 if (queryparams.accentfolding) {
122 defaultStemMethod |= STEM_AccentFolding;
123 }
124
125 // set default Boolean combiner from all/some setting
126 // if match_mode == 1, ie all, default=1 ie AND
127 // if match_mode == 0, ie some, default=0, ie OR
128 int defaultBoolCombine = 0;
129 if (queryparams.match_mode){
130 defaultBoolCombine = 1;
131 }
132
133 // use default query info settings - change to reflect user preferences??
134 QueryInfo queryInfo;
135
136 SetCStr (queryInfo.docLevel, (queryparams.level.getcstr()));
137 queryInfo.maxDocs = (unsigned long)queryparams.maxdocs;
138 queryInfo.sortByRank = (queryparams.search_type == 1);
139 queryInfo.exactWeights = false;
140 queryInfo.needRankInfo = true; // used for overall term freq as well as ranking
141 queryInfo.needTermFreqs = true;
142
143 ExtQueryResult queryResult;
144
145 UCArray queryArray;
146 // greenstone gives us the query encoded in unicode. We want utf8.
147 char* utf8querystring=to_utf8(queryparams.querystring).getcstr();
148 SetCStr(queryArray, utf8querystring);
149 delete []utf8querystring;
150
151 // create the mgpp query tree
152 QueryNode *queryTree = NULL;
153 queryTree = ParseQuery(queryArray, defaultBoolCombine, defaultStemMethod, queryparams.maxnumeric);
154 if (queryTree == NULL) { // syntax error
155 queryresult.syntax_error = true;
156 return true; // should we return true or false?
157 }
158 UCArray level;
159 UCArrayClear(level);
160
161 //set the level for results
162 SetCStr(level, gdbm_level.getcstr());
163
164
165 // do the query
166 MGQuery(*indexData, queryInfo, queryTree, queryResult, level);
167
168
169 // convert ExtQueryResult to queryresultclass
170
171 queryresult.docs_matched = (int)queryResult.docs.size();
172
173 if (queryresult.docs_matched == (int)queryResult.actualNumDocs) {
174 queryresult.is_approx = Exact;
175 }
176 else if (queryresult.docs_matched < (int)queryResult.actualNumDocs) {
177 queryresult.is_approx = MoreThan;
178 }
179 else {
180 queryresult.is_approx = Approximate;
181 }
182
183 docresultclass doc;
184 for (int i=0; i<(int)queryResult.docs.size(); ++i) {
185 doc.clear();
186 doc.docnum = (int)queryResult.levels[i];
187 doc.docweight = queryResult.ranks[i];
188 queryresult.docs.docset[doc.docnum] = doc;
189 queryresult.docs.docorder.push_back(doc.docnum);
190
191 }
192
193 // term info
194 termfreqclass term;
195 for (int k=0; k<(int)queryResult.termFreqs.size(); ++k) {
196 term.clear();
197 char* termfreq_cstr=GetCStr(queryResult.termFreqs[k].term);
198 term.termstr = to_uni(termfreq_cstr);
199 delete []termfreq_cstr;
200 term.termstemstr = term.termstr;
201 // we don't set term.utf8equivterms ?? - jrm21
202 term.termfreq = queryResult.termFreqs[k].termFreq;
203 queryresult.terms.push_back(term);
204 queryresult.orgterms.push_back(term); // should this change??
205
206 for (int j=0; j<(int)queryResult.termFreqs[k].equivTerms.size(); ++j) {
207 char* equivterm_cstr=GetCStr(queryResult.termFreqs[k].equivTerms[j]);
208 queryresult.termvariants.insert(to_uni(equivterm_cstr));
209 delete []equivterm_cstr;
210 }
211
212 }
213 // clean up
214 unload_database(); // Important that local library doesn't leave any files open
215 delete []indexname;
216 return true;
217
218}
219
220
221bool mgppsearchclass::browse_search(const queryparamclass &queryparams, int start, int numDocs,
222 queryresultsclass &queryresult) {
223
224#ifdef __WIN32__
225 char basepath[]="";
226#else
227 char basepath[] = "/";
228#endif
229
230 char *indexname = (filename_cat(collectdir, getindexsuffix(queryparams))).getcstr();
231
232 if (indexData == NULL) {
233 indexData = new IndexData();
234 }
235 if (!indexData->LoadData (basepath, indexname)) {
236 cerr<<"couldn't load index data\n"<<endl;
237 return false;
238 }
239
240 UCArray level;
241 UCArrayClear(level);
242
243 //browse always at top level
244 SetCStr(level, "Doc"); // this name may change.
245
246
247 BrowseQueryNode browseNode;
248 browseNode.startPosition = start;
249 browseNode.numTerms = numDocs;
250
251 BrowseQueryResult browseResult;
252
253 UCArrayClear(browseNode.term);
254 // greenstone gives us the query encoded in unicode. We want utf8.
255 char* utf8querystring=to_utf8(queryparams.querystring).getcstr();
256 SetCStr(browseNode.term, utf8querystring);
257 delete []utf8querystring;
258
259 // do the actual query
260 MGBrowseQuery(*indexData, level, browseNode, browseResult);
261
262 // load results into term info
263 termfreqclass term;
264 for (int i=0; i<(int)browseResult.termFreqs.size(); ++i) {
265 term.clear();
266 char* term_cstr = GetCStr(browseResult.termFreqs[i].term);
267 term.termstr = to_uni(term_cstr);
268 delete []term_cstr;
269 term.termstemstr = term.termstr;
270 term.termfreq = browseResult.termFreqs[i].termFreq;
271 queryresult.terms.push_back(term);
272 queryresult.orgterms.push_back(term);
273 }
274
275 // clean up
276 unload_database(); // Important that local library doesn't leave any files open
277 delete []indexname;
278
279 return true;
280}
281
282// the document text for 'docnum' is placed in 'output'
283// docTargetDocument returns 'true' if it was able to
284// try to get a document
285// collection is needed to see if an index from the
286// collection is loaded. THe default index bits are just there cos
287// the mg version needs them
288
289bool mgppsearchclass::docTargetDocument(const text_t &/*defaultindex*/,
290 const text_t &/*defaultsubcollection*/,
291 const text_t &/*defaultlanguage*/,
292 const text_t &collection,
293 int docnum,
294 text_t &output) {
295
296#ifdef __WIN32__
297 char basepath[]="";
298#else
299 char basepath[] = "/";
300#endif
301 char *textname = (filename_cat(collectdir, getindexsuffix(collection, "text"))).getcstr();
302
303 TextData textdata;
304 if(!textdata.LoadData(basepath, textname)) {
305 cout<<"couldn't load text data\n"<<endl;
306 return false;
307 }
308 UCArray doctext;
309 UCArray level;
310 SetCStr(level, gdbm_level.getcstr());
311 if (!GetDocText(textdata, level, (unsigned long)docnum, doctext)) {
312 cout<<"couldn't retrieve document text\n";
313 return false;
314 }
315
316 // convert UCArray to text_t
317 output.clear();
318 char* doctext_cstr = GetCStr(doctext);
319 output = to_uni(doctext_cstr); // convert from utf-8 to unicode
320 delete []doctext_cstr;
321
322 // here need to remove the <Document>, <Section>, <Paragraph> tags
323
324
325 //clean up
326 textdata.UnloadData ();
327 delete []textname;
328
329 return true;
330
331}
332
333// used to clear any cached databases for persistent versions of
334// Greenstone like the Windows local library
335void mgppsearchclass::unload_database () {
336
337 if (indexData !=NULL) {
338 indexData->UnloadData();
339 }
340}
341
342
343
344
345
Note: See TracBrowser for help on using the repository browser.