source: trunk/gsdl/src/colservr/mgppsearch.cpp@ 9174

Last change on this file since 9174 was 4823, checked in by kjdon, 21 years ago

Changed the default levels to Doc instead of Document. We should really not have any hard-coded defaults, because level names may change.

  • Property svn:keywords set to Author Date Id Revision
File size: 8.6 KB
Line 
1/**********************************************************************
2 *
3 * mgppsearch.cpp --
4 * Copyright (C) 1999-2002 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26
27#include "gsdlconf.h"
28#include "mgppsearch.h"
29#include "fileutil.h"
30#include "GSDLQueryParser.h"
31#include "MGQuery.h"
32#include "TextGet.h"
33#include "queryinfo.h"
34#include "gsdlunicode.h"
35
36
37static text_t getindexsuffix(const queryparamclass &qp) {
38 text_t indexsuffix = "index";
39 text_t ind = qp.index;
40 text_t sub = qp.subcollection;
41 text_t lang = qp.language;
42
43 indexsuffix = filename_cat(indexsuffix, ind + sub + lang, qp.collection);
44 return indexsuffix;
45
46}
47
48////////////////////
49// mgppsearch class //
50////////////////////
51
52mgppsearchclass::mgppsearchclass ()
53 : searchclass() {
54
55 gdbm_level = "Doc";
56 indexData = NULL;
57}
58
59mgppsearchclass::~mgppsearchclass ()
60{
61 if (cache != NULL)
62 {
63 delete cache;
64 cache = NULL;
65 }
66
67 if (indexData !=NULL) {
68 indexData->UnloadData();
69 delete indexData;
70 indexData = NULL;
71 }
72
73}
74
75void mgppsearchclass::set_gdbm_level(const text_t &level) {
76 gdbm_level = level;
77
78}
79
80
81bool mgppsearchclass::search(const queryparamclass &queryparams,
82 queryresultsclass &queryresult) {
83
84#ifdef __WIN32__
85 char basepath[]="";
86#else
87 char basepath[] = "/";
88#endif
89
90 char *indexname = (filename_cat(collectdir, getindexsuffix(queryparams))).getcstr();
91
92 // load index data
93 if (indexData == NULL) {
94 indexData = new IndexData();
95 }
96 if (!indexData->LoadData (basepath, indexname)) {
97 cerr<<"couldn't load index data\n"<<endl;
98 return false;
99 }
100
101 // set default stem method from values originally set on prefs page
102 int defaultStemMethod = 0;
103 if (queryparams.casefolding) {
104 defaultStemMethod |= 1;
105 }
106 if (queryparams.stemming) {
107 defaultStemMethod |= 2;
108 }
109
110 // set default Boolean combiner from all/some setting
111 // if match_mode == 1, ie all, default=1 ie AND
112 // if match_mode == 0, ie some, default=0, ie OR
113 int defaultBoolCombine = 0;
114 if (queryparams.match_mode){
115 defaultBoolCombine = 1;
116 }
117
118 // use default query info settings - change to reflect user preferences??
119 QueryInfo queryInfo;
120
121 SetCStr (queryInfo.docLevel, (queryparams.level.getcstr()));
122 queryInfo.maxDocs = (unsigned long)queryparams.maxdocs;
123 queryInfo.sortByRank = (queryparams.search_type == 1);
124 queryInfo.exactWeights = false;
125 queryInfo.needRankInfo = true; // used for overall term freq as well as ranking
126 queryInfo.needTermFreqs = true;
127
128 ExtQueryResult queryResult;
129
130 UCArray queryArray;
131 // greenstone gives us the query encoded in unicode. We want utf8.
132 char* utf8querystring=to_utf8(queryparams.querystring).getcstr();
133 SetCStr(queryArray, utf8querystring);
134 delete utf8querystring;
135
136 // create the mgpp query tree
137 QueryNode *queryTree = NULL;
138 queryTree = ParseQuery(queryArray, defaultBoolCombine, defaultStemMethod);
139 if (queryTree == NULL) { // syntax error
140 queryresult.syntax_error = true;
141 return true; // should we return true or false?
142 }
143 UCArray level;
144 UCArrayClear(level);
145
146 //set the level for results
147 SetCStr(level, gdbm_level.getcstr());
148
149
150 // do the query
151 MGQuery(*indexData, queryInfo, queryTree, queryResult, level);
152
153
154 // convert ExtQueryResult to queryresultclass
155
156 queryresult.docs_matched = (int)queryResult.docs.size();
157
158 if (queryresult.docs_matched == (int)queryResult.actualNumDocs) {
159 queryresult.is_approx = Exact;
160 }
161 else if (queryresult.docs_matched < (int)queryResult.actualNumDocs) {
162 queryresult.is_approx = MoreThan;
163 }
164 else {
165 queryresult.is_approx = Approximate;
166 }
167
168 docresultclass doc;
169 for (int i=0; i<(int)queryResult.docs.size(); i++) {
170 doc.clear();
171 doc.docnum = (int)queryResult.levels[i];
172 doc.docweight = queryResult.ranks[i];
173 queryresult.docs.docset[doc.docnum] = doc;
174 queryresult.docs.docorder.push_back(doc.docnum);
175
176 }
177
178 // term info
179 termfreqclass term;
180 for (int k=0; k<(int)queryResult.termFreqs.size(); k++) {
181 term.clear();
182 char* termfreq_cstr=GetCStr(queryResult.termFreqs[k].term);
183 term.termstr = to_uni(termfreq_cstr);
184 delete termfreq_cstr;
185 term.termstemstr = term.termstr;
186 // we don't set term.utf8equivterms ?? - jrm21
187 term.termfreq = queryResult.termFreqs[k].termFreq;
188 queryresult.terms.push_back(term);
189 queryresult.orgterms.push_back(term); // should this change??
190
191 for (int j=0; j<(int)queryResult.termFreqs[k].equivTerms.size(); j++) {
192 char* equivterm_cstr=GetCStr(queryResult.termFreqs[k].equivTerms[j]);
193 queryresult.termvariants.insert(to_uni(equivterm_cstr));
194 delete equivterm_cstr;
195 }
196
197 }
198 // clean up
199 delete indexname;
200 return true;
201
202}
203
204
205bool mgppsearchclass::browse_search(const queryparamclass &queryparams, int start, int numDocs,
206 queryresultsclass &queryresult) {
207
208#ifdef __WIN32__
209 char basepath[]="";
210#else
211 char basepath[] = "/";
212#endif
213
214 char *indexname = (filename_cat(collectdir, getindexsuffix(queryparams))).getcstr();
215
216 if (indexData == NULL) {
217 indexData = new IndexData();
218 }
219 if (!indexData->LoadData (basepath, indexname)) {
220 cerr<<"couldn't load index data\n"<<endl;
221 return false;
222 }
223
224 UCArray level;
225 UCArrayClear(level);
226
227 //browse always at top level
228 SetCStr(level, "Doc"); // this name may change.
229
230
231 BrowseQueryNode browseNode;
232 browseNode.startPosition = start;
233 browseNode.numTerms = numDocs;
234
235 BrowseQueryResult browseResult;
236
237 UCArrayClear(browseNode.term);
238 // greenstone gives us the query encoded in unicode. We want utf8.
239 char* utf8querystring=to_utf8(queryparams.querystring).getcstr();
240 SetCStr(browseNode.term, utf8querystring);
241 delete utf8querystring;
242
243 // do the actual query
244 MGBrowseQuery(*indexData, level, browseNode, browseResult);
245
246 // load results into term info
247 termfreqclass term;
248 for (int i=0; i<(int)browseResult.termFreqs.size(); i++) {
249 term.clear();
250 char* term_cstr = GetCStr(browseResult.termFreqs[i].term);
251 term.termstr = to_uni(term_cstr);
252 delete term_cstr;
253 term.termstemstr = term.termstr;
254 term.termfreq = browseResult.termFreqs[i].termFreq;
255 queryresult.terms.push_back(term);
256 queryresult.orgterms.push_back(term);
257
258 }
259 // clean up
260 delete indexname;
261
262 return true;
263}
264
265// the document text for 'docnum' is placed in 'output'
266// docTargetDocument returns 'true' if it was able to
267// try to get a document
268// collection is needed to see if an index from the
269// collection is loaded. THe default index bits are just there cos
270// the mg version needs them
271
272bool mgppsearchclass::docTargetDocument(const text_t &/*defaultindex*/,
273 const text_t &/*defaultsubcollection*/,
274 const text_t &/*defaultlanguage*/,
275 const text_t &collection,
276 int docnum,
277 text_t &output) {
278
279#ifdef __WIN32__
280 char basepath[]="";
281#else
282 char basepath[] = "/";
283#endif
284 char *textname = (filename_cat(collectdir, "index", "text", collection)).getcstr();;
285
286 TextData textdata;
287 if(!textdata.LoadData(basepath, textname)) {
288 cout<<"couldn't load text data\n"<<endl;
289 return false;
290 }
291 UCArray doctext;
292 UCArray level;
293 SetCStr(level, gdbm_level.getcstr());
294 if (!GetDocText(textdata, level, (unsigned long)docnum, doctext)) {
295 cout<<"couldn't retrieve document text\n";
296 return false;
297 }
298
299 // convert UCArray to text_t
300 output.clear();
301 char* doctext_cstr = GetCStr(doctext);
302 output = to_uni(doctext_cstr); // convert from utf-8 to unicode
303 delete doctext_cstr;
304
305 // here need to remove the <Document>, <Section>, <Paragraph> tags
306
307
308 //clean up
309 textdata.UnloadData ();
310 delete textname;
311
312 return true;
313
314}
315
316// used to clear any cached databases for persistent versions of
317// Greenstone like the Windows local library
318void mgppsearchclass::unload_database () {
319
320 if (indexData !=NULL) {
321 indexData->UnloadData();
322 }
323}
324
325
326
327
328
Note: See TracBrowser for help on using the repository browser.