source: trunk/gsdl/src/colservr/mgppsearch.cpp@ 3150

Last change on this file since 3150 was 3150, checked in by jrm21, 22 years ago

Text to/from the receptionist is encoded in unicode, so we convert to/from utf8 for mgpp.

  • Property svn:keywords set to Author Date Id Revision
File size: 8.5 KB
Line 
1/**********************************************************************
2 *
3 * mgppsearch.cpp --
4 * Copyright (C) 1999-2002 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26
27#include "gsdlconf.h"
28#include "mgppsearch.h"
29#include "fileutil.h"
30#include "GSDLQueryParser.h"
31#include "MGQuery.h"
32#include "TextGet.h"
33#include "queryinfo.h"
34#include "gsdlunicode.h"
35
36
37static text_t getindexsuffix(const queryparamclass &qp) {
38 text_t indexsuffix = "index";
39 text_t ind = qp.index;
40 text_t sub = qp.subcollection;
41 text_t lang = qp.language;
42
43 indexsuffix = filename_cat(indexsuffix, ind + sub + lang, qp.collection);
44 return indexsuffix;
45
46}
47
48////////////////////
49// mgppsearch class //
50////////////////////
51
52mgppsearchclass::mgppsearchclass ()
53 : searchclass() {
54
55 gdbm_level = "Document";
56}
57
58mgppsearchclass::~mgppsearchclass ()
59{
60 if (cache != NULL)
61 {
62 delete cache;
63 cache = NULL;
64 }
65
66 if (indexData !=NULL) {
67 indexData->UnloadData();
68 delete indexData;
69 indexData = NULL;
70 }
71
72}
73
74void mgppsearchclass::set_gdbm_level(text_t &level) {
75 gdbm_level = level;
76
77}
78
79bool mgppsearchclass::search(const queryparamclass &queryparams,
80 queryresultsclass &queryresult) {
81
82#ifdef __WIN32__
83 char basepath[]="";
84#else
85 char basepath[] = "/";
86#endif
87
88 char *indexname = (filename_cat(collectdir, getindexsuffix(queryparams))).getcstr();
89
90 // load index data
91 if (indexData == NULL) {
92 indexData = new IndexData();
93 }
94 if (!indexData->LoadData (basepath, indexname)) {
95 cerr<<"couldn't load index data\n"<<endl;
96 return false;
97 }
98
99 // set default stem method from values originally set on prefs page
100 int defaultStemMethod = 0;
101 if (queryparams.casefolding) {
102 defaultStemMethod |= 1;
103 }
104 if (queryparams.stemming) {
105 defaultStemMethod |= 2;
106 }
107
108 // set default Boolean combiner from all/some setting
109 // if match_mode == 1, ie all, default=1 ie AND
110 // if match_mode == 0, ie some, default=0, ie OR
111 int defaultBoolCombine = 0;
112 if (queryparams.match_mode){
113 defaultBoolCombine = 1;
114 }
115
116 // use default query info settings - change to reflect user preferences??
117 QueryInfo queryInfo;
118 SetCStr (queryInfo.docLevel, (queryparams.level.getcstr()));
119 queryInfo.maxDocs = (unsigned long)queryparams.maxdocs;
120 queryInfo.sortByRank = (queryparams.search_type == 1);
121 queryInfo.exactWeights = false;
122 queryInfo.needRankInfo = true; // used for overall term freq as well as ranking
123 queryInfo.needTermFreqs = true;
124
125 ExtQueryResult queryResult;
126
127 UCArray queryArray;
128 // greenstone gives us the query encoded in unicode. We want utf8.
129 char* utf8querystring=to_utf8(queryparams.querystring).getcstr();
130 SetCStr(queryArray, utf8querystring);
131 delete utf8querystring;
132
133 // create the mgpp query tree
134 QueryNode *queryTree = NULL;
135 queryTree = ParseQuery(queryArray, defaultBoolCombine, defaultStemMethod);
136
137 UCArray level;
138 UCArrayClear(level);
139
140 //set the level for results
141 SetCStr(level, gdbm_level.getcstr());
142
143
144 // do the query
145 MGQuery(*indexData, queryInfo, queryTree, queryResult, level);
146
147
148 // convert ExtQueryResult to queryresultclass
149
150 queryresult.docs_matched = (int)queryResult.docs.size();
151
152 if (queryresult.docs_matched == (int)queryResult.actualNumDocs) {
153 queryresult.is_approx = Exact;
154 }
155 else if (queryresult.docs_matched < (int)queryResult.actualNumDocs) {
156 queryresult.is_approx = MoreThan;
157 }
158 else {
159 queryresult.is_approx = Approximate;
160 }
161
162 docresultclass doc;
163 for (int i=0; i<(int)queryResult.docs.size(); i++) {
164 doc.clear();
165 doc.docnum = (int)queryResult.levels[i];
166 doc.docweight = queryResult.ranks[i];
167 queryresult.docs.docset[doc.docnum] = doc;
168 queryresult.docs.docorder.push_back(doc.docnum);
169
170 }
171
172 // term info
173 termfreqclass term;
174 for (int k=0; k<(int)queryResult.termFreqs.size(); k++) {
175 term.clear();
176 char* termfreq_cstr=GetCStr(queryResult.termFreqs[k].term);
177 term.termstr = to_uni(termfreq_cstr);
178 delete termfreq_cstr;
179 term.termstemstr = term.termstr;
180 // we don't set term.utf8equivterms ?? - jrm21
181 term.termfreq = queryResult.termFreqs[k].termFreq;
182 queryresult.terms.push_back(term);
183 queryresult.orgterms.push_back(term); // should this change??
184
185 for (int j=0; j<(int)queryResult.termFreqs[k].equivTerms.size(); j++) {
186 char* equivterm_cstr=GetCStr(queryResult.termFreqs[k].equivTerms[j]);
187 queryresult.termvariants.insert(to_uni(equivterm_cstr));
188 delete equivterm_cstr;
189 }
190
191 }
192 // clean up
193 delete indexname;
194 return true;
195
196}
197
198
199bool mgppsearchclass::browse_search(const queryparamclass &queryparams, int start, int numDocs,
200 queryresultsclass &queryresult) {
201
202#ifdef __WIN32__
203 char basepath[]="";
204#else
205 char basepath[] = "/";
206#endif
207
208 char *indexname = (filename_cat(collectdir, getindexsuffix(queryparams))).getcstr();
209
210 if (indexData == NULL) {
211 indexData = new IndexData();
212 }
213 if (!indexData->LoadData (basepath, indexname)) {
214 cerr<<"couldn't load index data\n"<<endl;
215 return false;
216 }
217
218 UCArray level;
219 UCArrayClear(level);
220
221 //browse always at top level
222 SetCStr(level, "Document");
223
224
225 BrowseQueryNode browseNode;
226 browseNode.startPosition = start;
227 browseNode.numTerms = numDocs;
228
229 BrowseQueryResult browseResult;
230
231 UCArrayClear(browseNode.term);
232 // greenstone gives us the query encoded in unicode. We want utf8.
233 char* utf8querystring=to_utf8(queryparams.querystring).getcstr();
234 SetCStr(browseNode.term, utf8querystring);
235 delete utf8querystring;
236
237 // do the actual query
238 MGBrowseQuery(*indexData, level, browseNode, browseResult);
239
240 // load results into term info
241 termfreqclass term;
242 for (int i=0; i<(int)browseResult.termFreqs.size(); i++) {
243 term.clear();
244 char* term_cstr = GetCStr(browseResult.termFreqs[i].term);
245 term.termstr = to_uni(term_cstr);
246 delete term_cstr;
247 term.termstemstr = term.termstr;
248 term.termfreq = browseResult.termFreqs[i].termFreq;
249 queryresult.terms.push_back(term);
250 queryresult.orgterms.push_back(term);
251
252 }
253 // clean up
254 delete indexname;
255
256 return true;
257}
258
// The document text for 'docnum' is placed in 'output'.
// docTargetDocument returns 'true' if it was able to
// try to get a document.
// 'collection' is needed to see if an index from the
// collection is loaded. The default index arguments are only there
// because the mg version needs them.
265
266bool mgppsearchclass::docTargetDocument(const text_t &/*defaultindex*/,
267 const text_t &/*defaultsubcollection*/,
268 const text_t &/*defaultlanguage*/,
269 const text_t &collection,
270 int docnum,
271 text_t &output) {
272
273#ifdef __WIN32__
274 char basepath[]="";
275#else
276 char basepath[] = "/";
277#endif
278 char *textname = (filename_cat(collectdir, "index", "text", collection)).getcstr();;
279
280 TextData textdata;
281 if(!textdata.LoadData(basepath, textname)) {
282 cout<<"couldn't load text data\n"<<endl;
283 return false;
284 }
285 UCArray doctext;
286 UCArray level;
287 SetCStr(level, gdbm_level.getcstr());
288 if (!GetDocText(textdata, level, (unsigned long)docnum, doctext)) {
289 cout<<"couldn't retrieve document text\n";
290 return false;
291 }
292
293 // convert UCArray to text_t
294 output.clear();
295 char* doctext_cstr = GetCStr(doctext);
296 output = doctext_cstr;
297 delete doctext_cstr;
298
299 // here need to remove the <Document>, <Section>, <Paragraph> tags
300
301 // mg converts to unicode, this may need to be added here???
302
303 //clean up
304 textdata.UnloadData ();
305 delete textname;
306
307 return true;
308
309}
310
311// used to clear any cached databases for persistent versions of
312// Greenstone like the Windows local library
313void mgppsearchclass::unload_database () {
314
315 if (indexData !=NULL) {
316 indexData->UnloadData();
317 }
318}
319
320
321
322
323
Note: See TracBrowser for help on using the repository browser.