source: trunk/gsdl/src/colservr/mgppsearch.cpp@ 3197

Last change on this file since 3197 was 3197, checked in by jrm21, 22 years ago

convert document text from utf-8 to unicode for greenstone

  • Property svn:keywords set to Author Date Id Revision
File size: 8.5 KB
Line 
1/**********************************************************************
2 *
3 * mgppsearch.cpp --
4 * Copyright (C) 1999-2002 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26
27#include "gsdlconf.h"
28#include "mgppsearch.h"
29#include "fileutil.h"
30#include "GSDLQueryParser.h"
31#include "MGQuery.h"
32#include "TextGet.h"
33#include "queryinfo.h"
34#include "gsdlunicode.h"
35
36
37static text_t getindexsuffix(const queryparamclass &qp) {
38 text_t indexsuffix = "index";
39 text_t ind = qp.index;
40 text_t sub = qp.subcollection;
41 text_t lang = qp.language;
42
43 indexsuffix = filename_cat(indexsuffix, ind + sub + lang, qp.collection);
44 return indexsuffix;
45
46}
47
48////////////////////
49// mgppsearch class //
50////////////////////
51
52mgppsearchclass::mgppsearchclass ()
53 : searchclass() {
54
55 gdbm_level = "Document";
56 indexData = NULL;
57}
58
59mgppsearchclass::~mgppsearchclass ()
60{
61 if (cache != NULL)
62 {
63 delete cache;
64 cache = NULL;
65 }
66
67 if (indexData !=NULL) {
68 indexData->UnloadData();
69 delete indexData;
70 indexData = NULL;
71 }
72
73}
74
75void mgppsearchclass::set_gdbm_level(text_t &level) {
76 gdbm_level = level;
77
78}
79
80bool mgppsearchclass::search(const queryparamclass &queryparams,
81 queryresultsclass &queryresult) {
82
83#ifdef __WIN32__
84 char basepath[]="";
85#else
86 char basepath[] = "/";
87#endif
88
89 char *indexname = (filename_cat(collectdir, getindexsuffix(queryparams))).getcstr();
90
91 // load index data
92 if (indexData == NULL) {
93 indexData = new IndexData();
94 }
95 if (!indexData->LoadData (basepath, indexname)) {
96 cerr<<"couldn't load index data\n"<<endl;
97 return false;
98 }
99
100 // set default stem method from values originally set on prefs page
101 int defaultStemMethod = 0;
102 if (queryparams.casefolding) {
103 defaultStemMethod |= 1;
104 }
105 if (queryparams.stemming) {
106 defaultStemMethod |= 2;
107 }
108
109 // set default Boolean combiner from all/some setting
110 // if match_mode == 1, ie all, default=1 ie AND
111 // if match_mode == 0, ie some, default=0, ie OR
112 int defaultBoolCombine = 0;
113 if (queryparams.match_mode){
114 defaultBoolCombine = 1;
115 }
116
117 // use default query info settings - change to reflect user preferences??
118 QueryInfo queryInfo;
119 SetCStr (queryInfo.docLevel, (queryparams.level.getcstr()));
120 queryInfo.maxDocs = (unsigned long)queryparams.maxdocs;
121 queryInfo.sortByRank = (queryparams.search_type == 1);
122 queryInfo.exactWeights = false;
123 queryInfo.needRankInfo = true; // used for overall term freq as well as ranking
124 queryInfo.needTermFreqs = true;
125
126 ExtQueryResult queryResult;
127
128 UCArray queryArray;
129 // greenstone gives us the query encoded in unicode. We want utf8.
130 char* utf8querystring=to_utf8(queryparams.querystring).getcstr();
131 SetCStr(queryArray, utf8querystring);
132 delete utf8querystring;
133
134 // create the mgpp query tree
135 QueryNode *queryTree = NULL;
136 queryTree = ParseQuery(queryArray, defaultBoolCombine, defaultStemMethod);
137
138 UCArray level;
139 UCArrayClear(level);
140
141 //set the level for results
142 SetCStr(level, gdbm_level.getcstr());
143
144
145 // do the query
146 MGQuery(*indexData, queryInfo, queryTree, queryResult, level);
147
148
149 // convert ExtQueryResult to queryresultclass
150
151 queryresult.docs_matched = (int)queryResult.docs.size();
152
153 if (queryresult.docs_matched == (int)queryResult.actualNumDocs) {
154 queryresult.is_approx = Exact;
155 }
156 else if (queryresult.docs_matched < (int)queryResult.actualNumDocs) {
157 queryresult.is_approx = MoreThan;
158 }
159 else {
160 queryresult.is_approx = Approximate;
161 }
162
163 docresultclass doc;
164 for (int i=0; i<(int)queryResult.docs.size(); i++) {
165 doc.clear();
166 doc.docnum = (int)queryResult.levels[i];
167 doc.docweight = queryResult.ranks[i];
168 queryresult.docs.docset[doc.docnum] = doc;
169 queryresult.docs.docorder.push_back(doc.docnum);
170
171 }
172
173 // term info
174 termfreqclass term;
175 for (int k=0; k<(int)queryResult.termFreqs.size(); k++) {
176 term.clear();
177 char* termfreq_cstr=GetCStr(queryResult.termFreqs[k].term);
178 term.termstr = to_uni(termfreq_cstr);
179 delete termfreq_cstr;
180 term.termstemstr = term.termstr;
181 // we don't set term.utf8equivterms ?? - jrm21
182 term.termfreq = queryResult.termFreqs[k].termFreq;
183 queryresult.terms.push_back(term);
184 queryresult.orgterms.push_back(term); // should this change??
185
186 for (int j=0; j<(int)queryResult.termFreqs[k].equivTerms.size(); j++) {
187 char* equivterm_cstr=GetCStr(queryResult.termFreqs[k].equivTerms[j]);
188 queryresult.termvariants.insert(to_uni(equivterm_cstr));
189 delete equivterm_cstr;
190 }
191
192 }
193 // clean up
194 delete indexname;
195 return true;
196
197}
198
199
200bool mgppsearchclass::browse_search(const queryparamclass &queryparams, int start, int numDocs,
201 queryresultsclass &queryresult) {
202
203#ifdef __WIN32__
204 char basepath[]="";
205#else
206 char basepath[] = "/";
207#endif
208
209 char *indexname = (filename_cat(collectdir, getindexsuffix(queryparams))).getcstr();
210
211 if (indexData == NULL) {
212 indexData = new IndexData();
213 }
214 if (!indexData->LoadData (basepath, indexname)) {
215 cerr<<"couldn't load index data\n"<<endl;
216 return false;
217 }
218
219 UCArray level;
220 UCArrayClear(level);
221
222 //browse always at top level
223 SetCStr(level, "Document");
224
225
226 BrowseQueryNode browseNode;
227 browseNode.startPosition = start;
228 browseNode.numTerms = numDocs;
229
230 BrowseQueryResult browseResult;
231
232 UCArrayClear(browseNode.term);
233 // greenstone gives us the query encoded in unicode. We want utf8.
234 char* utf8querystring=to_utf8(queryparams.querystring).getcstr();
235 SetCStr(browseNode.term, utf8querystring);
236 delete utf8querystring;
237
238 // do the actual query
239 MGBrowseQuery(*indexData, level, browseNode, browseResult);
240
241 // load results into term info
242 termfreqclass term;
243 for (int i=0; i<(int)browseResult.termFreqs.size(); i++) {
244 term.clear();
245 char* term_cstr = GetCStr(browseResult.termFreqs[i].term);
246 term.termstr = to_uni(term_cstr);
247 delete term_cstr;
248 term.termstemstr = term.termstr;
249 term.termfreq = browseResult.termFreqs[i].termFreq;
250 queryresult.terms.push_back(term);
251 queryresult.orgterms.push_back(term);
252
253 }
254 // clean up
255 delete indexname;
256
257 return true;
258}
259
260// the document text for 'docnum' is placed in 'output'
261// docTargetDocument returns 'true' if it was able to
262// try to get a document
263// collection is needed to see if an index from the
264// collection is loaded. THe default index bits are just there cos
265// the mg version needs them
266
267bool mgppsearchclass::docTargetDocument(const text_t &/*defaultindex*/,
268 const text_t &/*defaultsubcollection*/,
269 const text_t &/*defaultlanguage*/,
270 const text_t &collection,
271 int docnum,
272 text_t &output) {
273
274#ifdef __WIN32__
275 char basepath[]="";
276#else
277 char basepath[] = "/";
278#endif
279 char *textname = (filename_cat(collectdir, "index", "text", collection)).getcstr();;
280
281 TextData textdata;
282 if(!textdata.LoadData(basepath, textname)) {
283 cout<<"couldn't load text data\n"<<endl;
284 return false;
285 }
286 UCArray doctext;
287 UCArray level;
288 SetCStr(level, gdbm_level.getcstr());
289 if (!GetDocText(textdata, level, (unsigned long)docnum, doctext)) {
290 cout<<"couldn't retrieve document text\n";
291 return false;
292 }
293
294 // convert UCArray to text_t
295 output.clear();
296 char* doctext_cstr = GetCStr(doctext);
297 output = to_uni(doctext_cstr); // convert from utf-8 to unicode
298 delete doctext_cstr;
299
300 // here need to remove the <Document>, <Section>, <Paragraph> tags
301
302
303 //clean up
304 textdata.UnloadData ();
305 delete textname;
306
307 return true;
308
309}
310
311// used to clear any cached databases for persistent versions of
312// Greenstone like the Windows local library
313void mgppsearchclass::unload_database () {
314
315 if (indexData !=NULL) {
316 indexData->UnloadData();
317 }
318}
319
320
321
322
323
Note: See TracBrowser for help on using the repository browser.