source: trunk/gsdl/src/colservr/mgppsearch.cpp@ 4160

Last change on this file since 4160 was 3197, checked in by jrm21, 22 years ago

convert document text from utf-8 to unicode for greenstone

  • Property svn:keywords set to Author Date Id Revision
File size: 8.5 KB
RevLine 
[1324]1/**********************************************************************
2 *
3 * mgppsearch.cpp --
[3150]4 * Copyright (C) 1999-2002 The New Zealand Digital Library Project
[1324]5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26
27#include "gsdlconf.h"
28#include "mgppsearch.h"
29#include "fileutil.h"
30#include "GSDLQueryParser.h"
31#include "MGQuery.h"
32#include "TextGet.h"
33#include "queryinfo.h"
[3150]34#include "gsdlunicode.h"
[1324]35
36
37static text_t getindexsuffix(const queryparamclass &qp) {
38 text_t indexsuffix = "index";
39 text_t ind = qp.index;
40 text_t sub = qp.subcollection;
41 text_t lang = qp.language;
42
43 indexsuffix = filename_cat(indexsuffix, ind + sub + lang, qp.collection);
44 return indexsuffix;
45
46}
47
48////////////////////
49// mgppsearch class //
50////////////////////
51
52mgppsearchclass::mgppsearchclass ()
53 : searchclass() {
[1908]54
55 gdbm_level = "Document";
[3175]56 indexData = NULL;
[1324]57}
58
59mgppsearchclass::~mgppsearchclass ()
60{
61 if (cache != NULL)
62 {
63 delete cache;
64 cache = NULL;
65 }
[2697]66
67 if (indexData !=NULL) {
68 indexData->UnloadData();
69 delete indexData;
70 indexData = NULL;
71 }
72
[1324]73}
74
[1908]75void mgppsearchclass::set_gdbm_level(text_t &level) {
76 gdbm_level = level;
77
78}
79
[1324]80bool mgppsearchclass::search(const queryparamclass &queryparams,
81 queryresultsclass &queryresult) {
[2701]82
83#ifdef __WIN32__
84 char basepath[]="";
85#else
86 char basepath[] = "/";
87#endif
88
[2699]89 char *indexname = (filename_cat(collectdir, getindexsuffix(queryparams))).getcstr();
90
91 // load index data
[2697]92 if (indexData == NULL) {
93 indexData = new IndexData();
94 }
95 if (!indexData->LoadData (basepath, indexname)) {
[1324]96 cerr<<"couldn't load index data\n"<<endl;
97 return false;
98 }
99
100 // set default stem method from values originally set on prefs page
101 int defaultStemMethod = 0;
102 if (queryparams.casefolding) {
103 defaultStemMethod |= 1;
104 }
105 if (queryparams.stemming) {
106 defaultStemMethod |= 2;
107 }
108
[1771]109 // set default Boolean combiner from all/some setting
110 // if match_mode == 1, ie all, default=1 ie AND
111 // if match_mode == 0, ie some, default=0, ie OR
112 int defaultBoolCombine = 0;
113 if (queryparams.match_mode){
114 defaultBoolCombine = 1;
115 }
116
[1324]117 // use default query info settings - change to reflect user preferences??
118 QueryInfo queryInfo;
119 SetCStr (queryInfo.docLevel, (queryparams.level.getcstr()));
120 queryInfo.maxDocs = (unsigned long)queryparams.maxdocs;
[1771]121 queryInfo.sortByRank = (queryparams.search_type == 1);
[1324]122 queryInfo.exactWeights = false;
[1771]123 queryInfo.needRankInfo = true; // used for overall term freq as well as ranking
[1324]124 queryInfo.needTermFreqs = true;
125
126 ExtQueryResult queryResult;
127
128 UCArray queryArray;
[3150]129 // greenstone gives us the query encoded in unicode. We want utf8.
130 char* utf8querystring=to_utf8(queryparams.querystring).getcstr();
131 SetCStr(queryArray, utf8querystring);
132 delete utf8querystring;
133
[1324]134 // create the mgpp query tree
135 QueryNode *queryTree = NULL;
[1771]136 queryTree = ParseQuery(queryArray, defaultBoolCombine, defaultStemMethod);
[1324]137
138 UCArray level;
139 UCArrayClear(level);
140
[1908]141 //set the level for results
142 SetCStr(level, gdbm_level.getcstr());
143
[1324]144
145 // do the query
[2697]146 MGQuery(*indexData, queryInfo, queryTree, queryResult, level);
[1324]147
148
149 // convert ExtQueryResult to queryresultclass
150
151 queryresult.docs_matched = (int)queryResult.docs.size();
[1689]152
153 if (queryresult.docs_matched == (int)queryResult.actualNumDocs) {
154 queryresult.is_approx = Exact;
155 }
156 else if (queryresult.docs_matched < (int)queryResult.actualNumDocs) {
157 queryresult.is_approx = MoreThan;
158 }
159 else {
160 queryresult.is_approx = Approximate;
161 }
162
[1324]163 docresultclass doc;
164 for (int i=0; i<(int)queryResult.docs.size(); i++) {
165 doc.clear();
166 doc.docnum = (int)queryResult.levels[i];
167 doc.docweight = queryResult.ranks[i];
168 queryresult.docs.docset[doc.docnum] = doc;
169 queryresult.docs.docorder.push_back(doc.docnum);
170
171 }
172
173 // term info
174 termfreqclass term;
[2545]175 for (int k=0; k<(int)queryResult.termFreqs.size(); k++) {
[1324]176 term.clear();
[3150]177 char* termfreq_cstr=GetCStr(queryResult.termFreqs[k].term);
178 term.termstr = to_uni(termfreq_cstr);
179 delete termfreq_cstr;
[1324]180 term.termstemstr = term.termstr;
[3150]181 // we don't set term.utf8equivterms ?? - jrm21
[2545]182 term.termfreq = queryResult.termFreqs[k].termFreq;
[1324]183 queryresult.terms.push_back(term);
[1834]184 queryresult.orgterms.push_back(term); // should this change??
185
[2549]186 for (int j=0; j<(int)queryResult.termFreqs[k].equivTerms.size(); j++) {
[3150]187 char* equivterm_cstr=GetCStr(queryResult.termFreqs[k].equivTerms[j]);
188 queryresult.termvariants.insert(to_uni(equivterm_cstr));
189 delete equivterm_cstr;
[1834]190 }
[1324]191
192 }
193 // clean up
194 delete indexname;
195 return true;
196
197}
198
199
200bool mgppsearchclass::browse_search(const queryparamclass &queryparams, int start, int numDocs,
201 queryresultsclass &queryresult) {
202
[2701]203#ifdef __WIN32__
204 char basepath[]="";
205#else
206 char basepath[] = "/";
207#endif
208
[2699]209 char *indexname = (filename_cat(collectdir, getindexsuffix(queryparams))).getcstr();
[1324]210
[2697]211 if (indexData == NULL) {
212 indexData = new IndexData();
213 }
214 if (!indexData->LoadData (basepath, indexname)) {
[1324]215 cerr<<"couldn't load index data\n"<<endl;
216 return false;
217 }
218
219 UCArray level;
220 UCArrayClear(level);
221
222 //browse always at top level
223 SetCStr(level, "Document");
224
225
226 BrowseQueryNode browseNode;
227 browseNode.startPosition = start;
228 browseNode.numTerms = numDocs;
229
230 BrowseQueryResult browseResult;
231
232 UCArrayClear(browseNode.term);
[3150]233 // greenstone gives us the query encoded in unicode. We want utf8.
234 char* utf8querystring=to_utf8(queryparams.querystring).getcstr();
235 SetCStr(browseNode.term, utf8querystring);
236 delete utf8querystring;
[1324]237
238 // do the actual query
[2697]239 MGBrowseQuery(*indexData, level, browseNode, browseResult);
[1324]240
241 // load results into term info
242 termfreqclass term;
243 for (int i=0; i<(int)browseResult.termFreqs.size(); i++) {
244 term.clear();
[3150]245 char* term_cstr = GetCStr(browseResult.termFreqs[i].term);
246 term.termstr = to_uni(term_cstr);
247 delete term_cstr;
[1324]248 term.termstemstr = term.termstr;
249 term.termfreq = browseResult.termFreqs[i].termFreq;
250 queryresult.terms.push_back(term);
251 queryresult.orgterms.push_back(term);
252
253 }
254 // clean up
255 delete indexname;
256
257 return true;
258}
259
260// the document text for 'docnum' is placed in 'output'
261// docTargetDocument returns 'true' if it was able to
262// try to get a document
263// collection is needed to see if an index from the
264// collection is loaded. THe default index bits are just there cos
265// the mg version needs them
266
267bool mgppsearchclass::docTargetDocument(const text_t &/*defaultindex*/,
268 const text_t &/*defaultsubcollection*/,
269 const text_t &/*defaultlanguage*/,
270 const text_t &collection,
271 int docnum,
272 text_t &output) {
273
[2701]274#ifdef __WIN32__
275 char basepath[]="";
276#else
277 char basepath[] = "/";
278#endif
279 char *textname = (filename_cat(collectdir, "index", "text", collection)).getcstr();;
[2699]280
[1324]281 TextData textdata;
[2699]282 if(!textdata.LoadData(basepath, textname)) {
283 cout<<"couldn't load text data\n"<<endl;
[1324]284 return false;
285 }
286 UCArray doctext;
287 UCArray level;
[1908]288 SetCStr(level, gdbm_level.getcstr());
[1324]289 if (!GetDocText(textdata, level, (unsigned long)docnum, doctext)) {
[2699]290 cout<<"couldn't retrieve document text\n";
[1324]291 return false;
292 }
293
294 // convert UCArray to text_t
295 output.clear();
[3150]296 char* doctext_cstr = GetCStr(doctext);
[3197]297 output = to_uni(doctext_cstr); // convert from utf-8 to unicode
[3150]298 delete doctext_cstr;
[1324]299
300 // here need to remove the <Document>, <Section>, <Paragraph> tags
301
[3197]302
[1324]303 //clean up
304 textdata.UnloadData ();
[2699]305 delete textname;
[1324]306
307 return true;
308
309}
310
[2679]311// used to clear any cached databases for persistent versions of
312// Greenstone like the Windows local library
313void mgppsearchclass::unload_database () {
[1324]314
[2699]315 if (indexData !=NULL) {
316 indexData->UnloadData();
317 }
[2679]318}
[1324]319
320
321
322
[2679]323
Note: See TracBrowser for help on using the repository browser.