source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/service/AbstractGS2TextSearch.java@ 26344

Last change on this file since 26344 was 26344, checked in by kjdon, 12 years ago

Fiddling with index options (stem, case, accent). The default value can now be set in collectionConfig.xml by adding default=on/off to the indexOption element.

File size: 12.6 KB
Line 
1/*
2 * AbstractGS2TextSearch.java
3 * Copyright (C) 2011 New Zealand Digital Library, http://www.nzdl.org
4 *
5 * This program is free software; you can redistribute it and/or modify it under
6 * the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
17 */
18package org.greenstone.gsdl3.service;
19
20import java.util.ArrayList;
21
22import org.apache.log4j.Logger;
23import org.greenstone.gsdl3.util.GSXML;
24import org.greenstone.gsdl3.util.BasicDocumentDatabase;
25import org.w3c.dom.Document;
26import org.w3c.dom.Element;
27import org.w3c.dom.NodeList;
28
29public abstract class AbstractGS2TextSearch extends AbstractTextSearch
30{
31 protected static final String EQUIV_TERM_ELEM = "equivTerm";
32
33 protected static final String STEM_ATT = "stem";
34 protected static final String NUM_DOCS_MATCH_ATT = "numDocsMatch";
35 protected static final String FREQ_ATT = "freq";
36
37 // Elements used in the config file that are specific to this class
38 protected static final String DEFAULT_INDEX_ELEM = "defaultIndex";
39 protected static final String INDEX_STEM_ELEM = "indexStem";
40 protected static final String INDEX_ELEM = "index";
41 protected static final String DEFAULT_INDEX_SUBCOLLECTION_ELEM = "defaultIndexSubcollection";
42 protected static final String DEFAULT_INDEX_LANGUAGE_ELEM = "defaultIndexLanguage";
43 protected static final String INDEX_SUBCOLLECTION_ELEM = "indexSubcollection";
44 protected static final String INDEX_LANGUAGE_ELEM = "indexLanguage";
45
46 // Some indexing options
47 protected static final String STEMINDEX_OPTION = "stemIndexes";
48 protected static final String MAXNUMERIC_OPTION = "maxnumeric";
49
50 /** the stem used for the index files */
51 protected String index_stem = null;
52
53 // stem indexes available
54 protected boolean does_case = false;
55 protected boolean does_stem = false;
56 protected boolean does_accent = false;
57
58 // default values for stem indexes
59 protected String case_default = BOOLEAN_PARAM_ON;
60 protected String accent_default = BOOLEAN_PARAM_ON;
61 protected String stem_default = BOOLEAN_PARAM_OFF;
62 // maxnumeric -
63 protected int maxnumeric = 4;
64
65 BasicDocumentDatabase gs_doc_db = null;
66
67 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.service.AbstractGS2TextSearch.class.getName());
68
69 /** constructor */
70 public AbstractGS2TextSearch()
71 {
72
73 }
74
75 public void cleanUp()
76 {
77 super.cleanUp();
78 this.gs_doc_db.cleanUp();
79 }
80
81 /** configure this service */
82 public boolean configure(Element info, Element extra_info)
83 {
84 if (!super.configure(info, extra_info))
85 {
86 return false;
87 }
88
89 // find out what kind of database we have
90 Element database_type_elem = (Element) GSXML.getChildByTagName(info, GSXML.DATABASE_TYPE_ELEM);
91 String database_type = null;
92 if (database_type_elem != null)
93 {
94 database_type = database_type_elem.getAttribute(GSXML.NAME_ATT);
95 }
96 if (database_type == null || database_type.equals(""))
97 {
98 database_type = "gdbm"; // the default
99 }
100
101 // the index stem is either the collection name or is specified in the config file
102 Element index_stem_elem = (Element) GSXML.getChildByTagName(info, INDEX_STEM_ELEM);
103 if (index_stem_elem != null)
104 {
105 this.index_stem = index_stem_elem.getAttribute(GSXML.NAME_ATT);
106 }
107 if (this.index_stem == null || this.index_stem.equals(""))
108 {
109 logger.warn("indexStem element not found, stem will default to collection name");
110 this.index_stem = this.cluster_name;
111 }
112
113 // replaces default AbstractSearch version with one tied to database
114 gs_doc_db = new BasicDocumentDatabase(this.doc, database_type, this.site_home, this.cluster_name, this.index_stem);
115 if (!gs_doc_db.isValid())
116 {
117 logger.error("Failed to open Document Database.");
118 return false;
119 }
120 this.gs_doc = gs_doc_db;
121
122 // do we support any of the extended features?
123 does_chunking = true;
124
125 // Get the default index out of <defaultIndex> (buildConfig.xml)
126 Element def = (Element) GSXML.getChildByTagName(info, DEFAULT_INDEX_ELEM);
127 if (def != null)
128 {
129 this.default_index = def.getAttribute(GSXML.SHORTNAME_ATT);
130 } // otherwise will be "", and the first one will be the default
131
132 //get the default indexSubcollection out of <defaultIndexSubcollection> (buildConfig.xml)
133 Element defSub = (Element) GSXML.getChildByTagName(info, DEFAULT_INDEX_SUBCOLLECTION_ELEM);
134 if (defSub != null)
135 {
136 this.default_index_subcollection = defSub.getAttribute(GSXML.SHORTNAME_ATT);
137 }
138
139 //get the default indexLanguage out of <defaultIndexLanguage> (buildConfig.xml)
140 Element defLang = (Element) GSXML.getChildByTagName(info, DEFAULT_INDEX_LANGUAGE_ELEM);
141 if (defLang != null)
142 {
143 this.default_index_language = defLang.getAttribute(GSXML.SHORTNAME_ATT);
144 } //concate defaultIndex + defaultIndexSubcollection + defaultIndexLanguage
145
146 // get index options
147 Element index_option_list = (Element) GSXML.getChildByTagName(info, GSXML.INDEX_OPTION_ELEM + GSXML.LIST_MODIFIER);
148 if (index_option_list != null)
149 {
150 NodeList options = index_option_list.getElementsByTagName(GSXML.INDEX_OPTION_ELEM);
151 for (int i = 0; i < options.getLength(); i++)
152 {
153 Element opt = (Element) options.item(i);
154 String name = opt.getAttribute(GSXML.NAME_ATT);
155 String value = opt.getAttribute(GSXML.VALUE_ATT);
156 if (name.equals(MAXNUMERIC_OPTION))
157 {
158 int maxnum = Integer.parseInt(value);
159 if (4 <= maxnum && maxnum < 512)
160 {
161 maxnumeric = maxnum;
162 }
163 }
164 else if (name.equals(STEMINDEX_OPTION))
165 {
166 int stemindex = Integer.parseInt(value);
167 if ((stemindex & 1) != 0)
168 {
169 does_case = true;
170 }
171 if ((stemindex & 2) != 0)
172 {
173 does_stem = true;
174 }
175 if ((stemindex & 4) != 0)
176 {
177 does_accent = true;
178 }
179 }
180 }
181 }
182
183 // get display info from extra info
184 if (extra_info != null)
185 {
186 Document owner = info.getOwnerDocument();
187 Element config_search = (Element) GSXML.getChildByTagName(extra_info, GSXML.SEARCH_ELEM);
188
189 // work out what the default values for the stemming options are
190 if (does_case || does_accent || does_stem) {
191 // only bother looking for this is we have some of these set
192 NodeList index_options = config_search.getElementsByTagName(GSXML.INDEX_OPTION_ELEM);
193 for (int i = 0; i < index_options.getLength(); i++) {
194 Element ind = (Element) index_options.item(i);
195 String name = ind.getAttribute(GSXML.NAME_ATT);
196 String def_val = ind.getAttribute(GSXML.DEFAULT_ATT);
197
198 if (!def_val.equals("")) {
199 if (name.equals("stem")) {
200 stem_default = (def_val.equals("on")? BOOLEAN_PARAM_ON: BOOLEAN_PARAM_OFF);
201 } else if (name.equals("casefold")) {
202 case_default = (def_val.equals("on")? BOOLEAN_PARAM_ON: BOOLEAN_PARAM_OFF);
203 } else if (name.equals("accentfold")) {
204 accent_default = (def_val.equals("on")? BOOLEAN_PARAM_ON: BOOLEAN_PARAM_OFF);
205 }
206 }
207 }
208 }
209
210 // so far we have index and indexSubcollection specific display elements, and global format elements
211
212 NodeList indexes = info.getElementsByTagName(GSXML.INDEX_ELEM);
213 for (int i = 0; i < indexes.getLength(); i++)
214 {
215 Element ind = (Element) indexes.item(i);
216 String name = ind.getAttribute(GSXML.NAME_ATT);
217 Element node_extra = GSXML.getNamedElement(config_search, GSXML.INDEX_ELEM, GSXML.NAME_ATT, name);
218 if (node_extra == null)
219 {
220 logger.error("haven't found extra info for index named " + name);
221 continue;
222 }
223
224 // get the display elements if any - displayName
225 NodeList display_names = node_extra.getElementsByTagName(GSXML.DISPLAY_TEXT_ELEM);
226 if (display_names != null)
227 {
228 for (int j = 0; j < display_names.getLength(); j++)
229 {
230 Element e = (Element) display_names.item(j);
231 ind.appendChild(owner.importNode(e, true));
232 }
233 }
234 } // for each index
235
236 NodeList indexSubcollections = info.getElementsByTagName(INDEX_SUBCOLLECTION_ELEM); // buildConfig.xml
237
238 for (int i = 0; i < indexSubcollections.getLength(); i++)
239 {
240 Element indexSubcollection = (Element) indexSubcollections.item(i);
241 String name = indexSubcollection.getAttribute(GSXML.NAME_ATT);
242 Element node_extra = GSXML.getNamedElement(config_search, INDEX_SUBCOLLECTION_ELEM, GSXML.NAME_ATT, name); // collectionConfig.xml
243 if (node_extra == null)
244 {
245 logger.error("haven't found extra info for indexSubCollection named " + name);
246 continue;
247 }
248
249 // get the display elements if any - displayName
250 NodeList display_names = node_extra.getElementsByTagName(GSXML.DISPLAY_TEXT_ELEM);
251 if (display_names != null)
252 {
253 for (int j = 0; j < display_names.getLength(); j++)
254 {
255 Element e = (Element) display_names.item(j);
256 indexSubcollection.appendChild(owner.importNode(e, true));
257 }
258 }
259 } // for each indexSubCollection
260 }
261 return true;
262 }
263
264 protected void getIndexData(ArrayList<String> index_ids, ArrayList<String> index_names, String lang)
265 {
266 // the index info -
267 Element index_list = (Element) GSXML.getChildByTagName(this.config_info, INDEX_ELEM + GSXML.LIST_MODIFIER);
268 NodeList indexes = index_list.getElementsByTagName(INDEX_ELEM);
269 int len = indexes.getLength();
270 // now add even if there is only one
271 for (int i = 0; i < len; i++)
272 {
273 Element index = (Element) indexes.item(i);
274 String shortname = index.getAttribute(GSXML.SHORTNAME_ATT);
275 if (shortname.equals(""))
276 {
277 continue;
278 }
279 index_ids.add(shortname);
280 String display_name = GSXML.getDisplayText(index, GSXML.DISPLAY_TEXT_NAME, lang, "en");
281 if (display_name.equals(""))
282 {
283 display_name = index.getAttribute(GSXML.NAME_ATT);
284 if (display_name.equals(""))
285 {
286 display_name = shortname;
287 }
288 }
289 index_names.add(display_name);
290 }
291 }
292
293 protected void getIndexSubcollectionData(ArrayList<String> index_sub_ids, ArrayList<String> index_sub_names, String lang)
294 {
295 // the index info -
296 Element index_sub_list = (Element) GSXML.getChildByTagName(this.config_info, INDEX_SUBCOLLECTION_ELEM + GSXML.LIST_MODIFIER);
297 NodeList index_subs = index_sub_list.getElementsByTagName(INDEX_SUBCOLLECTION_ELEM);
298 int len = index_subs.getLength();
299 // now add even if there is only one
300 for (int i = 0; i < len; i++)
301 {
302 Element indexsub = (Element) index_subs.item(i);
303 String shortname = indexsub.getAttribute(GSXML.SHORTNAME_ATT);
304 if (shortname.equals(""))
305 {
306 continue;
307 }
308 index_sub_ids.add(shortname);
309 String display_name = GSXML.getDisplayText(indexsub, GSXML.DISPLAY_TEXT_NAME, lang, "en");
310 if (display_name.equals(""))
311 {
312 display_name = indexsub.getAttribute(GSXML.NAME_ATT);
313 if (display_name.equals(""))
314 {
315 display_name = shortname;
316 }
317 }
318 index_sub_names.add(display_name);
319 }
320 }
321
322 protected void getIndexLanguageData(ArrayList<String> index_lang_ids, ArrayList<String> index_lang_names, String lang)
323 {
324 // the index info -
325 Element index_lang_list = (Element) GSXML.getChildByTagName(this.config_info, INDEX_LANGUAGE_ELEM + GSXML.LIST_MODIFIER);
326 NodeList index_langs = index_lang_list.getElementsByTagName(INDEX_LANGUAGE_ELEM);
327 int len = index_langs.getLength();
328 // now add even if there is only one
329 for (int i = 0; i < len; i++)
330 {
331 Element indexlang = (Element) index_langs.item(i);
332 String shortname = indexlang.getAttribute(GSXML.SHORTNAME_ATT);
333 if (shortname.equals(""))
334 {
335 continue;
336 }
337 index_lang_ids.add(shortname);
338 String display_name = GSXML.getDisplayText(indexlang, GSXML.DISPLAY_TEXT_NAME, lang, "en");
339 if (display_name.equals(""))
340 {
341 display_name = indexlang.getAttribute(GSXML.NAME_ATT);
342 if (display_name.equals(""))
343 {
344 display_name = shortname;
345 }
346 }
347 index_lang_names.add(display_name);
348 }
349
350 }
351
352 protected void addCustomQueryParams(Element param_list, String lang)
353 {
354 if (this.does_case)
355 {
356 createParameter(CASE_PARAM, param_list, lang, case_default);
357 }
358 if (this.does_stem)
359 {
360 createParameter(STEM_PARAM, param_list, lang, stem_default);
361 }
362 if (this.does_accent)
363 {
364 createParameter(ACCENT_PARAM, param_list, lang, accent_default);
365 }
366 createParameter(MATCH_PARAM, param_list, lang);
367 }
368
369 /** convert indexer internal id to Greenstone oid */
370 protected String internalNum2OID(long docnum)
371 {
372 return this.gs_doc_db.internalNum2OID(docnum);
373 }
374
375 protected String internalNum2OID(String docnum)
376 {
377 return this.gs_doc_db.internalNum2OID(docnum);
378 }
379
380}
Note: See TracBrowser for help on using the repository browser.