source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/service/AbstractGS2TextSearch.java@ 25128

Last change on this file since 25128 was 24857, checked in by sjm84, 12 years ago

Reformatting this file ahead of some changes

File size: 10.9 KB
Line 
1/*
2 * AbstractGS2TextSearch.java
3 * Copyright (C) 2011 New Zealand Digital Library, http://www.nzdl.org
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
17 */
18package org.greenstone.gsdl3.service;
19
20// Greenstone classes
21import org.greenstone.gsdl3.util.OID;
22import org.greenstone.gsdl3.util.DBInfo;
23import org.greenstone.gsdl3.util.GSXML;
24import org.greenstone.gsdl3.util.SimpleDocumentDatabase;
25import org.greenstone.gsdl3.util.GSFile;
26
27// XML classes
28import org.w3c.dom.Document;
29import org.w3c.dom.Element;
30import org.w3c.dom.NodeList;
31
32// java
33import java.util.Vector;
34import java.util.ArrayList;
35import java.util.HashMap;
36import java.util.Map;
37import java.util.Set;
38import java.util.Iterator;
39import java.io.File;
40
41import org.apache.log4j.*;
42
43public abstract class AbstractGS2TextSearch extends AbstractTextSearch
44{
45
46 protected static final String EQUIV_TERM_ELEM = "equivTerm";
47
48 protected static final String STEM_ATT = "stem";
49 protected static final String NUM_DOCS_MATCH_ATT = "numDocsMatch";
50 protected static final String FREQ_ATT = "freq";
51
52 // Elements used in the config file that are specific to this class
53 protected static final String DEFAULT_INDEX_ELEM = "defaultIndex";
54 protected static final String INDEX_STEM_ELEM = "indexStem";
55 protected static final String INDEX_ELEM = "index";
56 protected static final String DEFAULT_INDEX_SUBCOLLECTION_ELEM = "defaultIndexSubcollection";
57 protected static final String DEFAULT_INDEX_LANGUAGE_ELEM = "defaultIndexLanguage";
58 protected static final String INDEX_SUBCOLLECTION_ELEM = "indexSubcollection";
59 protected static final String INDEX_LANGUAGE_ELEM = "indexLanguage";
60
61 // Some indexing options
62 protected static final String STEMINDEX_OPTION = "stemIndexes";
63 protected static final String MAXNUMERIC_OPTION = "maxnumeric";
64
65 /** the stem used for the index files */
66 protected String index_stem = null;
67
68 // stem indexes available
69 protected boolean does_case = true;
70 protected boolean does_stem = true;
71 protected boolean does_accent = false;
72
73 // maxnumeric -
74 protected int maxnumeric = 4;
75
76 SimpleDocumentDatabase gs_doc_db = null;
77
78 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.service.AbstractGS2TextSearch.class.getName());
79
80 /** constructor */
81 public AbstractGS2TextSearch()
82 {
83
84 }
85
86 public void cleanUp()
87 {
88 super.cleanUp();
89 this.gs_doc_db.cleanUp();
90 }
91
92 /** configure this service */
93 public boolean configure(Element info, Element extra_info)
94 {
95 if (!super.configure(info, extra_info))
96 {
97 return false;
98 }
99
100 // find out what kind of database we have
101 Element database_type_elem = (Element) GSXML.getChildByTagName(info, GSXML.DATABASE_TYPE_ELEM);
102 String database_type = null;
103 if (database_type_elem != null)
104 {
105 database_type = database_type_elem.getAttribute(GSXML.NAME_ATT);
106 }
107 if (database_type == null || database_type.equals(""))
108 {
109 database_type = "gdbm"; // the default
110 }
111
112 // the index stem is either the collection name or is specified in the config file
113 Element index_stem_elem = (Element) GSXML.getChildByTagName(info, INDEX_STEM_ELEM);
114 if (index_stem_elem != null)
115 {
116 this.index_stem = index_stem_elem.getAttribute(GSXML.NAME_ATT);
117 }
118 if (this.index_stem == null || this.index_stem.equals(""))
119 {
120 logger.warn("indexStem element not found, stem will default to collection name");
121 this.index_stem = this.cluster_name;
122 }
123
124 // replaces default AbstractSearch version with one tied to database
125 gs_doc_db = new SimpleDocumentDatabase(this.doc, database_type, this.site_home, this.cluster_name, this.index_stem);
126 if (!gs_doc_db.isValid())
127 {
128 logger.error("Failed to open Document Database.");
129 return false;
130 }
131 this.gs_doc = gs_doc_db;
132
133 // do we support any of the extended features?
134 does_chunking = true;
135
136 // Get the default index out of <defaultIndex> (buildConfig.xml)
137 Element def = (Element) GSXML.getChildByTagName(info, DEFAULT_INDEX_ELEM);
138 if (def != null)
139 {
140 this.default_index = def.getAttribute(GSXML.SHORTNAME_ATT);
141 } // otherwise will be "", and the first one will be the default
142
143 //get the default indexSubcollection out of <defaultIndexSubcollection> (buildConfig.xml)
144 Element defSub = (Element) GSXML.getChildByTagName(info, DEFAULT_INDEX_SUBCOLLECTION_ELEM);
145 if (defSub != null)
146 {
147 this.default_index_subcollection = defSub.getAttribute(GSXML.SHORTNAME_ATT);
148 }
149
150 //get the default indexLanguage out of <defaultIndexLanguage> (buildConfig.xml)
151 Element defLang = (Element) GSXML.getChildByTagName(info, DEFAULT_INDEX_LANGUAGE_ELEM);
152 if (defLang != null)
153 {
154 this.default_index_language = defLang.getAttribute(GSXML.SHORTNAME_ATT);
155 } //concate defaultIndex + defaultIndexSubcollection + defaultIndexLanguage
156
157 // get index options
158 Element index_option_list = (Element) GSXML.getChildByTagName(info, GSXML.INDEX_OPTION_ELEM + GSXML.LIST_MODIFIER);
159 if (index_option_list != null)
160 {
161 NodeList options = index_option_list.getElementsByTagName(GSXML.INDEX_OPTION_ELEM);
162 for (int i = 0; i < options.getLength(); i++)
163 {
164 Element opt = (Element) options.item(i);
165 String name = opt.getAttribute(GSXML.NAME_ATT);
166 String value = opt.getAttribute(GSXML.VALUE_ATT);
167 if (name.equals(MAXNUMERIC_OPTION))
168 {
169 int maxnum = Integer.parseInt(value);
170 if (4 <= maxnum && maxnum < 512)
171 {
172 maxnumeric = maxnum;
173 }
174 }
175 else if (name.equals(STEMINDEX_OPTION))
176 {
177 int stemindex = Integer.parseInt(value);
178 // stem and case are true by default, accent folding false by default
179 if ((stemindex & 1) == 0)
180 {
181 does_case = false;
182 }
183 if ((stemindex & 2) == 0)
184 {
185 does_stem = false;
186 }
187 if ((stemindex & 4) != 0)
188 {
189 does_accent = true;
190 }
191 }
192 }
193 }
194
195 // get display info from extra info
196 if (extra_info != null)
197 {
198 Document owner = info.getOwnerDocument();
199 // so far we have index specific display elements, and global format elements
200 NodeList indexes = info.getElementsByTagName(GSXML.INDEX_ELEM);
201 Element config_search = (Element) GSXML.getChildByTagName(extra_info, GSXML.SEARCH_ELEM);
202
203 for (int i = 0; i < indexes.getLength(); i++)
204 {
205 Element ind = (Element) indexes.item(i);
206 String name = ind.getAttribute(GSXML.NAME_ATT);
207 Element node_extra = GSXML.getNamedElement(config_search, GSXML.INDEX_ELEM, GSXML.NAME_ATT, name);
208 if (node_extra == null)
209 {
210 logger.error("haven't found extra info for index named " + name);
211 continue;
212 }
213
214 // get the display elements if any - displayName
215 NodeList display_names = node_extra.getElementsByTagName(GSXML.DISPLAY_TEXT_ELEM);
216 if (display_names != null)
217 {
218 for (int j = 0; j < display_names.getLength(); j++)
219 {
220 Element e = (Element) display_names.item(j);
221 ind.appendChild(owner.importNode(e, true));
222 }
223 }
224 } // for each index
225 }
226 return true;
227 }
228
229 protected void getIndexData(ArrayList index_ids, ArrayList index_names, String lang)
230 {
231 // the index info -
232 Element index_list = (Element) GSXML.getChildByTagName(this.config_info, INDEX_ELEM + GSXML.LIST_MODIFIER);
233 NodeList indexes = index_list.getElementsByTagName(INDEX_ELEM);
234 int len = indexes.getLength();
235 // now add even if there is only one
236 for (int i = 0; i < len; i++)
237 {
238 Element index = (Element) indexes.item(i);
239 String shortname = index.getAttribute(GSXML.SHORTNAME_ATT);
240 if (shortname.equals(""))
241 {
242 continue;
243 }
244 index_ids.add(shortname);
245 String display_name = GSXML.getDisplayText(index, GSXML.DISPLAY_TEXT_NAME, lang, "en");
246 if (display_name.equals(""))
247 {
248 display_name = index.getAttribute(GSXML.NAME_ATT);
249 if (display_name.equals(""))
250 {
251 display_name = shortname;
252 }
253 }
254 index_names.add(display_name);
255 }
256 }
257
258 protected void getIndexSubcollectionData(ArrayList index_sub_ids, ArrayList index_sub_names, String lang)
259 {
260 // the index info -
261 Element index_sub_list = (Element) GSXML.getChildByTagName(this.config_info, INDEX_SUBCOLLECTION_ELEM + GSXML.LIST_MODIFIER);
262 NodeList index_subs = index_sub_list.getElementsByTagName(INDEX_SUBCOLLECTION_ELEM);
263 int len = index_subs.getLength();
264 // now add even if there is only one
265 for (int i = 0; i < len; i++)
266 {
267 Element indexsub = (Element) index_subs.item(i);
268 String shortname = indexsub.getAttribute(GSXML.SHORTNAME_ATT);
269 if (shortname.equals(""))
270 {
271 continue;
272 }
273 index_sub_ids.add(shortname);
274 String display_name = GSXML.getDisplayText(indexsub, GSXML.DISPLAY_TEXT_NAME, lang, "en");
275 if (display_name.equals(""))
276 {
277 display_name = indexsub.getAttribute(GSXML.NAME_ATT);
278 if (display_name.equals(""))
279 {
280 display_name = shortname;
281 }
282 }
283 index_sub_names.add(display_name);
284 }
285 }
286
287 protected void getIndexLanguageData(ArrayList index_lang_ids, ArrayList index_lang_names, String lang)
288 {
289 // the index info -
290 Element index_lang_list = (Element) GSXML.getChildByTagName(this.config_info, INDEX_LANGUAGE_ELEM + GSXML.LIST_MODIFIER);
291 NodeList index_langs = index_lang_list.getElementsByTagName(INDEX_LANGUAGE_ELEM);
292 int len = index_langs.getLength();
293 // now add even if there is only one
294 for (int i = 0; i < len; i++)
295 {
296 Element indexlang = (Element) index_langs.item(i);
297 String shortname = indexlang.getAttribute(GSXML.SHORTNAME_ATT);
298 if (shortname.equals(""))
299 {
300 continue;
301 }
302 index_lang_ids.add(shortname);
303 String display_name = GSXML.getDisplayText(indexlang, GSXML.DISPLAY_TEXT_NAME, lang, "en");
304 if (display_name.equals(""))
305 {
306 display_name = indexlang.getAttribute(GSXML.NAME_ATT);
307 if (display_name.equals(""))
308 {
309 display_name = shortname;
310 }
311 }
312 index_lang_names.add(display_name);
313 }
314
315 }
316
317 protected void addCustomQueryParams(Element param_list, String lang)
318 {
319 if (this.does_case)
320 {
321 // gs2 has case on by default
322 createParameter(CASE_PARAM, param_list, lang, BOOLEAN_PARAM_ON);
323 }
324 if (this.does_stem)
325 {
326 // but stem is off by default
327 createParameter(STEM_PARAM, param_list, lang, BOOLEAN_PARAM_OFF);
328 }
329 if (this.does_accent)
330 {
331 // and so is accent folding
332 createParameter(ACCENT_PARAM, param_list, lang, BOOLEAN_PARAM_OFF);
333 }
334 createParameter(MATCH_PARAM, param_list, lang);
335 }
336
337 /** convert indexer internal id to Greenstone oid */
338 protected String internalNum2OID(long docnum)
339 {
340 return this.gs_doc_db.internalNum2OID(docnum);
341 }
342
343 protected String internalNum2OID(String docnum)
344 {
345 return this.gs_doc_db.internalNum2OID(docnum);
346 }
347
348}
Note: See TracBrowser for help on using the repository browser.