source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/service/GS2LuceneSearch.java@ 38154

Last change on this file since 38154 was 38154, checked in by kjdon, 7 months ago

moved sidx and didx to static strings

  • Property svn:keywords set to Author Date Id Revision
File size: 15.9 KB
Line 
1/*
2 * GS2LuceneSearch.java
3 * Copyright (C) 2006 New Zealand Digital Library, http://www.nzdl.org
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
17 */
18
19package org.greenstone.gsdl3.service;
20
21// Greenstone classes
22import java.io.File;
23import java.io.IOException;
24import java.io.Serializable;
25import java.util.ArrayList;
26import java.util.HashMap;
27import java.util.Iterator;
28import java.util.List;
29import java.util.Map;
30import java.util.Set;
31import java.util.Vector;
32
33// For maintaining Lucene IndexReader objects at collection level
34import org.apache.lucene.index.DirectoryReader;
35import org.apache.lucene.index.IndexReader;
36import org.apache.lucene.store.Directory;
37import org.apache.lucene.store.FSDirectory;
38
39import org.apache.log4j.Logger;
40import org.greenstone.LuceneWrapper4.GS2LuceneQuery;
41import org.greenstone.LuceneWrapper4.LuceneQueryResult;
42import org.greenstone.gsdl3.util.FacetWrapper;
43import org.greenstone.gsdl3.util.GSFile;
44import org.greenstone.gsdl3.util.GSXML;
45import org.greenstone.gsdl3.util.XMLConverter;
46import org.w3c.dom.Document;
47import org.w3c.dom.Element;
48
49
50public class GS2LuceneSearch extends SharedSoleneGS2FieldSearch
51{
52
53 protected static final String SORT_ORDER_PARAM = "reverseSort";
54 protected static final String SORT_ORDER_REVERSE = "1";
55 protected static final String SORT_ORDER_NORMAL = "0";
56
57 // IndexReader objects are to be opened for each index level (e.g. one for didx, one for sidx) of a
58 // collection and will live for the duration of that collection, which is from collection activation
59 // until deactivation.
60 // So we want singletons of each index level's IndexReader, since IndexReaders are "multi-threaded
61 // re-entrant", so there's support for just one reader per index with concurrent access by multiple users'
62 // search queries.
63 // When a collection is deactivated, we need to close the reader objects to prevent handles to the
64 // index lingering and causing file locking issues on windows.
65 // Since GS2LuceneQuery now becomes a local member variable instantiated per query, we have to maintain
66 // IndexReader objects in GS2LuceneSearch instead, as GS2LuceneSearch is a collection's service, and
67 // therefore activated and deactivated along with the collection.
68 // The uniqueness of an IndexReader is indicated in the filepath to its index folder (collection path + sidx/didx).
69 // It doesn't have to be a static map of index_dir to IndexReader, and can be a member variable, since
70 // no other collection will refer to the same didx and sidx index folders: each collection has unique filepaths
71 // to its collection folder's index subdirs, not shared with other collections so the Readers don't have to be
72 // shared between collections either.
73
74 // We now store IndexReaders in a map of singleton index_dir -> IndexReaders opened for this collection:
75 // one Reader singleton for each index_dir
76 private Map<String, IndexReader> index_to_reader_map = new HashMap<String, IndexReader>();
77
78 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.service.GS2LuceneSearch.class.getName());
79
/**
 * Creates the service with the does_paging flag switched on and registers
 * SORT_ORDER_NORMAL as the default value of the reverseSort parameter.
 */
public GS2LuceneSearch()
{
    this.does_paging = true;
    this.paramDefaults.put(SORT_ORDER_PARAM, SORT_ORDER_NORMAL);
}
85
86 public void cleanUp()
87 {
88 super.cleanUp();
89
90 // Prevent file locking issues: close all IndexReader objects maintained for this collection
91 synchronized(index_to_reader_map) { // Regular Map implementations are not synchronized, so adding/removing requires synchronizing on the map object.
92 // see https://docs.oracle.com/javase/7/docs/api/java/util/HashMap.html
93 // And ConcurrentHashMap seems complicated, https://docs.oracle.com/javase/7/docs/api/java/util/concurrent/ConcurrentHashMap.html
94
95 // Synchronizing *outside* the loop because cleanUp() clears the entire HashMap.
96 // Don't let any other threads access the map, hence synchronizing.
97 // Not sure if there may be other threads accessing the map when deactivating a collection which calls cleanUp().
98 // However, when multiple users' search queries lead to adding to the hashmap, definitely need to
99 // synchronize as there's a greater possibility of concurrent access then.
100
101 Iterator<Map.Entry<String,IndexReader>> map_iterator = index_to_reader_map.entrySet().iterator();
102 // Can use the Map.Entry Set view iterator to remove (key, value) entry from underlying Map!
103 // See https://docs.oracle.com/javase/7/docs/api/java/util/HashMap.html#keySet()
104 // Same thread creates the iterator as synchronizes on the map, so we should be allowed to remove() from the map
105 // but only through iterator!
106 while(map_iterator.hasNext()) {
107 Map.Entry<String,IndexReader> entry = map_iterator.next();
108 //index_to_reader_map.remove(...); // concurrentmodexception! Only allowed to remove through iterator. Will remove recent object returned by next()
109 IndexReader reader = entry.getValue(); //keys are index dir paths, e.g. path to current collection's didx folder, values are IndexReader objects
110 map_iterator.remove(); // removes current key's (key,value) entry from underlying map! (Remember, we're iterating on the keyset)
111 // We're first removing the reader singleton from map because reader.close() will only close the reader
112 //if it's the final reference to it in case that has a bearing here
113
114 if(reader != null) { // if there was a reader singleton instantiated for this index directory, e.g. coll-didx, close it
115 try {
116 // We're opening an IndexReader per indexdir once and closing it once: at start and end of collection.
117 // If Reader was a member var of GS2LuceneQuery and if multiple GS2LuceneQuery Objects were to call close() on the
118 // same reader object (on the singleton instance of reader for an index dir), so close is called multiple times,
119 // then would use incRef and decRef, see http://lucene.472066.n3.nabble.com/IndexReader-close-behavior-td2865515.html
120 // But then when concurrent queries are done, the final one would have closed the IndexReader and it would have to
121 // be reopened for the next query. We'd rather keep an opened IndexReader around until the collection's deactivated.
122 reader.close();
123 // Closes files associated with this index. Also saves any new deletions to disk.
124 // No other methods should be called after this has been called.
125 } catch (IOException exception) {
126 exception.printStackTrace();
127 }
128 }
129 } // end loop
130 } // end synchronising on index_to_reader_map
131
132 // Now we've closed all the Readers maintained for this collection and cleared the map.
133 }
134
135 public boolean configure(Element info, Element extra_info)
136 {
137 if (!super.configure(info, extra_info))
138 {
139 return false;
140 }
141 logger.info("Configuring GS2LuceneSearch...");
142
143 // add our reverseSort param to be saved to the session
144 this.save_params.add(SORT_ORDER_PARAM);
145 return true;
146 }
147 /** add in the Lucene specific params to TextQuery */
148 protected void addCustomQueryParams(Element param_list, String lang)
149 {
150 super.addCustomQueryParams(param_list, lang);
151 /** Add in the reverse sort on/off param */
152 createParameter(SORT_ORDER_PARAM, param_list, lang);
153 }
154 /** add in Lucene specific params for AdvancedFieldQuery */
155 protected void addCustomQueryParamsAdvField(Element param_list, String lang)
156 {
157 super.addCustomQueryParamsAdvField(param_list, lang);
158 createParameter(SORT_ORDER_PARAM, param_list, lang);
159
160 }
161 /** create a param and add to the list */
162 protected void createParameter(String name, Element param_list, String lang)
163 {
164 Document doc = param_list.getOwnerDocument();
165 Element param = null;
166 String param_default = paramDefaults.get(name);
167 if (name.equals(SORT_ORDER_PARAM)) {
168 String[] vals = { SORT_ORDER_REVERSE, SORT_ORDER_NORMAL };
169 String[] vals_texts = { getTextString("param." + SORT_ORDER_PARAM + "." + SORT_ORDER_REVERSE, lang), getTextString("param." + SORT_ORDER_PARAM + "." + SORT_ORDER_NORMAL, lang) };
170
171 param = GSXML.createParameterDescription(doc, SORT_ORDER_PARAM, getTextString("param." + SORT_ORDER_PARAM, lang), GSXML.PARAM_TYPE_ENUM_SINGLE, param_default, vals, vals_texts);
172 }
173
174 if (param != null)
175 {
176 param_list.appendChild(param);
177 }
178 else
179 {
180 super.createParameter(name, param_list, lang);
181 }
182
183 }
184
185 /** methods to handle actually doing the query */
186
187 /** do any initialisation of the query object */
188 protected Object setUpQueryer(HashMap params)
189 {
190 // local Query object
191 GS2LuceneQuery lucene_src = new GS2LuceneQuery();
192
193 String indexdir = GSFile.collectionBaseDir(this.site_home, this.cluster_name) + File.separatorChar + "index" + File.separatorChar;
194
195 String index = DOCUMENT_INDEX;
196 if (this.default_level.toUpperCase().equals("SEC")) {
197 index = SECTION_INDEX;
198 }
199 String physical_index_language_name = null;
200 String physical_sub_index_name = null;
201 int hits_per_page = Integer.parseInt(paramDefaults.get(HITS_PER_PAGE_PARAM));
202 int start_page = Integer.parseInt(paramDefaults.get(START_PAGE_PARAM));
203 String sort_field = getLuceneSort(default_sort);
204 String sort_order = paramDefaults.get(SORT_ORDER_PARAM);
205
206 // set up the query params
207 Set entries = params.entrySet();
208 Iterator i = entries.iterator();
209 while (i.hasNext())
210 {
211 Map.Entry m = (Map.Entry) i.next();
212 String name = (String) m.getKey();
213 String value = (String) m.getValue();
214
215 if (name.equals(HITS_PER_PAGE_PARAM))
216 {
217 if (value.equals("all")) {
218 hits_per_page = -1;
219 } else {
220 hits_per_page = Integer.parseInt(value);
221 }
222 }
223 else if (name.equals(START_PAGE_PARAM))
224 {
225 start_page = Integer.parseInt(value);
226
227 }
228 else if (name.equals(MATCH_PARAM))
229 {
230 if (value.equals(MATCH_PARAM_ALL))
231 {
232 lucene_src.setDefaultConjunctionOperator("AND");
233 }
234 else
235 {
236 lucene_src.setDefaultConjunctionOperator("OR");
237 }
238 }
239 else if (name.equals(RANK_PARAM))
240 {
241 sort_field = getLuceneSort(value);
242 lucene_src.setSortField(sort_field);
243
244 }
245 else if (name.equals(SORT_ORDER_PARAM)) {
246 sort_order = value;
247 }
248 else if (name.equals(LEVEL_PARAM))
249 {
250 if (value.toUpperCase().equals("SEC"))
251 {
252 index = SECTION_INDEX;
253 }
254 else
255 {
256 index = DOCUMENT_INDEX;
257 }
258 }
259 else if (name.equals(INDEX_SUBCOLLECTION_PARAM))
260 {
261 physical_sub_index_name = value;
262 }
263 else if (name.equals(INDEX_LANGUAGE_PARAM))
264 {
265 physical_index_language_name = value;
266 } // ignore any others
267 }
268 // set up start and end results if necessary
269 // start results always start at 0
270 int start_results = 0;
271 if (start_page > 1 && hits_per_page > 0)
272 {
273 start_results = ((start_page - 1) * hits_per_page) ;
274 }
275 int end_results = Integer.MAX_VALUE;
276 if (hits_per_page > 0) {
277 end_results = hits_per_page * start_page;
278 }
279 lucene_src.setStartResults(start_results);
280 lucene_src.setEndResults(end_results);
281
282 if (index.equals(SECTION_INDEX) || index.equals(DOCUMENT_INDEX))
283 {
284 if (physical_sub_index_name != null)
285 {
286 index += physical_sub_index_name;
287 }
288 if (physical_index_language_name != null)
289 {
290 index += physical_index_language_name;
291 }
292 }
293
294 if (sort_order.equals(SORT_ORDER_REVERSE)) {
295 lucene_src.setReverseSort(true);
296 } else {
297 lucene_src.setReverseSort(false);
298 }
299
300 String full_index_dir_str = indexdir + index;
301 lucene_src.setIndexDir(full_index_dir_str);
302
303 // Ensure we have an IndexReader for this full_index_dir_str:
304 // check the hashmap first, in case we already opened a reader and searcher for this index dir, e.g. didx
305 // if there was a reader singleton instantiated for this index directory, e.g. <coll>didx, use that.
306 // Else open a new reader for this index_dir and store it in the map.
307 IndexReader reader = index_to_reader_map.get(full_index_dir_str);
308 if(reader == null) {
309 try {
310 Directory full_indexdir_dir = FSDirectory.open(new File(full_index_dir_str));
311 reader = DirectoryReader.open(full_indexdir_dir); // Returns an IndexReader reading the index in the given Directory. now readOnly=true by default, and therefore also for searcher
312 synchronized(index_to_reader_map) {
313 // If storing searcher along with reader, mimic Pairs with: https://stackoverflow.com/questions/2670982/using-pairs-or-2-tuples-in-java
314 index_to_reader_map.put(full_index_dir_str, reader);
315 }
316 }
317 catch (IOException exception) {
318 exception.printStackTrace();
319 }
320 }
321
322 lucene_src.initialise(reader); // sets IndexReader and IndexSearcher
323
324 return lucene_src; // return the queryobject
325 }
326
327 /** do the query */
328 protected Object runQuery(Object queryObject, String query)
329 {
330 GS2LuceneQuery lucene_src = (GS2LuceneQuery) queryObject;
331 try
332 {
333 LuceneQueryResult lqr = lucene_src.runQuery(query);
334 return lqr;
335 }
336 catch (Exception e)
337 {
338 logger.error("Exception happened in runQuery(): ", e);
339 }
340
341 return null;
342 }
343
344 /** get the total number of docs that match */
345 protected long numDocsMatched(Object query_result)
346 {
347 return ((LuceneQueryResult) query_result).getTotalDocs();
348 }
349
350 /** get the list of doc ids */
351 protected String[] getDocIDs(Object query_result)
352 {
353 Vector docs = ((LuceneQueryResult) query_result).getDocs();
354 String[] doc_nums = new String[docs.size()];
355 for (int d = 0; d < docs.size(); d++)
356 {
357 String doc_num = ((LuceneQueryResult.DocInfo) docs.elementAt(d)).id_;
358 doc_nums[d] = doc_num;
359 }
360 return doc_nums;
361 }
362
363 /** get the list of doc ranks */
364 protected String[] getDocRanks(Object query_result)
365 {
366 Vector docs = ((LuceneQueryResult) query_result).getDocs();
367 String[] doc_ranks = new String[docs.size()];
368 for (int d = 0; d < docs.size(); d++)
369 {
370 doc_ranks[d] = Float.toString(((LuceneQueryResult.DocInfo) docs.elementAt(d)).rank_);
371 }
372 return doc_ranks;
373 }
374
375 /** add in term info if available */
376 protected boolean addTermInfo(Element term_list, HashMap params, Object query_result)
377 {
378 Document doc = term_list.getOwnerDocument();
379 String query_level = (String) params.get(LEVEL_PARAM); // the current query level
380
381 Vector terms = ((LuceneQueryResult) query_result).getTerms();
382 for (int t = 0; t < terms.size(); t++)
383 {
384 LuceneQueryResult.TermInfo term_info = (LuceneQueryResult.TermInfo) terms.get(t);
385
386 Element term_elem = doc.createElement(GSXML.TERM_ELEM);
387 term_elem.setAttribute(GSXML.NAME_ATT, term_info.term_);
388 term_elem.setAttribute(FREQ_ATT, "" + term_info.term_freq_);
389 term_elem.setAttribute(NUM_DOCS_MATCH_ATT, "" + term_info.match_docs_);
390 term_elem.setAttribute(FIELD_ATT, term_info.field_);
391 term_list.appendChild(term_elem);
392 }
393
394 Vector stopwords = ((LuceneQueryResult) query_result).getStopWords();
395 for (int t = 0; t < stopwords.size(); t++)
396 {
397 String stopword = (String) stopwords.get(t);
398
399 Element stopword_elem = doc.createElement(GSXML.STOPWORD_ELEM);
400 stopword_elem.setAttribute(GSXML.NAME_ATT, stopword);
401 term_list.appendChild(stopword_elem);
402 }
403
404 return true;
405 }
406
407 protected ArrayList<FacetWrapper> getFacets(Object query_result, String lang)
408 {
409 return null;
410 }
411
412 protected String getLuceneSort(String gs3_sort) {
413
414 if (gs3_sort.equals(RANK_PARAM_RANK)) {
415 return GS2LuceneQuery.SORT_RANK;
416 }
417 if (gs3_sort.equals(RANK_PARAM_NONE)) {
418 return GS2LuceneQuery.SORT_NATURAL;
419 }
420 return gs3_sort;
421 }
422
423@Override
424protected Map<String, Map<String, List<String>>> getHighlightSnippets(
425 Object query_result) {
426 // TODO Auto-generated method stub
427 return null;
428}
429
430}
Note: See TracBrowser for help on using the repository browser.