source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/service/GS2LuceneSearch.java@ 32619

Last change on this file since 32619 was 32619, checked in by ak19, 5 years ago

3 significant changes in 1 commit, particularly impacting Lucene queries: 1. Instead of GS2LuceneSearch having a GS2LuceneQuery object member variable for doing each and every search, each query now instantiates its own local GS2LuceneQuery object, configures it for that specific search, runs the search and then the GS2LuceneQuery object expires. This fixes a bug by preventing multiple concurrent searches from picking up the search configurations of other searches run at the same time. 2. Though GS2LuceneQuery objects need to be instantiated once per query over a collection, we don't want to keep reopening a collection's sidx and didx index folders with IndexReader objects for every query. Since IndexReaders support concurrent access, we'd like to use one IndexReader per collection index (one for didx, one for sidx), with the IndexReaders existing for the life of a collection. This meant moving the maintenance of IndexReader objects from GS2LuceneQuery into the GS2LuceneSearch service and turning them into singletons by using a HashMap to maintain (index-dir, reader) pairs. GS3 Services, e.g. GS2LuceneSearch, are loaded and unloaded on collection activate and deactivate respectively. On deactivate, cleanUp() is called on services and other GS3 modules. When GS2LuceneSearch.cleanUp() is called, we now finally close the singleton IndexReader objects/resources that a collection's GS2LuceneSearch object maintains. 3. Redid the previous bugfix (then committed to GS2LuceneQuery): Point 2 again solves the file-locking problem of multiple handles to the index being opened and not all being closed on deactivate, but it is solved in a different and better/more optimal way than in the previous commit.

  • Property svn:keywords set to Author Date Id Revision
File size: 15.9 KB
Line 
1/*
2 * GS2LuceneSearch.java
3 * Copyright (C) 2006 New Zealand Digital Library, http://www.nzdl.org
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
17 */
18
19package org.greenstone.gsdl3.service;
20
21// Greenstone classes
22import java.io.File;
23import java.io.IOException;
24import java.io.Serializable;
25import java.util.ArrayList;
26import java.util.HashMap;
27import java.util.Iterator;
28import java.util.List;
29import java.util.Map;
30import java.util.Set;
31import java.util.Vector;
32
33// For maintaining Lucene IndexReader objects at collection level
34import org.apache.lucene.index.DirectoryReader;
35import org.apache.lucene.index.IndexReader;
36import org.apache.lucene.store.Directory;
37import org.apache.lucene.store.FSDirectory;
38
39import org.apache.log4j.Logger;
40import org.greenstone.LuceneWrapper4.GS2LuceneQuery;
41import org.greenstone.LuceneWrapper4.LuceneQueryResult;
42import org.greenstone.gsdl3.util.FacetWrapper;
43import org.greenstone.gsdl3.util.GSFile;
44import org.greenstone.gsdl3.util.GSXML;
45import org.greenstone.gsdl3.util.XMLConverter;
46import org.w3c.dom.Document;
47import org.w3c.dom.Element;
48
49
50public class GS2LuceneSearch extends SharedSoleneGS2FieldSearch
51{
52
53 protected static final String SORT_ORDER_PARAM = "reverseSort";
54 protected static final String SORT_ORDER_REVERSE = "1";
55 protected static final String SORT_ORDER_NORMAL = "0";
56
57 // IndexReader objects are to be opened for each index level (e.g. one for didx, one for sidx) of a
58 // collection and will live for the duration of that collection, which is from collection activation
59 // until deactivation.
60 // So we want singletons of each index level's IndexReader, since IndexReaders are "multi-threaded
61 // re-entrant", so there's support for just one reader per index with concurrent access by multiple users'
62 // search queries.
63 // When a collection is deactivated, we need to close the reader objects to prevent handles to the
64 // index lingering and causing file locking issues on windows.
65 // Since GS2LuceneQuery now becomes a local member variable instantiated per query, we have to maintain
66 // IndexReader objects in GS2LuceneSearch instead, as GS2LuceneSearch is a collection's service, and
67 // therefore activated and deactivated along with the collection.
68 // The uniqueness of an IndexReader is indicated in the filepath to its index folder (collection path + sidx/didx).
69 // It doesn't have to be a static map of index_dir to IndexReader, and can be a member variable, since
70 // no other collection will refer to the same didx and sidx index folders: each collection has unique filepaths
71 // to its collection folder's index subdirs, not shared with other collections so the Readers don't have to be
72 // shared between collections either.
73
74 // We now store IndexReaders in a map of singleton index_dir -> IndexReaders opened for this collection:
75 // one Reader singleton for each index_dir
76 private Map<String, IndexReader> index_to_reader_map = new HashMap<String, IndexReader>();
77
78 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.service.GS2LuceneSearch.class.getName());
79
/**
 * Creates the service with paging switched on and registers the normal
 * (non-reversed) sort order as the default value of the reverseSort parameter.
 */
public GS2LuceneSearch()
{
    this.does_paging = true;
    this.paramDefaults.put(SORT_ORDER_PARAM, SORT_ORDER_NORMAL);
}
85
86 public void cleanUp()
87 {
88 super.cleanUp();
89
90 // Prevent file locking issues: close all IndexReader objects maintained for this collection
91 synchronized(index_to_reader_map) { // Regular Map implementations are not synchronized, so adding/removing requires synchronizing on the map object.
92 // see https://docs.oracle.com/javase/7/docs/api/java/util/HashMap.html
93 // And ConcurrentHashMap seems complicated, https://docs.oracle.com/javase/7/docs/api/java/util/concurrent/ConcurrentHashMap.html
94
95 // Synchronizing *outside* the loop because cleanUp() clears the entire HashMap.
96 // Don't let any other threads access the map, hence synchronizing.
97 // Not sure if there may be other threads accessing the map when deactivating a collection which calls cleanUp().
98 // However, when multiple users' search queries lead to adding to the hashmap, definitely need to
99 // synchronize as there's a greater possibility of concurrent access then.
100
101 Iterator<Map.Entry<String,IndexReader>> map_iterator = index_to_reader_map.entrySet().iterator();
102 // Can use the Map.Entry Set view iterator to remove (key, value) entry from underlying Map!
103 // See https://docs.oracle.com/javase/7/docs/api/java/util/HashMap.html#keySet()
104 // Same thread creates the iterator as synchronizes on the map, so we should be allowed to remove() from the map
105 // but only through iterator!
106 while(map_iterator.hasNext()) {
107 Map.Entry<String,IndexReader> entry = map_iterator.next();
108 //index_to_reader_map.remove(...); // concurrentmodexception! Only allowed to remove through iterator. Will remove recent object returned by next()
109 IndexReader reader = entry.getValue(); //keys are index dir paths, e.g. path to current collection's didx folder, values are IndexReader objects
110 map_iterator.remove(); // removes current key's (key,value) entry from underlying map! (Remember, we're iterating on the keyset)
111 // We're first removing the reader singleton from map because reader.close() will only close the reader
112 //if it's the final reference to it in case that has a bearing here
113
114 if(reader != null) { // if there was a reader singleton instantiated for this index directory, e.g. coll-didx, close it
115 try {
116 // We're opening an IndexReader per indexdir once and closing it once: at start and end of collection.
117 // If Reader was a member var of GS2LuceneQuery and if multiple GS2LuceneQuery Objects were to call close() on the
118 // same reader object (on the singleton instance of reader for an index dir), so close is called multiple times,
119 // then would use incRef and decRef, see http://lucene.472066.n3.nabble.com/IndexReader-close-behavior-td2865515.html
120 // But then when concurrent queries are done, the final one would have closed the IndexReader and it would have to
121 // be reopened for the next query. We'd rather keep an opened IndexReader around until the collection's deactivated.
122 reader.close();
123 // Closes files associated with this index. Also saves any new deletions to disk.
124 // No other methods should be called after this has been called.
125 } catch (IOException exception) {
126 exception.printStackTrace();
127 }
128 }
129 } // end loop
130 } // end synchronising on index_to_reader_map
131
132 // Now we've closed all the Readers maintained for this collection and cleared the map.
133 }
134
135 public boolean configure(Element info, Element extra_info)
136 {
137 if (!super.configure(info, extra_info))
138 {
139 return false;
140 }
141 logger.info("Configuring GS2LuceneSearch...");
142
143 // add our reverseSort param to be saved to the session
144 this.save_params.add(SORT_ORDER_PARAM);
145 return true;
146 }
147 /** add in the Lucene specific params to TextQuery */
148 protected void addCustomQueryParams(Element param_list, String lang)
149 {
150 super.addCustomQueryParams(param_list, lang);
151 /** Add in the reverse sort on/off param */
152 createParameter(SORT_ORDER_PARAM, param_list, lang);
153 }
154 /** add in Lucene specific params for AdvancedFieldQuery */
155 protected void addCustomQueryParamsAdvField(Element param_list, String lang)
156 {
157 super.addCustomQueryParamsAdvField(param_list, lang);
158 createParameter(SORT_ORDER_PARAM, param_list, lang);
159
160 }
161 /** create a param and add to the list */
162 protected void createParameter(String name, Element param_list, String lang)
163 {
164 Document doc = param_list.getOwnerDocument();
165 Element param = null;
166 String param_default = paramDefaults.get(name);
167 if (name.equals(SORT_ORDER_PARAM)) {
168 String[] vals = { SORT_ORDER_REVERSE, SORT_ORDER_NORMAL };
169 String[] vals_texts = { getTextString("param." + SORT_ORDER_PARAM + "." + SORT_ORDER_REVERSE, lang), getTextString("param." + SORT_ORDER_PARAM + "." + SORT_ORDER_NORMAL, lang) };
170
171 param = GSXML.createParameterDescription(doc, SORT_ORDER_PARAM, getTextString("param." + SORT_ORDER_PARAM, lang), GSXML.PARAM_TYPE_ENUM_SINGLE, param_default, vals, vals_texts);
172 }
173
174 if (param != null)
175 {
176 param_list.appendChild(param);
177 }
178 else
179 {
180 super.createParameter(name, param_list, lang);
181 }
182
183 }
184
185 /** methods to handle actually doing the query */
186
187 /** do any initialisation of the query object */
188 protected Object setUpQueryer(HashMap params)
189 {
190 // local Query object
191 GS2LuceneQuery lucene_src = new GS2LuceneQuery();
192
193 String indexdir = GSFile.collectionBaseDir(this.site_home, this.cluster_name) + File.separatorChar + "index" + File.separatorChar;
194
195 String index = "didx";
196 if (this.default_level.toUpperCase().equals("SEC")) {
197 index = "sidx";
198 }
199 String physical_index_language_name = null;
200 String physical_sub_index_name = null;
201 int hits_per_page = Integer.parseInt(paramDefaults.get(HITS_PER_PAGE_PARAM));
202 int start_page = Integer.parseInt(paramDefaults.get(START_PAGE_PARAM));
203 String sort_field = getLuceneSort(default_sort);
204 String sort_order = paramDefaults.get(SORT_ORDER_PARAM);
205
206 // set up the query params
207 Set entries = params.entrySet();
208 Iterator i = entries.iterator();
209 while (i.hasNext())
210 {
211 Map.Entry m = (Map.Entry) i.next();
212 String name = (String) m.getKey();
213 String value = (String) m.getValue();
214
215 if (name.equals(HITS_PER_PAGE_PARAM))
216 {
217 if (value.equals("all")) {
218 hits_per_page = -1;
219 } else {
220 hits_per_page = Integer.parseInt(value);
221 }
222 }
223 else if (name.equals(START_PAGE_PARAM))
224 {
225 start_page = Integer.parseInt(value);
226
227 }
228 else if (name.equals(MATCH_PARAM))
229 {
230 if (value.equals(MATCH_PARAM_ALL))
231 {
232 lucene_src.setDefaultConjunctionOperator("AND");
233 }
234 else
235 {
236 lucene_src.setDefaultConjunctionOperator("OR");
237 }
238 }
239 else if (name.equals(RANK_PARAM))
240 {
241 sort_field = getLuceneSort(value);
242 lucene_src.setSortField(sort_field);
243
244 }
245 else if (name.equals(SORT_ORDER_PARAM)) {
246 sort_order = value;
247 }
248 else if (name.equals(LEVEL_PARAM))
249 {
250 if (value.toUpperCase().equals("SEC"))
251 {
252 index = "sidx";
253 }
254 else
255 {
256 index = "didx";
257 }
258 }
259 else if (name.equals(INDEX_SUBCOLLECTION_PARAM))
260 {
261 physical_sub_index_name = value;
262 }
263 else if (name.equals(INDEX_LANGUAGE_PARAM))
264 {
265 physical_index_language_name = value;
266 } // ignore any others
267 }
268 // set up start and end results if necessary
269 // start results always start at 0
270 int start_results = 0;
271 if (start_page > 1 && hits_per_page > 0)
272 {
273 start_results = ((start_page - 1) * hits_per_page) ;
274 }
275 int end_results = Integer.MAX_VALUE;
276 if (hits_per_page > 0) {
277 end_results = hits_per_page * start_page;
278 }
279 lucene_src.setStartResults(start_results);
280 lucene_src.setEndResults(end_results);
281
282 if (index.equals("sidx") || index.equals("didx"))
283 {
284 if (physical_sub_index_name != null)
285 {
286 index += physical_sub_index_name;
287 }
288 if (physical_index_language_name != null)
289 {
290 index += physical_index_language_name;
291 }
292 }
293
294 if (sort_order.equals(SORT_ORDER_REVERSE)) {
295 lucene_src.setReverseSort(true);
296 } else {
297 lucene_src.setReverseSort(false);
298 }
299
300 String full_index_dir_str = indexdir + index;
301 lucene_src.setIndexDir(full_index_dir_str);
302
303 // Ensure we have an IndexReader for this full_index_dir_str:
304 // check the hashmap first, in case we already opened a reader and searcher for this index dir, e.g. didx
305 // if there was a reader singleton instantiated for this index directory, e.g. <coll>didx, use that.
306 // Else open a new reader for this index_dir and store it in the map.
307 IndexReader reader = index_to_reader_map.get(full_index_dir_str);
308 if(reader == null) {
309 try {
310 Directory full_indexdir_dir = FSDirectory.open(new File(full_index_dir_str));
311 reader = DirectoryReader.open(full_indexdir_dir); // Returns an IndexReader reading the index in the given Directory. now readOnly=true by default, and therefore also for searcher
312 synchronized(index_to_reader_map) {
313 // If storing searcher along with reader, mimic Pairs with: https://stackoverflow.com/questions/2670982/using-pairs-or-2-tuples-in-java
314 index_to_reader_map.put(full_index_dir_str, reader);
315 }
316 }
317 catch (IOException exception) {
318 exception.printStackTrace();
319 }
320 }
321
322 lucene_src.initialise(reader); // sets IndexReader and IndexSearcher
323
324 return lucene_src; // return the queryobject
325 }
326
327 /** do the query */
328 protected Object runQuery(Object queryObject, String query)
329 {
330 GS2LuceneQuery lucene_src = (GS2LuceneQuery) queryObject;
331 try
332 {
333 LuceneQueryResult lqr = lucene_src.runQuery(query);
334 return lqr;
335 }
336 catch (Exception e)
337 {
338 logger.error("Exception happened in runQuery(): ", e);
339 }
340
341 return null;
342 }
343
344 /** get the total number of docs that match */
345 protected long numDocsMatched(Object query_result)
346 {
347 return ((LuceneQueryResult) query_result).getTotalDocs();
348 }
349
350 /** get the list of doc ids */
351 protected String[] getDocIDs(Object query_result)
352 {
353 Vector docs = ((LuceneQueryResult) query_result).getDocs();
354 String[] doc_nums = new String[docs.size()];
355 for (int d = 0; d < docs.size(); d++)
356 {
357 String doc_num = ((LuceneQueryResult.DocInfo) docs.elementAt(d)).id_;
358 doc_nums[d] = doc_num;
359 }
360 return doc_nums;
361 }
362
363 /** get the list of doc ranks */
364 protected String[] getDocRanks(Object query_result)
365 {
366 Vector docs = ((LuceneQueryResult) query_result).getDocs();
367 String[] doc_ranks = new String[docs.size()];
368 for (int d = 0; d < docs.size(); d++)
369 {
370 doc_ranks[d] = Float.toString(((LuceneQueryResult.DocInfo) docs.elementAt(d)).rank_);
371 }
372 return doc_ranks;
373 }
374
375 /** add in term info if available */
376 protected boolean addTermInfo(Element term_list, HashMap params, Object query_result)
377 {
378 Document doc = term_list.getOwnerDocument();
379 String query_level = (String) params.get(LEVEL_PARAM); // the current query level
380
381 Vector terms = ((LuceneQueryResult) query_result).getTerms();
382 for (int t = 0; t < terms.size(); t++)
383 {
384 LuceneQueryResult.TermInfo term_info = (LuceneQueryResult.TermInfo) terms.get(t);
385
386 Element term_elem = doc.createElement(GSXML.TERM_ELEM);
387 term_elem.setAttribute(GSXML.NAME_ATT, term_info.term_);
388 term_elem.setAttribute(FREQ_ATT, "" + term_info.term_freq_);
389 term_elem.setAttribute(NUM_DOCS_MATCH_ATT, "" + term_info.match_docs_);
390 term_elem.setAttribute(FIELD_ATT, term_info.field_);
391 term_list.appendChild(term_elem);
392 }
393
394 Vector stopwords = ((LuceneQueryResult) query_result).getStopWords();
395 for (int t = 0; t < stopwords.size(); t++)
396 {
397 String stopword = (String) stopwords.get(t);
398
399 Element stopword_elem = doc.createElement(GSXML.STOPWORD_ELEM);
400 stopword_elem.setAttribute(GSXML.NAME_ATT, stopword);
401 term_list.appendChild(stopword_elem);
402 }
403
404 return true;
405 }
406
407 protected ArrayList<FacetWrapper> getFacets(Object query_result, String lang)
408 {
409 return null;
410 }
411
412 protected String getLuceneSort(String gs3_sort) {
413
414 if (gs3_sort.equals(RANK_PARAM_RANK)) {
415 return GS2LuceneQuery.SORT_RANK;
416 }
417 if (gs3_sort.equals(RANK_PARAM_NONE)) {
418 return GS2LuceneQuery.SORT_NATURAL;
419 }
420 return gs3_sort;
421 }
422
423@Override
424protected Map<String, Map<String, List<String>>> getHighlightSnippets(
425 Object query_result) {
426 // TODO Auto-generated method stub
427 return null;
428}
429
430}
Note: See TracBrowser for help on using the repository browser.