Context Navigation

source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/service/GS2LuceneSearch.java@ 32619

Last change on this file since 32619 was 32619, checked in by ak19, 5 years ago
3 significant changes in 1 commit, particularly impacting Lucene queries: 1. Instead of GS2LuceneSearch having a GS2LuceneQuery object member variable for doing each and every search, each query now instantiates its own local GS2LuceneQuery object, configures it for that specific search, runs the search and then the GS2LuceneQuery object expires. This fixes a bug by preventing multiple concurrent searches from getting the search configurations of other searches run at the same time. 2. Though GS2LuceneQuery objects need to be instantiated 1 per query over a collection, we don't want to keep reopening a collection's sidx and didx index folders with IndexReader objects for every query. Since IndexReaders support concurrent access, we'd like to use one IndexReader per collection index (one for didx, one for sidx) with the IndexReaders existing for the life of a collection. This meant moving the maintaining of IndexReader objects from GS2LuceneQuery into the GS2LuceneSearch service and turning them into singletons by using a HashMap to maintain (index-dir, reader) pairs. GS3 Services, e.g. GS2LuceneSearch, are loaded and unloaded on collection activate and deactivate respectively. On deactivate, cleanUp() is called on services and other GS3 modules. When GS2LuceneSearch.cleanUp() is called, we now finally close the singleton IndexReader objects/resources that a collection's GS2LuceneSearch object maintains. 3. Redid previous bugfix (then committed to GS2LuceneQuery): Point 2 again solves the file-locking problem of multiple handles to the index being opened and not all being closed on deactivate, but it's solved in a different and better/more optimal way than in the previous commit.
Property svn:keywords set to `Author Date Id Revision`
File size: 15.9 KB

Line
1	/*
2	* GS2LuceneSearch.java
3	* Copyright (C) 2006 New Zealand Digital Library, http://www.nzdl.org
4	*
5	* This program is free software; you can redistribute it and/or modify
6	* the Free Software Foundation; either version 2 of the License, or
7	* (at your option) any later version.
8	*
9	* This program is distributed in the hope that it will be useful,
10	* but WITHOUT ANY WARRANTY; without even the implied warranty of
11	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12	* GNU General Public License for more details.
13	*
14	* You should have received a copy of the GNU General Public License
15	* along with this program; if not, write to the Free Software
16	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
17	*/
18
19	package org.greenstone.gsdl3.service;
20
21	// Greenstone classes
22	import java.io.File;
23	import java.io.IOException;
24	import java.io.Serializable;
25	import java.util.ArrayList;
26	import java.util.HashMap;
27	import java.util.Iterator;
28	import java.util.List;
29	import java.util.Map;
30	import java.util.Set;
31	import java.util.Vector;
32
33	// For maintaining Lucene IndexReader objects at collection level
34	import org.apache.lucene.index.DirectoryReader;
35	import org.apache.lucene.index.IndexReader;
36	import org.apache.lucene.store.Directory;
37	import org.apache.lucene.store.FSDirectory;
38
39	import org.apache.log4j.Logger;
40	import org.greenstone.LuceneWrapper4.GS2LuceneQuery;
41	import org.greenstone.LuceneWrapper4.LuceneQueryResult;
42	import org.greenstone.gsdl3.util.FacetWrapper;
43	import org.greenstone.gsdl3.util.GSFile;
44	import org.greenstone.gsdl3.util.GSXML;
45	import org.greenstone.gsdl3.util.XMLConverter;
46	import org.w3c.dom.Document;
47	import org.w3c.dom.Element;
48
49
50	public class GS2LuceneSearch extends SharedSoleneGS2FieldSearch
51	{
52
53	protected static final String SORT_ORDER_PARAM = "reverseSort";
54	protected static final String SORT_ORDER_REVERSE = "1";
55	protected static final String SORT_ORDER_NORMAL = "0";
56
57	// IndexReader objects are to be opened for each index level (e.g. one for didx, one for sidx) of a
58	// collection and will live for the duration of that collection, which is from collection activation
59	// until deactivation.
60	// So we want singletons of each index level's IndexReader, since IndexReaders are "multi-threaded
61	// re-entrant", so there's support for just one reader per index with concurrent access by multiple users'
62	// search queries.
63	// When a collection is deactivated, we need to close the reader objects to prevent handles to the
64	// index lingering and causing file locking issues on windows.
65	// Since GS2LuceneQuery now becomes a local member variable instantiated per query, we have to maintain
66	// IndexReader objects in GS2LuceneSearch instead, as GS2LuceneSearch is a collection's service, and
67	// therefore activated and deactivated along with the collection.
68	// The uniqueness of an IndexReader is indicated in the filepath to its index folder (collection path + sidx/didx).
69	// It doesn't have to be a static map of index_dir to IndexReader, and can be a member variable, since
70	// no other collection will refer to the same didx and sidx index folders: each collection has unique filepaths
71	// to its collection folder's index subdirs, not shared with other collections so the Readers don't have to be
72	// shared between collections either.
73
74	// We now store IndexReaders in a map of singleton index_dir -> IndexReaders opened for this collection:
75	// one Reader singleton for each index_dir
76	private Map<String, IndexReader> index_to_reader_map = new HashMap<String, IndexReader>();
77
78	static Logger logger = Logger.getLogger(org.greenstone.gsdl3.service.GS2LuceneSearch.class.getName());
79
80	public GS2LuceneSearch()
81	{
82	does_paging = true;
83	paramDefaults.put(SORT_ORDER_PARAM, SORT_ORDER_NORMAL);
84	}
85
86	public void cleanUp()
87	{
88	super.cleanUp();
89
90	// Prevent file locking issues: close all IndexReader objects maintained for this collection
91	synchronized(index_to_reader_map) { // Regular Map implementations are not synchronized, so adding/removing requires synchronizing on the map object.
92	// see https://docs.oracle.com/javase/7/docs/api/java/util/HashMap.html
93	// And ConcurrentHashMap seems complicated, https://docs.oracle.com/javase/7/docs/api/java/util/concurrent/ConcurrentHashMap.html
94
95	// Synchronizing outside the loop because cleanUp() clears the entire HashMap.
96	// Don't let any other threads access the map, hence synchronizing.
97	// Not sure if there may be other threads accessing the map when deactivating a collection which calls cleanUp().
98	// However, when multiple users' search queries lead to adding to the hashmap, definitely need to
99	// synchronize as there's a greater possibility of concurrent access then.
100
101	Iterator<Map.Entry<String,IndexReader>> map_iterator = index_to_reader_map.entrySet().iterator();
102	// Can use the Map.Entry Set view iterator to remove (key, value) entry from underlying Map!
103	// See https://docs.oracle.com/javase/7/docs/api/java/util/HashMap.html#keySet()
104	// Same thread creates the iterator as synchronizes on the map, so we should be allowed to remove() from the map
105	// but only through iterator!
106	while(map_iterator.hasNext()) {
107	Map.Entry<String,IndexReader> entry = map_iterator.next();
108	//index_to_reader_map.remove(...); // concurrentmodexception! Only allowed to remove through iterator. Will remove recent object returned by next()
109	IndexReader reader = entry.getValue(); //keys are index dir paths, e.g. path to current collection's didx folder, values are IndexReader objects
110	map_iterator.remove(); // removes current key's (key,value) entry from underlying map! (Remember, we're iterating on the keyset)
111	// We're first removing the reader singleton from map because reader.close() will only close the reader
112	//if it's the final reference to it in case that has a bearing here
113
114	if(reader != null) { // if there was a reader singleton instantiated for this index directory, e.g. coll-didx, close it
115	try {
116	// We're opening an IndexReader per indexdir once and closing it once: at start and end of collection.
117	// If Reader was a member var of GS2LuceneQuery and if multiple GS2LuceneQuery Objects were to call close() on the
118	// same reader object (on the singleton instance of reader for an index dir), so close is called multiple times,
119	// then would use incRef and decRef, see http://lucene.472066.n3.nabble.com/IndexReader-close-behavior-td2865515.html
120	// But then when concurrent queries are done, the final one would have closed the IndexReader and it would have to
121	// be reopened for the next query. We'd rather keep an opened IndexReader around until the collection's deactivated.
122	reader.close();
123	// Closes files associated with this index. Also saves any new deletions to disk.
124	// No other methods should be called after this has been called.
125	} catch (IOException exception) {
126	exception.printStackTrace();
127	}
128	}
129	} // end loop
130	} // end synchronising on index_to_reader_map
131
132	// Now we've closed all the Readers maintained for this collection and cleared the map.
133	}
134
135	public boolean configure(Element info, Element extra_info)
136	{
137	if (!super.configure(info, extra_info))
138	{
139	return false;
140	}
141	logger.info("Configuring GS2LuceneSearch...");
142
143	// add our reverseSort param to be saved to the session
144	this.save_params.add(SORT_ORDER_PARAM);
145	return true;
146	}
147	/** add in the Lucene specific params to TextQuery */
148	protected void addCustomQueryParams(Element param_list, String lang)
149	{
150	super.addCustomQueryParams(param_list, lang);
151	/** Add in the reverse sort on/off param */
152	createParameter(SORT_ORDER_PARAM, param_list, lang);
153	}
154	/** add in Lucene specific params for AdvancedFieldQuery */
155	protected void addCustomQueryParamsAdvField(Element param_list, String lang)
156	{
157	super.addCustomQueryParamsAdvField(param_list, lang);
158	createParameter(SORT_ORDER_PARAM, param_list, lang);
159
160	}
161	/** create a param and add to the list */
162	protected void createParameter(String name, Element param_list, String lang)
163	{
164	Document doc = param_list.getOwnerDocument();
165	Element param = null;
166	String param_default = paramDefaults.get(name);
167	if (name.equals(SORT_ORDER_PARAM)) {
168	String[] vals = { SORT_ORDER_REVERSE, SORT_ORDER_NORMAL };
169	String[] vals_texts = { getTextString("param." + SORT_ORDER_PARAM + "." + SORT_ORDER_REVERSE, lang), getTextString("param." + SORT_ORDER_PARAM + "." + SORT_ORDER_NORMAL, lang) };
170
171	param = GSXML.createParameterDescription(doc, SORT_ORDER_PARAM, getTextString("param." + SORT_ORDER_PARAM, lang), GSXML.PARAM_TYPE_ENUM_SINGLE, param_default, vals, vals_texts);
172	}
173
174	if (param != null)
175	{
176	param_list.appendChild(param);
177	}
178	else
179	{
180	super.createParameter(name, param_list, lang);
181	}
182
183	}
184
185	/** methods to handle actually doing the query */
186
187	/** do any initialisation of the query object */
188	protected Object setUpQueryer(HashMap params)
189	{
190	// local Query object
191	GS2LuceneQuery lucene_src = new GS2LuceneQuery();
192
193	String indexdir = GSFile.collectionBaseDir(this.site_home, this.cluster_name) + File.separatorChar + "index" + File.separatorChar;
194
195	String index = "didx";
196	if (this.default_level.toUpperCase().equals("SEC")) {
197	index = "sidx";
198	}
199	String physical_index_language_name = null;
200	String physical_sub_index_name = null;
201	int hits_per_page = Integer.parseInt(paramDefaults.get(HITS_PER_PAGE_PARAM));
202	int start_page = Integer.parseInt(paramDefaults.get(START_PAGE_PARAM));
203	String sort_field = getLuceneSort(default_sort);
204	String sort_order = paramDefaults.get(SORT_ORDER_PARAM);
205
206	// set up the query params
207	Set entries = params.entrySet();
208	Iterator i = entries.iterator();
209	while (i.hasNext())
210	{
211	Map.Entry m = (Map.Entry) i.next();
212	String name = (String) m.getKey();
213	String value = (String) m.getValue();
214
215	if (name.equals(HITS_PER_PAGE_PARAM))
216	{
217	if (value.equals("all")) {
218	hits_per_page = -1;
219	} else {
220	hits_per_page = Integer.parseInt(value);
221	}
222	}
223	else if (name.equals(START_PAGE_PARAM))
224	{
225	start_page = Integer.parseInt(value);
226
227	}
228	else if (name.equals(MATCH_PARAM))
229	{
230	if (value.equals(MATCH_PARAM_ALL))
231	{
232	lucene_src.setDefaultConjunctionOperator("AND");
233	}
234	else
235	{
236	lucene_src.setDefaultConjunctionOperator("OR");
237	}
238	}
239	else if (name.equals(RANK_PARAM))
240	{
241	sort_field = getLuceneSort(value);
242	lucene_src.setSortField(sort_field);
243
244	}
245	else if (name.equals(SORT_ORDER_PARAM)) {
246	sort_order = value;
247	}
248	else if (name.equals(LEVEL_PARAM))
249	{
250	if (value.toUpperCase().equals("SEC"))
251	{
252	index = "sidx";
253	}
254	else
255	{
256	index = "didx";
257	}
258	}
259	else if (name.equals(INDEX_SUBCOLLECTION_PARAM))
260	{
261	physical_sub_index_name = value;
262	}
263	else if (name.equals(INDEX_LANGUAGE_PARAM))
264	{
265	physical_index_language_name = value;
266	} // ignore any others
267	}
268	// set up start and end results if necessary
269	// start results always start at 0
270	int start_results = 0;
271	if (start_page > 1 && hits_per_page > 0)
272	{
273	start_results = ((start_page - 1) * hits_per_page) ;
274	}
275	int end_results = Integer.MAX_VALUE;
276	if (hits_per_page > 0) {
277	end_results = hits_per_page * start_page;
278	}
279	lucene_src.setStartResults(start_results);
280	lucene_src.setEndResults(end_results);
281
282	if (index.equals("sidx") \|\| index.equals("didx"))
283	{
284	if (physical_sub_index_name != null)
285	{
286	index += physical_sub_index_name;
287	}
288	if (physical_index_language_name != null)
289	{
290	index += physical_index_language_name;
291	}
292	}
293
294	if (sort_order.equals(SORT_ORDER_REVERSE)) {
295	lucene_src.setReverseSort(true);
296	} else {
297	lucene_src.setReverseSort(false);
298	}
299
300	String full_index_dir_str = indexdir + index;
301	lucene_src.setIndexDir(full_index_dir_str);
302
303	// Ensure we have an IndexReader for this full_index_dir_str:
304	// check the hashmap first, in case we already opened a reader and searcher for this index dir, e.g. didx
305	// if there was a reader singleton instantiated for this index directory, e.g. <coll>didx, use that.
306	// Else open a new reader for this index_dir and store it in the map.
307	IndexReader reader = index_to_reader_map.get(full_index_dir_str);
308	if(reader == null) {
309	try {
310	Directory full_indexdir_dir = FSDirectory.open(new File(full_index_dir_str));
311	reader = DirectoryReader.open(full_indexdir_dir); // Returns an IndexReader reading the index in the given Directory. now readOnly=true by default, and therefore also for searcher
312	synchronized(index_to_reader_map) {
313	// If storing searcher along with reader, mimic Pairs with: https://stackoverflow.com/questions/2670982/using-pairs-or-2-tuples-in-java
314	index_to_reader_map.put(full_index_dir_str, reader);
315	}
316	}
317	catch (IOException exception) {
318	exception.printStackTrace();
319	}
320	}
321
322	lucene_src.initialise(reader); // sets IndexReader and IndexSearcher
323
324	return lucene_src; // return the queryobject
325	}
326
327	/** do the query */
328	protected Object runQuery(Object queryObject, String query)
329	{
330	GS2LuceneQuery lucene_src = (GS2LuceneQuery) queryObject;
331	try
332	{
333	LuceneQueryResult lqr = lucene_src.runQuery(query);
334	return lqr;
335	}
336	catch (Exception e)
337	{
338	logger.error("Exception happened in runQuery(): ", e);
339	}
340
341	return null;
342	}
343
344	/** get the total number of docs that match */
345	protected long numDocsMatched(Object query_result)
346	{
347	return ((LuceneQueryResult) query_result).getTotalDocs();
348	}
349
350	/** get the list of doc ids */
351	protected String[] getDocIDs(Object query_result)
352	{
353	Vector docs = ((LuceneQueryResult) query_result).getDocs();
354	String[] doc_nums = new String[docs.size()];
355	for (int d = 0; d < docs.size(); d++)
356	{
357	String doc_num = ((LuceneQueryResult.DocInfo) docs.elementAt(d)).id_;
358	doc_nums[d] = doc_num;
359	}
360	return doc_nums;
361	}
362
363	/** get the list of doc ranks */
364	protected String[] getDocRanks(Object query_result)
365	{
366	Vector docs = ((LuceneQueryResult) query_result).getDocs();
367	String[] doc_ranks = new String[docs.size()];
368	for (int d = 0; d < docs.size(); d++)
369	{
370	doc_ranks[d] = Float.toString(((LuceneQueryResult.DocInfo) docs.elementAt(d)).rank_);
371	}
372	return doc_ranks;
373	}
374
375	/** add in term info if available */
376	protected boolean addTermInfo(Element term_list, HashMap params, Object query_result)
377	{
378	Document doc = term_list.getOwnerDocument();
379	String query_level = (String) params.get(LEVEL_PARAM); // the current query level
380
381	Vector terms = ((LuceneQueryResult) query_result).getTerms();
382	for (int t = 0; t < terms.size(); t++)
383	{
384	LuceneQueryResult.TermInfo term_info = (LuceneQueryResult.TermInfo) terms.get(t);
385
386	Element term_elem = doc.createElement(GSXML.TERM_ELEM);
387	term_elem.setAttribute(GSXML.NAME_ATT, term_info.term_);
388	term_elem.setAttribute(FREQ_ATT, "" + term_info.term_freq_);
389	term_elem.setAttribute(NUM_DOCS_MATCH_ATT, "" + term_info.match_docs_);
390	term_elem.setAttribute(FIELD_ATT, term_info.field_);
391	term_list.appendChild(term_elem);
392	}
393
394	Vector stopwords = ((LuceneQueryResult) query_result).getStopWords();
395	for (int t = 0; t < stopwords.size(); t++)
396	{
397	String stopword = (String) stopwords.get(t);
398
399	Element stopword_elem = doc.createElement(GSXML.STOPWORD_ELEM);
400	stopword_elem.setAttribute(GSXML.NAME_ATT, stopword);
401	term_list.appendChild(stopword_elem);
402	}
403
404	return true;
405	}
406
407	protected ArrayList<FacetWrapper> getFacets(Object query_result, String lang)
408	{
409	return null;
410	}
411
412	protected String getLuceneSort(String gs3_sort) {
413
414	if (gs3_sort.equals(RANK_PARAM_RANK)) {
415	return GS2LuceneQuery.SORT_RANK;
416	}
417	if (gs3_sort.equals(RANK_PARAM_NONE)) {
418	return GS2LuceneQuery.SORT_NATURAL;
419	}
420	return gs3_sort;
421	}
422
423	@Override
424	protected Map<String, Map<String, List<String>>> getHighlightSnippets(
425	Object query_result) {
426	// TODO Auto-generated method stub
427	return null;
428	}
429
430	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: