Context Navigation

GS2LuceneSearch.java@ 38154

Last change on this file since 38154 was 38154, checked in by kjdon, 7 months ago
moved sidx and didx to static strings
Property svn:keywords set to `Author Date Id Revision`
File size: 15.9 KB

Line
1	/*
2	* GS2LuceneSearch.java
3	* Copyright (C) 2006 New Zealand Digital Library, http://www.nzdl.org
4	*
5	* This program is free software; you can redistribute it and/or modify
6	* the Free Software Foundation; either version 2 of the License, or
7	* (at your option) any later version.
8	*
9	* This program is distributed in the hope that it will be useful,
10	* but WITHOUT ANY WARRANTY; without even the implied warranty of
11	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12	* GNU General Public License for more details.
13	*
14	* You should have received a copy of the GNU General Public License
15	* along with this program; if not, write to the Free Software
16	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
17	*/
18
19	package org.greenstone.gsdl3.service;
20
21	// Greenstone classes
22	import java.io.File;
23	import java.io.IOException;
24	import java.io.Serializable;
25	import java.util.ArrayList;
26	import java.util.HashMap;
27	import java.util.Iterator;
28	import java.util.List;
29	import java.util.Map;
30	import java.util.Set;
31	import java.util.Vector;
32
33	// For maintaining Lucene IndexReader objects at collection level
34	import org.apache.lucene.index.DirectoryReader;
35	import org.apache.lucene.index.IndexReader;
36	import org.apache.lucene.store.Directory;
37	import org.apache.lucene.store.FSDirectory;
38
39	import org.apache.log4j.Logger;
40	import org.greenstone.LuceneWrapper4.GS2LuceneQuery;
41	import org.greenstone.LuceneWrapper4.LuceneQueryResult;
42	import org.greenstone.gsdl3.util.FacetWrapper;
43	import org.greenstone.gsdl3.util.GSFile;
44	import org.greenstone.gsdl3.util.GSXML;
45	import org.greenstone.gsdl3.util.XMLConverter;
46	import org.w3c.dom.Document;
47	import org.w3c.dom.Element;
48
49
50	public class GS2LuceneSearch extends SharedSoleneGS2FieldSearch
51	{
52
53	protected static final String SORT_ORDER_PARAM = "reverseSort";
54	protected static final String SORT_ORDER_REVERSE = "1";
55	protected static final String SORT_ORDER_NORMAL = "0";
56
57	// IndexReader objects are to be opened for each index level (e.g. one for didx, one for sidx) of a
58	// collection and will live for the duration of that collection, which is from collection activation
59	// until deactivation.
60	// So we want singletons of each index level's IndexReader, since IndexReaders are "multi-threaded
61	// re-entrant", so there's support for just one reader per index with concurrent access by multiple users'
62	// search queries.
63	// When a collection is deactivated, we need to close the reader objects to prevent handles to the
64	// index lingering and causing file locking issues on windows.
65	// Since GS2LuceneQuery now becomes a local member variable instantiated per query, we have to maintain
66	// IndexReader objects in GS2LuceneSearch instead, as GS2LuceneSearch is a collection's service, and
67	// therefore activated and deactivated along with the collection.
68	// The uniqueness of an IndexReader is indicated in the filepath to its index folder (collection path + sidx/didx).
69	// It doesn't have to be a static map of index_dir to IndexReader, and can be a member variable, since
70	// no other collection will refer to the same didx and sidx index folders: each collection has unique filepaths
71	// to its collection folder's index subdirs, not shared with other collections so the Readers don't have to be
72	// shared between collections either.
73
74	// We now store IndexReaders in a map of singleton index_dir -> IndexReaders opened for this collection:
75	// one Reader singleton for each index_dir
76	private Map<String, IndexReader> index_to_reader_map = new HashMap<String, IndexReader>();
77
78	static Logger logger = Logger.getLogger(org.greenstone.gsdl3.service.GS2LuceneSearch.class.getName());
79
/**
 * Creates the service with paging switched on and registers the
 * non-reversed sort order as the default value of the reverseSort parameter.
 */
public GS2LuceneSearch()
{
    this.does_paging = true;
    this.paramDefaults.put(SORT_ORDER_PARAM, SORT_ORDER_NORMAL);
}
85
86	public void cleanUp()
87	{
88	super.cleanUp();
89
90	// Prevent file locking issues: close all IndexReader objects maintained for this collection
91	synchronized(index_to_reader_map) { // Regular Map implementations are not synchronized, so adding/removing requires synchronizing on the map object.
92	// see https://docs.oracle.com/javase/7/docs/api/java/util/HashMap.html
93	// And ConcurrentHashMap seems complicated, https://docs.oracle.com/javase/7/docs/api/java/util/concurrent/ConcurrentHashMap.html
94
95	// Synchronizing outside the loop because cleanUp() clears the entire HashMap.
96	// Don't let any other threads access the map, hence synchronizing.
97	// Not sure if there may be other threads accessing the map when deactivating a collection which calls cleanUp().
98	// However, when multiple users' search queries lead to adding to the hashmap, definitely need to
99	// synchronize as there's a greater possibility of concurrent access then.
100
101	Iterator<Map.Entry<String,IndexReader>> map_iterator = index_to_reader_map.entrySet().iterator();
102	// Can use the Map.Entry Set view iterator to remove (key, value) entry from underlying Map!
103	// See https://docs.oracle.com/javase/7/docs/api/java/util/HashMap.html#keySet()
104	// Same thread creates the iterator as synchronizes on the map, so we should be allowed to remove() from the map
105	// but only through iterator!
106	while(map_iterator.hasNext()) {
107	Map.Entry<String,IndexReader> entry = map_iterator.next();
108	//index_to_reader_map.remove(...); // concurrentmodexception! Only allowed to remove through iterator. Will remove recent object returned by next()
109	IndexReader reader = entry.getValue(); //keys are index dir paths, e.g. path to current collection's didx folder, values are IndexReader objects
110	map_iterator.remove(); // removes current key's (key,value) entry from underlying map! (Remember, we're iterating on the keyset)
111	// We're first removing the reader singleton from map because reader.close() will only close the reader
112	//if it's the final reference to it in case that has a bearing here
113
114	if(reader != null) { // if there was a reader singleton instantiated for this index directory, e.g. coll-didx, close it
115	try {
116	// We're opening an IndexReader per indexdir once and closing it once: at start and end of collection.
117	// If Reader was a member var of GS2LuceneQuery and if multiple GS2LuceneQuery Objects were to call close() on the
118	// same reader object (on the singleton instance of reader for an index dir), so close is called multiple times,
119	// then would use incRef and decRef, see http://lucene.472066.n3.nabble.com/IndexReader-close-behavior-td2865515.html
120	// But then when concurrent queries are done, the final one would have closed the IndexReader and it would have to
121	// be reopened for the next query. We'd rather keep an opened IndexReader around until the collection's deactivated.
122	reader.close();
123	// Closes files associated with this index. Also saves any new deletions to disk.
124	// No other methods should be called after this has been called.
125	} catch (IOException exception) {
126	exception.printStackTrace();
127	}
128	}
129	} // end loop
130	} // end synchronising on index_to_reader_map
131
132	// Now we've closed all the Readers maintained for this collection and cleared the map.
133	}
134
135	public boolean configure(Element info, Element extra_info)
136	{
137	if (!super.configure(info, extra_info))
138	{
139	return false;
140	}
141	logger.info("Configuring GS2LuceneSearch...");
142
143	// add our reverseSort param to be saved to the session
144	this.save_params.add(SORT_ORDER_PARAM);
145	return true;
146	}
147	/** add in the Lucene specific params to TextQuery */
148	protected void addCustomQueryParams(Element param_list, String lang)
149	{
150	super.addCustomQueryParams(param_list, lang);
151	/** Add in the reverse sort on/off param */
152	createParameter(SORT_ORDER_PARAM, param_list, lang);
153	}
154	/** add in Lucene specific params for AdvancedFieldQuery */
155	protected void addCustomQueryParamsAdvField(Element param_list, String lang)
156	{
157	super.addCustomQueryParamsAdvField(param_list, lang);
158	createParameter(SORT_ORDER_PARAM, param_list, lang);
159
160	}
161	/** create a param and add to the list */
162	protected void createParameter(String name, Element param_list, String lang)
163	{
164	Document doc = param_list.getOwnerDocument();
165	Element param = null;
166	String param_default = paramDefaults.get(name);
167	if (name.equals(SORT_ORDER_PARAM)) {
168	String[] vals = { SORT_ORDER_REVERSE, SORT_ORDER_NORMAL };
169	String[] vals_texts = { getTextString("param." + SORT_ORDER_PARAM + "." + SORT_ORDER_REVERSE, lang), getTextString("param." + SORT_ORDER_PARAM + "." + SORT_ORDER_NORMAL, lang) };
170
171	param = GSXML.createParameterDescription(doc, SORT_ORDER_PARAM, getTextString("param." + SORT_ORDER_PARAM, lang), GSXML.PARAM_TYPE_ENUM_SINGLE, param_default, vals, vals_texts);
172	}
173
174	if (param != null)
175	{
176	param_list.appendChild(param);
177	}
178	else
179	{
180	super.createParameter(name, param_list, lang);
181	}
182
183	}
184
185	/** methods to handle actually doing the query */
186
187	/** do any initialisation of the query object */
188	protected Object setUpQueryer(HashMap params)
189	{
190	// local Query object
191	GS2LuceneQuery lucene_src = new GS2LuceneQuery();
192
193	String indexdir = GSFile.collectionBaseDir(this.site_home, this.cluster_name) + File.separatorChar + "index" + File.separatorChar;
194
195	String index = DOCUMENT_INDEX;
196	if (this.default_level.toUpperCase().equals("SEC")) {
197	index = SECTION_INDEX;
198	}
199	String physical_index_language_name = null;
200	String physical_sub_index_name = null;
201	int hits_per_page = Integer.parseInt(paramDefaults.get(HITS_PER_PAGE_PARAM));
202	int start_page = Integer.parseInt(paramDefaults.get(START_PAGE_PARAM));
203	String sort_field = getLuceneSort(default_sort);
204	String sort_order = paramDefaults.get(SORT_ORDER_PARAM);
205
206	// set up the query params
207	Set entries = params.entrySet();
208	Iterator i = entries.iterator();
209	while (i.hasNext())
210	{
211	Map.Entry m = (Map.Entry) i.next();
212	String name = (String) m.getKey();
213	String value = (String) m.getValue();
214
215	if (name.equals(HITS_PER_PAGE_PARAM))
216	{
217	if (value.equals("all")) {
218	hits_per_page = -1;
219	} else {
220	hits_per_page = Integer.parseInt(value);
221	}
222	}
223	else if (name.equals(START_PAGE_PARAM))
224	{
225	start_page = Integer.parseInt(value);
226
227	}
228	else if (name.equals(MATCH_PARAM))
229	{
230	if (value.equals(MATCH_PARAM_ALL))
231	{
232	lucene_src.setDefaultConjunctionOperator("AND");
233	}
234	else
235	{
236	lucene_src.setDefaultConjunctionOperator("OR");
237	}
238	}
239	else if (name.equals(RANK_PARAM))
240	{
241	sort_field = getLuceneSort(value);
242	lucene_src.setSortField(sort_field);
243
244	}
245	else if (name.equals(SORT_ORDER_PARAM)) {
246	sort_order = value;
247	}
248	else if (name.equals(LEVEL_PARAM))
249	{
250	if (value.toUpperCase().equals("SEC"))
251	{
252	index = SECTION_INDEX;
253	}
254	else
255	{
256	index = DOCUMENT_INDEX;
257	}
258	}
259	else if (name.equals(INDEX_SUBCOLLECTION_PARAM))
260	{
261	physical_sub_index_name = value;
262	}
263	else if (name.equals(INDEX_LANGUAGE_PARAM))
264	{
265	physical_index_language_name = value;
266	} // ignore any others
267	}
268	// set up start and end results if necessary
269	// start results always start at 0
270	int start_results = 0;
271	if (start_page > 1 && hits_per_page > 0)
272	{
273	start_results = ((start_page - 1) * hits_per_page) ;
274	}
275	int end_results = Integer.MAX_VALUE;
276	if (hits_per_page > 0) {
277	end_results = hits_per_page * start_page;
278	}
279	lucene_src.setStartResults(start_results);
280	lucene_src.setEndResults(end_results);
281
282	if (index.equals(SECTION_INDEX) \|\| index.equals(DOCUMENT_INDEX))
283	{
284	if (physical_sub_index_name != null)
285	{
286	index += physical_sub_index_name;
287	}
288	if (physical_index_language_name != null)
289	{
290	index += physical_index_language_name;
291	}
292	}
293
294	if (sort_order.equals(SORT_ORDER_REVERSE)) {
295	lucene_src.setReverseSort(true);
296	} else {
297	lucene_src.setReverseSort(false);
298	}
299
300	String full_index_dir_str = indexdir + index;
301	lucene_src.setIndexDir(full_index_dir_str);
302
303	// Ensure we have an IndexReader for this full_index_dir_str:
304	// check the hashmap first, in case we already opened a reader and searcher for this index dir, e.g. didx
305	// if there was a reader singleton instantiated for this index directory, e.g. <coll>didx, use that.
306	// Else open a new reader for this index_dir and store it in the map.
307	IndexReader reader = index_to_reader_map.get(full_index_dir_str);
308	if(reader == null) {
309	try {
310	Directory full_indexdir_dir = FSDirectory.open(new File(full_index_dir_str));
311	reader = DirectoryReader.open(full_indexdir_dir); // Returns an IndexReader reading the index in the given Directory. now readOnly=true by default, and therefore also for searcher
312	synchronized(index_to_reader_map) {
313	// If storing searcher along with reader, mimic Pairs with: https://stackoverflow.com/questions/2670982/using-pairs-or-2-tuples-in-java
314	index_to_reader_map.put(full_index_dir_str, reader);
315	}
316	}
317	catch (IOException exception) {
318	exception.printStackTrace();
319	}
320	}
321
322	lucene_src.initialise(reader); // sets IndexReader and IndexSearcher
323
324	return lucene_src; // return the queryobject
325	}
326
327	/** do the query */
328	protected Object runQuery(Object queryObject, String query)
329	{
330	GS2LuceneQuery lucene_src = (GS2LuceneQuery) queryObject;
331	try
332	{
333	LuceneQueryResult lqr = lucene_src.runQuery(query);
334	return lqr;
335	}
336	catch (Exception e)
337	{
338	logger.error("Exception happened in runQuery(): ", e);
339	}
340
341	return null;
342	}
343
344	/** get the total number of docs that match */
345	protected long numDocsMatched(Object query_result)
346	{
347	return ((LuceneQueryResult) query_result).getTotalDocs();
348	}
349
350	/** get the list of doc ids */
351	protected String[] getDocIDs(Object query_result)
352	{
353	Vector docs = ((LuceneQueryResult) query_result).getDocs();
354	String[] doc_nums = new String[docs.size()];
355	for (int d = 0; d < docs.size(); d++)
356	{
357	String doc_num = ((LuceneQueryResult.DocInfo) docs.elementAt(d)).id_;
358	doc_nums[d] = doc_num;
359	}
360	return doc_nums;
361	}
362
363	/** get the list of doc ranks */
364	protected String[] getDocRanks(Object query_result)
365	{
366	Vector docs = ((LuceneQueryResult) query_result).getDocs();
367	String[] doc_ranks = new String[docs.size()];
368	for (int d = 0; d < docs.size(); d++)
369	{
370	doc_ranks[d] = Float.toString(((LuceneQueryResult.DocInfo) docs.elementAt(d)).rank_);
371	}
372	return doc_ranks;
373	}
374
375	/** add in term info if available */
376	protected boolean addTermInfo(Element term_list, HashMap params, Object query_result)
377	{
378	Document doc = term_list.getOwnerDocument();
379	String query_level = (String) params.get(LEVEL_PARAM); // the current query level
380
381	Vector terms = ((LuceneQueryResult) query_result).getTerms();
382	for (int t = 0; t < terms.size(); t++)
383	{
384	LuceneQueryResult.TermInfo term_info = (LuceneQueryResult.TermInfo) terms.get(t);
385
386	Element term_elem = doc.createElement(GSXML.TERM_ELEM);
387	term_elem.setAttribute(GSXML.NAME_ATT, term_info.term_);
388	term_elem.setAttribute(FREQ_ATT, "" + term_info.term_freq_);
389	term_elem.setAttribute(NUM_DOCS_MATCH_ATT, "" + term_info.match_docs_);
390	term_elem.setAttribute(FIELD_ATT, term_info.field_);
391	term_list.appendChild(term_elem);
392	}
393
394	Vector stopwords = ((LuceneQueryResult) query_result).getStopWords();
395	for (int t = 0; t < stopwords.size(); t++)
396	{
397	String stopword = (String) stopwords.get(t);
398
399	Element stopword_elem = doc.createElement(GSXML.STOPWORD_ELEM);
400	stopword_elem.setAttribute(GSXML.NAME_ATT, stopword);
401	term_list.appendChild(stopword_elem);
402	}
403
404	return true;
405	}
406
407	protected ArrayList<FacetWrapper> getFacets(Object query_result, String lang)
408	{
409	return null;
410	}
411
412	protected String getLuceneSort(String gs3_sort) {
413
414	if (gs3_sort.equals(RANK_PARAM_RANK)) {
415	return GS2LuceneQuery.SORT_RANK;
416	}
417	if (gs3_sort.equals(RANK_PARAM_NONE)) {
418	return GS2LuceneQuery.SORT_NATURAL;
419	}
420	return gs3_sort;
421	}
422
423	@Override
424	protected Map<String, Map<String, List<String>>> getHighlightSnippets(
425	Object query_result) {
426	// TODO Auto-generated method stub
427	return null;
428	}
429
430	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/service/GS2LuceneSearch.java@ 38154

Download in other formats: