/*
 *    GS2LuceneSearch.java
 *    Copyright (C) 2006 New Zealand Digital Library, http://www.nzdl.org
 *
 *    This program is free software; you can redistribute it and/or modify
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
package org.greenstone.gsdl3.service;

// Greenstone classes
import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Vector;

// For maintaining Lucene IndexReader objects at collection level
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

import org.apache.log4j.Logger;
import org.greenstone.LuceneWrapper4.GS2LuceneQuery;
import org.greenstone.LuceneWrapper4.LuceneQueryResult;
import org.greenstone.gsdl3.util.FacetWrapper;
import org.greenstone.gsdl3.util.GSFile;
import org.greenstone.gsdl3.util.GSXML;
import org.greenstone.gsdl3.util.XMLConverter;
import org.w3c.dom.Document;
import org.w3c.dom.Element;

/**
 * Search service that runs queries against a collection's Lucene indexes
 * through a per-query {@link GS2LuceneQuery} object.
 *
 * <p>
 * One {@link IndexReader} is kept open per index directory (e.g. the
 * collection's didx and sidx folders) for as long as this service lives, and
 * is shared by all queries on that index. The readers are closed in
 * {@link #cleanUp()} so that no handles to the index files linger, which
 * would cause file locking issues on Windows.
 * </p>
 */
public class GS2LuceneSearch extends SharedSoleneGS2FieldSearch
{
    protected static final String SORT_ORDER_PARAM = "reverseSort";
    protected static final String SORT_ORDER_REVERSE = "1";
    protected static final String SORT_ORDER_NORMAL = "0";

    // Map of full index directory path -> the single IndexReader opened for
    // that directory.
    //
    // GS2LuceneQuery objects are created per query, so the readers are kept
    // here instead: this service is activated and deactivated along with its
    // collection. The map is an instance member rather than a static because
    // each collection has its own index folders, so readers never need to be
    // shared between collections.
    //
    // HashMap is not synchronized: every access to this map must happen while
    // holding the map's own monitor.
    private final Map<String, IndexReader> index_to_reader_map = new HashMap<String, IndexReader>();

    static Logger logger = Logger.getLogger(org.greenstone.gsdl3.service.GS2LuceneSearch.class.getName());

    /** Enables paging and registers the default (non-reversed) sort order. */
    public GS2LuceneSearch()
    {
        does_paging = true;
        paramDefaults.put(SORT_ORDER_PARAM, SORT_ORDER_NORMAL);
    }

    /**
     * Releases the resources held by this service: runs the superclass clean
     * up, then closes and forgets every IndexReader opened for this
     * collection.
     */
    public void cleanUp()
    {
        super.cleanUp();

        // Prevent file locking issues: close all IndexReader objects
        // maintained for this collection. The lock is taken around the whole
        // loop because the entire map is being emptied.
        synchronized (index_to_reader_map)
        {
            Iterator<Map.Entry<String, IndexReader>> map_iterator = index_to_reader_map.entrySet().iterator();
            while (map_iterator.hasNext())
            {
                Map.Entry<String, IndexReader> entry = map_iterator.next();
                // Read the entry before removing it; an entry should not be
                // relied upon once its mapping has left the backing map.
                String index_dir = entry.getKey();
                IndexReader reader = entry.getValue();

                // Entries may only be removed through the iterator while
                // iterating; removing through the map itself would raise a
                // ConcurrentModificationException. The reader is dropped from
                // the map before it is closed so the map never hands out a
                // closed reader.
                map_iterator.remove();

                if (reader == null)
                {
                    continue;
                }
                try
                {
                    // Each reader is opened once and closed once, so no
                    // incRef/decRef bookkeeping is needed. No other methods
                    // should be called on the reader after this.
                    reader.close();
                }
                catch (IOException exception)
                {
                    logger.error("Unable to close the IndexReader for index directory " + index_dir, exception);
                }
            }
        }
        // All readers maintained for this collection are now closed and the
        // map is empty.
    }

    /**
     * Configures the service and registers the reverseSort parameter so that
     * it is saved to the session.
     *
     * @return false if the superclass configuration failed, true otherwise
     */
    public boolean configure(Element info, Element extra_info)
    {
        if (!super.configure(info, extra_info))
        {
            return false;
        }

        logger.info("Configuring GS2LuceneSearch...");

        // add our reverseSort param to be saved to the session
        this.save_params.add(SORT_ORDER_PARAM);
        return true;
    }

    /** add in the Lucene specific params to TextQuery */
    protected void addCustomQueryParams(Element param_list, String lang)
    {
        super.addCustomQueryParams(param_list, lang);
        // Add in the reverse sort on/off param
        createParameter(SORT_ORDER_PARAM, param_list, lang);
    }

    /** add in Lucene specific params for AdvancedFieldQuery */
    protected void addCustomQueryParamsAdvField(Element param_list, String lang)
    {
        super.addCustomQueryParamsAdvField(param_list, lang);
        createParameter(SORT_ORDER_PARAM, param_list, lang);
    }

    /**
     * create a param and add to the list. The reverseSort parameter is built
     * here; any other parameter name is delegated to the superclass.
     */
    protected void createParameter(String name, Element param_list, String lang)
    {
        if (!name.equals(SORT_ORDER_PARAM))
        {
            super.createParameter(name, param_list, lang);
            return;
        }

        Document doc = param_list.getOwnerDocument();
        String param_default = paramDefaults.get(name);
        String[] vals = { SORT_ORDER_REVERSE, SORT_ORDER_NORMAL };
        String[] vals_texts = { getTextString("param." + SORT_ORDER_PARAM + "." + SORT_ORDER_REVERSE, lang), getTextString("param." + SORT_ORDER_PARAM + "." + SORT_ORDER_NORMAL, lang) };

        Element param = GSXML.createParameterDescription(doc, SORT_ORDER_PARAM, getTextString("param." + SORT_ORDER_PARAM, lang), GSXML.PARAM_TYPE_ENUM_SINGLE, param_default, vals, vals_texts);
        if (param != null)
        {
            param_list.appendChild(param);
        }
        else
        {
            // Same fallback as before: nothing was created, let the
            // superclass handle the parameter.
            super.createParameter(name, param_list, lang);
        }
    }

    /** methods to handle actually doing the query */

    /**
     * do any initialisation of the query object: builds a fresh
     * GS2LuceneQuery from the defaults and the supplied request parameters,
     * points it at the right index directory and gives it the shared
     * IndexReader for that directory.
     *
     * @param params
     *            request parameter names mapped to their values; the values
     *            of the parameters handled here are cast to String
     * @return the initialised GS2LuceneQuery
     */
    protected Object setUpQueryer(HashMap<String, Serializable> params)
    {
        // local Query object
        GS2LuceneQuery lucene_src = new GS2LuceneQuery();

        String indexdir = GSFile.collectionBaseDir(this.site_home, this.cluster_name) + File.separatorChar + "index" + File.separatorChar;

        String index = "SEC".equalsIgnoreCase(this.default_level) ? "sidx" : "didx";
        String physical_index_language_name = null;
        String physical_sub_index_name = null;
        int hits_per_page = Integer.parseInt(paramDefaults.get(HITS_PER_PAGE_PARAM));
        int start_page = Integer.parseInt(paramDefaults.get(START_PAGE_PARAM));
        // NOTE(review): the default sort field is computed but only handed to
        // the query object when the request carries RANK_PARAM; otherwise
        // GS2LuceneQuery keeps whatever default it has. Confirm this is
        // intended.
        String sort_field = getLuceneSort(default_sort);
        String sort_order = paramDefaults.get(SORT_ORDER_PARAM);

        // set up the query params
        for (Map.Entry<String, Serializable> entry : params.entrySet())
        {
            String name = entry.getKey();
            String value = (String) entry.getValue();

            if (name.equals(HITS_PER_PAGE_PARAM))
            {
                hits_per_page = value.equals("all") ? -1 : Integer.parseInt(value);
            }
            else if (name.equals(START_PAGE_PARAM))
            {
                start_page = Integer.parseInt(value);
            }
            else if (name.equals(MATCH_PARAM))
            {
                lucene_src.setDefaultConjunctionOperator(value.equals(MATCH_PARAM_ALL) ? "AND" : "OR");
            }
            else if (name.equals(RANK_PARAM))
            {
                sort_field = getLuceneSort(value);
                lucene_src.setSortField(sort_field);
            }
            else if (name.equals(SORT_ORDER_PARAM))
            {
                sort_order = value;
            }
            else if (name.equals(LEVEL_PARAM))
            {
                index = "SEC".equalsIgnoreCase(value) ? "sidx" : "didx";
            }
            else if (name.equals(INDEX_SUBCOLLECTION_PARAM))
            {
                physical_sub_index_name = value;
            }
            else if (name.equals(INDEX_LANGUAGE_PARAM))
            {
                physical_index_language_name = value;
            }
            // ignore any others
        }

        // set up start and end results if necessary
        // start results always start at 0; the products are computed as long
        // and saturated so that huge page numbers cannot wrap around
        int start_results = 0;
        if (start_page > 1 && hits_per_page > 0)
        {
            start_results = saturateToInt((start_page - 1L) * hits_per_page);
        }
        int end_results = Integer.MAX_VALUE;
        if (hits_per_page > 0)
        {
            end_results = saturateToInt((long) hits_per_page * start_page);
        }
        lucene_src.setStartResults(start_results);
        lucene_src.setEndResults(end_results);

        // index is always "sidx" or "didx" at this point; the subcollection
        // and language suffixes are appended in that order
        if (physical_sub_index_name != null)
        {
            index += physical_sub_index_name;
        }
        if (physical_index_language_name != null)
        {
            index += physical_index_language_name;
        }

        lucene_src.setReverseSort(SORT_ORDER_REVERSE.equals(sort_order));

        String full_index_dir_str = indexdir + index;
        lucene_src.setIndexDir(full_index_dir_str);

        // Use the reader already opened for this index directory if there is
        // one, otherwise open it now. The reader is null if it could not be
        // opened; it is still passed on, as before.
        IndexReader reader = getOrOpenReader(full_index_dir_str);
        lucene_src.initialise(reader); // sets IndexReader and IndexSearcher

        return lucene_src; // return the query object
    }

    /**
     * Returns the IndexReader kept for the given index directory, opening and
     * remembering one if none exists yet. The lookup and the open happen under
     * one lock so that concurrent queries cannot each open a reader for the
     * same directory and leave one of them unclosed.
     *
     * @return the reader, or null if the index could not be opened
     */
    private IndexReader getOrOpenReader(String full_index_dir_str)
    {
        synchronized (index_to_reader_map)
        {
            IndexReader reader = index_to_reader_map.get(full_index_dir_str);
            if (reader == null)
            {
                try
                {
                    Directory full_indexdir_dir = FSDirectory.open(new File(full_index_dir_str));
                    reader = DirectoryReader.open(full_indexdir_dir);
                    index_to_reader_map.put(full_index_dir_str, reader);
                }
                catch (IOException exception)
                {
                    logger.error("Unable to open an IndexReader for index directory " + full_index_dir_str, exception);
                }
            }
            return reader;
        }
    }

    /** Narrows a long to an int, clamping values outside the int range. */
    private static int saturateToInt(long value)
    {
        if (value > Integer.MAX_VALUE)
        {
            return Integer.MAX_VALUE;
        }
        if (value < Integer.MIN_VALUE)
        {
            return Integer.MIN_VALUE;
        }
        return (int) value;
    }

    /**
     * do the query
     *
     * @return the LuceneQueryResult, or null if running the query threw an
     *         exception (the exception is logged)
     */
    protected Object runQuery(Object queryObject, String query)
    {
        GS2LuceneQuery lucene_src = (GS2LuceneQuery) queryObject;
        try
        {
            LuceneQueryResult lqr = lucene_src.runQuery(query);
            return lqr;
        }
        catch (Exception e)
        {
            logger.error("Exception happened in runQuery(): ", e);
        }

        return null;
    }

    /** get the total number of docs that match */
    protected long numDocsMatched(Object query_result)
    {
        return ((LuceneQueryResult) query_result).getTotalDocs();
    }

    /** get the list of doc ids */
    protected String[] getDocIDs(Object query_result)
    {
        Vector docs = ((LuceneQueryResult) query_result).getDocs();
        String[] doc_nums = new String[docs.size()];
        for (int d = 0; d < docs.size(); d++)
        {
            doc_nums[d] = ((LuceneQueryResult.DocInfo) docs.elementAt(d)).id_;
        }
        return doc_nums;
    }

    /** get the list of doc ranks */
    protected String[] getDocRanks(Object query_result)
    {
        Vector docs = ((LuceneQueryResult) query_result).getDocs();
        String[] doc_ranks = new String[docs.size()];
        for (int d = 0; d < docs.size(); d++)
        {
            doc_ranks[d] = Float.toString(((LuceneQueryResult.DocInfo) docs.elementAt(d)).rank_);
        }
        return doc_ranks;
    }

    /**
     * add in term info if available: appends one term element per query term
     * and one stopword element per stopword to term_list.
     *
     * @return always true
     */
    protected boolean addTermInfo(Element term_list, HashMap<String, Serializable> params, Object query_result)
    {
        Document doc = term_list.getOwnerDocument();
        LuceneQueryResult result = (LuceneQueryResult) query_result;

        Vector terms = result.getTerms();
        for (int t = 0; t < terms.size(); t++)
        {
            LuceneQueryResult.TermInfo term_info = (LuceneQueryResult.TermInfo) terms.get(t);

            Element term_elem = doc.createElement(GSXML.TERM_ELEM);
            term_elem.setAttribute(GSXML.NAME_ATT, term_info.term_);
            term_elem.setAttribute(FREQ_ATT, "" + term_info.term_freq_);
            term_elem.setAttribute(NUM_DOCS_MATCH_ATT, "" + term_info.match_docs_);
            term_elem.setAttribute(FIELD_ATT, term_info.field_);
            term_list.appendChild(term_elem);
        }

        Vector stopwords = result.getStopWords();
        for (int t = 0; t < stopwords.size(); t++)
        {
            String stopword = (String) stopwords.get(t);

            Element stopword_elem = doc.createElement(GSXML.STOPWORD_ELEM);
            stopword_elem.setAttribute(GSXML.NAME_ATT, stopword);
            term_list.appendChild(stopword_elem);
        }

        return true;
    }

    /** Facets are not produced by this service; always returns null. */
    protected ArrayList<FacetWrapper> getFacets(Object query_result, String lang)
    {
        return null;
    }

    /**
     * Maps a Greenstone sort/rank parameter value onto the sort field name
     * used by GS2LuceneQuery. Values other than the rank and none constants
     * are returned unchanged.
     */
    protected String getLuceneSort(String gs3_sort)
    {
        if (gs3_sort.equals(RANK_PARAM_RANK))
        {
            return GS2LuceneQuery.SORT_RANK;
        }
        if (gs3_sort.equals(RANK_PARAM_NONE))
        {
            return GS2LuceneQuery.SORT_NATURAL;
        }
        return gs3_sort;
    }

    /** Highlight snippets are not produced by this service; always returns null. */
    @Override
    protected Map<String, Map<String, List<String>>> getHighlightSnippets(Object query_result)
    {
        // TODO Auto-generated method stub
        return null;
    }
}