Changeset 24024
- Timestamp:
- 2011-05-18T13:24:11+12:00 (13 years ago)
- Location:
- main/trunk/greenstone3/src/java/org/greenstone/gsdl3
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone3/src/java/org/greenstone/gsdl3/service/GS2LuceneSearch.java
r18422 r24024 1 1 /* 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 2 * GS2LuceneSearch.java 3 * Copyright (C) 2006 New Zealand Digital Library, http://www.nzdl.org 4 * 5 * This program is free software; you can redistribute it and/or modify 6 * the Free Software Foundation; either version 2 of the License, or 7 * (at your option) any later version. 8 * 9 * This program is distributed in the hope that it will be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * GNU General Public License for more details. 13 * 14 * You should have received a copy of the GNU General Public License 15 * along with this program; if not, write to the Free Software 16 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 17 */ 18 18 19 19 package org.greenstone.gsdl3.service; … … 42 42 43 43 public class GS2LuceneSearch 44 44 extends AbstractGS2FieldSearch 45 45 { 46 protected static final String RANK_PARAM_RANK_VALUE = "rank"; 47 48 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.service.GS2LuceneSearch.class.getName()); 49 50 private GS2LuceneQuery lucene_src=null; 51 52 public GS2LuceneSearch() 53 { 54 this.lucene_src = new GS2LuceneQuery(); 55 // Lucene uses double operators, not single 56 AND_OPERATOR = "&&"; 57 OR_OPERATOR = "||"; 58 59 does_paging = true; 60 does_chunking = true; 61 } 62 63 public void cleanUp() { 64 super.cleanUp(); 65 this.lucene_src.cleanUp(); 66 } 67 68 /** configure this service */ 69 public boolean configure(Element info, Element extra_info) 70 { 71 if (!super.configure(info, extra_info)){ 72 return false; 73 } 74 75 // Lucene doesn't do case folding or stemming or accent folding at the 76 // moment 77 does_case = false; 78 does_stem = false; 79 does_accent = false; 80 81 return true; 82 } 83 84 /** add in the lucene specific params to TextQuery */ 85 protected void addCustomQueryParams(Element param_list, String lang) 86 { 87 super.addCustomQueryParams(param_list, lang); 88 /** lucenes rank param is based on index fields, not ranked/not */ 89 createParameter(RANK_PARAM, param_list, lang); 90 91 } 92 93 /** create a param and add to the list */ 94 /** we override this to do a special rank param */ 95 protected void createParameter(String name, Element param_list, String lang) 96 { 97 Element param = null; 98 if (name.equals(RANK_PARAM)) { 99 // get the fields 100 ArrayList fields = new ArrayList(); 101 fields.add(RANK_PARAM_RANK_VALUE); 102 ArrayList field_names = new ArrayList(); 103 field_names.add(getTextString("param.sortBy.rank", lang)); 104 getSortByIndexData(fields, field_names, lang); 105 106 param = GSXML.createParameterDescription2(this.doc, name, getTextString("param."+name, lang), GSXML.PARAM_TYPE_ENUM_SINGLE, (String)fields.get(0), fields, field_names ); 107 } 108 if (param != null) { 109 param_list.appendChild(param); 110 } else { 111 super.createParameter(name, param_list, lang); 112 } 113 } 114 115 protected void getSortByIndexData(ArrayList index_ids, ArrayList index_names, String lang) { 116 // the index info - 117 Element index_list = (Element)GSXML.getChildByTagName(this.config_info, INDEX_ELEM+GSXML.LIST_MODIFIER); 118 NodeList indexes = index_list.getElementsByTagName(INDEX_ELEM); 119 int len = indexes.getLength(); 120 // now add even if there is only one 121 for (int i=0; i<len; i++) { 122 Element index = (Element)indexes.item(i); 123 String shortname = index.getAttribute(GSXML.SHORTNAME_ATT); 124 if (shortname.equals("") || shortname.equals("ZZ") || shortname.equals("TX")) { 125 continue; 126 } 127 index_ids.add("by"+shortname); 128 String display_name = GSXML.getDisplayText(index, GSXML.DISPLAY_TEXT_NAME, lang, "en"); 129 if (display_name.equals("")) { 130 display_name = index.getAttribute(GSXML.NAME_ATT); 131 if (display_name.equals("")) { 132 display_name = shortname; 133 } 134 } 135 index_names.add(display_name); 136 137 } 138 139 } 140 141 /** methods to handle actually doing the query */ 142 143 /** do any initialisation of the query object */ 144 protected boolean setUpQueryer(HashMap params) { 145 String indexdir = GSFile.collectionBaseDir(this.site_home, this.cluster_name) + File.separatorChar + "index"+File.separatorChar; 146 147 String index = "didx"; 148 String physical_index_language_name=null; 149 String physical_sub_index_name=null; 150 int maxdocs = 100; 151 int hits_per_page = 20; 152 int start_page = 1; 153 // set up the query params 154 Set entries = params.entrySet(); 155 Iterator i = entries.iterator(); 156 while (i.hasNext()) { 157 Map.Entry m = (Map.Entry)i.next(); 158 String name = (String)m.getKey(); 159 String value = (String)m.getValue(); 160 161 if (name.equals(MAXDOCS_PARAM)&& !value.equals("")) { 162 maxdocs = Integer.parseInt(value); 163 } else if (name.equals(HITS_PER_PAGE_PARAM)) { 164 hits_per_page = Integer.parseInt(value); 165 } else if (name.equals(START_PAGE_PARAM)) { 166 start_page = Integer.parseInt(value); 167 168 } else if (name.equals(MATCH_PARAM)) { 169 if (value.equals(MATCH_PARAM_ALL)) { 170 this.lucene_src.setDefaultConjunctionOperator("AND"); 171 } else{ 172 this.lucene_src.setDefaultConjunctionOperator("OR"); 173 } 174 } else if (name.equals(RANK_PARAM)) { 175 if (value.equals(RANK_PARAM_RANK_VALUE)) { 176 value = null; 177 } 178 this.lucene_src.setSortField(value); 179 } else if (name.equals(LEVEL_PARAM)) { 180 if (value.toUpperCase().equals("SEC")){ 181 index = "sidx"; 182 } 183 else { 184 index = "didx"; 185 } 186 } else if (name.equals(INDEX_SUBCOLLECTION_PARAM)) { 187 physical_sub_index_name=value; 188 } else if (name.equals(INDEX_LANGUAGE_PARAM)){ 189 physical_index_language_name=value; 190 } // ignore any others 191 } 192 // set up start and end results if necessary 193 int start_results = 1; 194 if (start_page != 1) { 195 start_results = ((start_page-1) * hits_per_page) + 1; 196 } 197 int end_results = hits_per_page * start_page; 198 this.lucene_src.setStartResults(start_results); 199 this.lucene_src.setEndResults(end_results); 200 201 202 if (index.equals("sidx") || index.equals("didx")){ 203 if (physical_sub_index_name!=null) { 204 index+=physical_sub_index_name; 205 } 206 if (physical_index_language_name!=null){ 207 index+=physical_index_language_name; 208 } 209 } 210 211 this.lucene_src.setIndexDir(indexdir+index); 212 this.lucene_src.initialise(); 213 return true; 214 } 215 /** do the query */ 216 protected Object runQuery(String query) { 217 try { 218 LuceneQueryResult lqr=this.lucene_src.runQuery(query); 219 return lqr; 220 } catch (Exception e) { 221 logger.error ("exception happened in run query: ", e); 222 } 223 224 return null; 225 } 226 /** get the total number of docs that match */ 227 protected long numDocsMatched(Object query_result) { 228 return ((LuceneQueryResult)query_result).getTotalDocs(); 229 230 } 231 /** get the list of doc ids */ 232 protected String [] getDocIDs(Object query_result) { 233 Vector docs = ((LuceneQueryResult)query_result).getDocs(); 234 String [] doc_nums = new String [docs.size()]; 235 for (int d = 0; d < docs.size(); d++) { 236 String doc_num = ((LuceneQueryResult.DocInfo) docs.elementAt(d)).id_; 237 doc_nums[d] = doc_num; 238 } 239 return doc_nums; 240 } 241 /** get the list of doc ranks */ 242 protected String [] getDocRanks(Object query_result) { 243 Vector docs = ((LuceneQueryResult)query_result).getDocs(); 244 String [] doc_ranks = new String [docs.size()]; 245 for (int d = 0; d < docs.size(); d++) { 246 doc_ranks[d] = Float.toString(((LuceneQueryResult.DocInfo) docs.elementAt(d)).rank_); 247 } 248 return doc_ranks; 249 } 250 /** add in term info if available */ 251 protected boolean addTermInfo(Element term_list, HashMap params, 252 Object query_result) { 253 String query_level = (String)params.get(LEVEL_PARAM); // the current query level 254 255 Vector terms = ((LuceneQueryResult)query_result).getTerms(); 256 for (int t = 0; t < terms.size(); t++) { 257 LuceneQueryResult.TermInfo term_info = (LuceneQueryResult.TermInfo) terms.get(t); 258 259 Element term_elem = this.doc.createElement(GSXML.TERM_ELEM); 260 term_elem.setAttribute(GSXML.NAME_ATT, term_info.term_); 261 term_elem.setAttribute(FREQ_ATT, "" + term_info.term_freq_); 262 term_elem.setAttribute(NUM_DOCS_MATCH_ATT, "" + term_info.match_docs_); 263 term_elem.setAttribute(FIELD_ATT, term_info.field_); 264 term_list.appendChild(term_elem); 265 } 266 return true; 267 } 268 269 protected String addFieldInfo(String query, String field) { 270 if (field.equals("") || field.equals("ZZ")) { 271 return query; 272 } 273 return field+":("+query+")"; 274 } 275 276 protected void addQueryElem(StringBuffer s, String q, String f, String c) { 277 278 String combine=""; 279 if (s.length()>0) { 280 combine = " "+c+" "; 281 } 282 s.append(combine + addFieldInfo(q,f)); 283 } 284 285 /** Lucene doesn't use these options at the moment */ 286 protected String addStemOptions(String query, String stem, 287 String casef, String accent) { 288 return query; 289 } 290 291 /** Lucene no longer uses internal ids. It just uses hash ids. So we need 46 protected static final String RANK_PARAM_RANK_VALUE = "rank"; 47 48 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.service.GS2LuceneSearch.class.getName()); 49 50 private GS2LuceneQuery lucene_src=null; 51 52 public GS2LuceneSearch() 53 { 54 this.lucene_src = new GS2LuceneQuery(); 55 // Lucene uses double operators, not single 56 AND_OPERATOR = "&&"; 57 OR_OPERATOR = "||"; 58 59 does_paging = true; 60 does_chunking = true; 61 } 62 63 public void cleanUp() { 64 super.cleanUp(); 65 this.lucene_src.cleanUp(); 66 } 67 68 /** configure this service */ 69 public boolean configure(Element info, Element extra_info) 70 { 71 if (!super.configure(info, extra_info)){ 72 return false; 73 } 74 75 // Lucene doesn't do case folding or stemming or accent folding at the 76 // moment 77 does_case = false; 78 does_stem = false; 79 does_accent = false; 80 81 return true; 82 } 83 84 /** add in the lucene specific params to TextQuery */ 85 protected void addCustomQueryParams(Element param_list, String lang) 86 { 87 super.addCustomQueryParams(param_list, lang); 88 /** lucenes rank param is based on index fields, not ranked/not */ 89 createParameter(RANK_PARAM, param_list, lang); 90 91 } 92 93 /** create a param and add to the list */ 94 /** we override this to do a special rank param */ 95 protected void createParameter(String name, Element param_list, String lang) 96 { 97 Element param = null; 98 if (name.equals(RANK_PARAM)) { 99 // get the fields 100 ArrayList fields = new ArrayList(); 101 fields.add(RANK_PARAM_RANK_VALUE); 102 ArrayList field_names = new ArrayList(); 103 field_names.add(getTextString("param.sortBy.rank", lang)); 104 getSortByIndexData(fields, field_names, lang); 105 106 param = GSXML.createParameterDescription2(this.doc, name, getTextString("param."+name, lang), GSXML.PARAM_TYPE_ENUM_SINGLE, (String)fields.get(0), fields, field_names ); 107 } 108 if (param != null) { 109 param_list.appendChild(param); 110 } else { 111 super.createParameter(name, param_list, lang); 112 } 113 } 114 115 protected void getSortByIndexData(ArrayList index_ids, ArrayList index_names, String lang) { 116 // the index info - 117 Element index_list = (Element)GSXML.getChildByTagName(this.config_info, INDEX_ELEM+GSXML.LIST_MODIFIER); 118 NodeList indexes = index_list.getElementsByTagName(INDEX_ELEM); 119 int len = indexes.getLength(); 120 // now add even if there is only one 121 for (int i=0; i<len; i++) { 122 Element index = (Element)indexes.item(i); 123 String shortname = index.getAttribute(GSXML.SHORTNAME_ATT); 124 if (shortname.equals("") || shortname.equals("ZZ") || shortname.equals("TX")) { 125 continue; 126 } 127 index_ids.add("by"+shortname); 128 String display_name = GSXML.getDisplayText(index, GSXML.DISPLAY_TEXT_NAME, lang, "en"); 129 if (display_name.equals("")) { 130 display_name = index.getAttribute(GSXML.NAME_ATT); 131 if (display_name.equals("")) { 132 display_name = shortname; 133 } 134 } 135 index_names.add(display_name); 136 137 } 138 139 } 140 141 /** methods to handle actually doing the query */ 142 143 /** do any initialisation of the query object */ 144 protected boolean setUpQueryer(HashMap params) { 145 String indexdir = GSFile.collectionBaseDir(this.site_home, this.cluster_name) + File.separatorChar + "index"+File.separatorChar; 146 147 String index = "didx"; 148 String physical_index_language_name=null; 149 String physical_sub_index_name=null; 150 int maxdocs = 100; 151 int hits_per_page = 20; 152 int start_page = 1; 153 // set up the query params 154 Set entries = params.entrySet(); 155 Iterator i = entries.iterator(); 156 while (i.hasNext()) { 157 Map.Entry m = (Map.Entry)i.next(); 158 String name = (String)m.getKey(); 159 String value = (String)m.getValue(); 160 161 if (name.equals(MAXDOCS_PARAM)&& !value.equals("")) { 162 maxdocs = Integer.parseInt(value); 163 } else if (name.equals(HITS_PER_PAGE_PARAM)) { 164 hits_per_page = Integer.parseInt(value); 165 } else if (name.equals(START_PAGE_PARAM)) { 166 start_page = Integer.parseInt(value); 167 168 } else if (name.equals(MATCH_PARAM)) { 169 if (value.equals(MATCH_PARAM_ALL)) { 170 this.lucene_src.setDefaultConjunctionOperator("AND"); 171 } else{ 172 this.lucene_src.setDefaultConjunctionOperator("OR"); 173 } 174 } else if (name.equals(RANK_PARAM)) { 175 if (value.equals(RANK_PARAM_RANK_VALUE)) { 176 value = null; 177 } 178 this.lucene_src.setSortField(value); 179 } else if (name.equals(LEVEL_PARAM)) { 180 if (value.toUpperCase().equals("SEC")){ 181 index = "sidx"; 182 } 183 else { 184 index = "didx"; 185 } 186 } else if (name.equals(INDEX_SUBCOLLECTION_PARAM)) { 187 physical_sub_index_name=value; 188 } else if (name.equals(INDEX_LANGUAGE_PARAM)){ 189 physical_index_language_name=value; 190 } // ignore any others 191 } 192 // set up start and end results if necessary 193 int start_results = 1; 194 if (start_page != 1) { 195 start_results = ((start_page-1) * hits_per_page) + 1; 196 } 197 int end_results = hits_per_page * start_page; 198 this.lucene_src.setStartResults(start_results); 199 this.lucene_src.setEndResults(end_results); 200 201 202 if (index.equals("sidx") || index.equals("didx")){ 203 if (physical_sub_index_name!=null) { 204 index+=physical_sub_index_name; 205 } 206 if (physical_index_language_name!=null){ 207 index+=physical_index_language_name; 208 } 209 } 210 211 this.lucene_src.setIndexDir(indexdir+index); 212 this.lucene_src.initialise(); 213 return true; 214 } 215 /** do the query */ 216 protected Object runQuery(String query) { 217 try { 218 LuceneQueryResult lqr=this.lucene_src.runQuery(query); 219 return lqr; 220 } catch (Exception e) { 221 logger.error ("exception happened in run query: ", e); 222 } 223 224 return null; 225 } 226 /** get the total number of docs that match */ 227 protected long numDocsMatched(Object query_result) { 228 return ((LuceneQueryResult)query_result).getTotalDocs(); 229 230 } 231 /** get the list of doc ids */ 232 protected String [] getDocIDs(Object query_result) { 233 Vector docs = ((LuceneQueryResult)query_result).getDocs(); 234 String [] doc_nums = new String [docs.size()]; 235 for (int d = 0; d < docs.size(); d++) { 236 String doc_num = ((LuceneQueryResult.DocInfo) docs.elementAt(d)).id_; 237 doc_nums[d] = doc_num; 238 } 239 return doc_nums; 240 } 241 /** get the list of doc ranks */ 242 protected String [] getDocRanks(Object query_result) { 243 Vector docs = ((LuceneQueryResult)query_result).getDocs(); 244 String [] doc_ranks = new String [docs.size()]; 245 for (int d = 0; d < docs.size(); d++) { 246 doc_ranks[d] = Float.toString(((LuceneQueryResult.DocInfo) docs.elementAt(d)).rank_); 247 } 248 return doc_ranks; 249 } 250 /** add in term info if available */ 251 protected boolean addTermInfo(Element term_list, HashMap params, 252 Object query_result) { 253 String query_level = (String)params.get(LEVEL_PARAM); // the current query level 254 255 Vector terms = ((LuceneQueryResult)query_result).getTerms(); 256 for (int t = 0; t < terms.size(); t++) { 257 LuceneQueryResult.TermInfo term_info = (LuceneQueryResult.TermInfo) terms.get(t); 258 259 Element term_elem = this.doc.createElement(GSXML.TERM_ELEM); 260 term_elem.setAttribute(GSXML.NAME_ATT, term_info.term_); 261 term_elem.setAttribute(FREQ_ATT, "" + term_info.term_freq_); 262 term_elem.setAttribute(NUM_DOCS_MATCH_ATT, "" + term_info.match_docs_); 263 term_elem.setAttribute(FIELD_ATT, term_info.field_); 264 term_list.appendChild(term_elem); 265 } 266 267 Vector stopwords = ((LuceneQueryResult)query_result).getStopWords(); 268 for (int t = 0; t < stopwords.size(); t++) { 269 String stopword = (String) stopwords.get(t); 270 271 Element stopword_elem = this.doc.createElement(GSXML.STOPWORD_ELEM); 272 stopword_elem.setAttribute(GSXML.NAME_ATT, stopword); 273 term_list.appendChild(stopword_elem); 274 } 275 276 return true; 277 } 278 279 protected String addFieldInfo(String query, String field) { 280 if (field.equals("") || field.equals("ZZ")) { 281 return query; 282 } 283 return field+":("+query+")"; 284 } 285 286 protected void addQueryElem(StringBuffer s, String q, String f, String c) { 287 288 String combine=""; 289 if (s.length()>0) { 290 combine = " "+c+" "; 291 } 292 s.append(combine + addFieldInfo(q,f)); 293 } 294 295 /** Lucene doesn't use these options at the moment */ 296 protected String addStemOptions(String query, String stem, 297 String casef, String accent) { 298 return query; 299 } 300 301 /** Lucene no longer uses internal ids. It just uses hash ids. So we need 292 302 to override these methods so no conversion is done. */ 293 294 295 296 return Long.toString(docnum);297 298 299 300 301 return docnum;302 303 303 /** convert indexer internal id to Greenstone oid */ 304 protected String internalNum2OID(long docnum) 305 { 306 return Long.toString(docnum); 307 308 } 309 protected String internalNum2OID(String docnum) 310 { 311 return docnum; 312 313 } 304 314 305 315 } -
main/trunk/greenstone3/src/java/org/greenstone/gsdl3/util/GSXML.java
r23968 r24024 74 74 public static final String FORMAT_ELEM = "format"; // config files use format - should we use this instead of stylesheet?? 75 75 public static final String TERM_ELEM = "term"; 76 public static final String STOPWORD_ELEM = "stopword"; 76 77 public static final String SYSTEM_ELEM = "system"; 77 78 public static final String FORMAT_STRING_ELEM = "formatString";
Note:
See TracChangeset
for help on using the changeset viewer.