Changeset 13576
- Timestamp:
- 2007-01-11T14:55:04+13:00 (17 years ago)
- Location:
- trunk/gsdl3/src/java/org/greenstone/gsdl3/service
- Files:
- 2 edited
Legend:
- Unmodified
- Added
- Removed
trunk/gsdl3/src/java/org/greenstone/gsdl3/service/GS2LuceneRetrieve.java
r13270 r13576 20 20 21 21 // Greenstone classes 22 import org.greenstone.mgpp.*;23 22 import org.greenstone.gsdl3.core.GSException; 24 23 import org.greenstone.gsdl3.util.GSFile; 25 24 import org.greenstone.gsdl3.util.GSXML; 26 25 import org.greenstone.gsdl3.util.GDBMWrapper; 26 import org.greenstone.gsdl3.util.DBInfo; 27 import org.greenstone.gsdl3.util.GSHTML; 28 import org.greenstone.gsdl3.util.OID; 27 29 // XML classes 30 import org.w3c.dom.Document; 28 31 import org.w3c.dom.Element; 29 32 import org.w3c.dom.Text; … … 34 37 import org.apache.log4j.Logger; 35 38 39 /** Retrieve documents from a gs2 lucene collection. Note that this doesn't 40 actually use lucene, as the documents are stored in XML files */ 36 41 public class GS2LuceneRetrieve 37 42 extends AbstractGS2DocumentRetrieve 38 43 { 39 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.service.GS2LuceneRetrieve.class.getName()); 44 45 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.service.GS2LuceneRetrieve.class.getName()); 46 47 48 protected static final String DOC_LEVEL="Doc"; 49 protected static final String SEC_LEVEL="Sec"; 50 protected static final String ID_ATT = "gs2:id"; 40 51 41 52 // Parameters used 42 53 private static final String LEVEL_PARAM = "level"; 43 54 44 55 // Elements used in the config file that are specific to this class 45 56 private static final String DEFAULT_LEVEL_ELEM = "defaultLevel"; 46 47 private MGPPWrapper mgpp_src = null;48 57 49 58 private String default_level = null; 50 private String mgpp_textdir = null; 51 59 private String text_dir = null; 60 61 private boolean text_available = true; 62 52 63 public GS2LuceneRetrieve() { 53 this.mgpp_src = new MGPPWrapper();54 64 } 55 65 56 66 public void cleanUp() { 57 67 super.cleanUp(); 58 this.mgpp_src.unloadIndexData();59 68 } 60 69 61 70 /** configure this service */ 62 71 public boolean configure(Element info, Element extra_info) … … 65 74 return false; 66 75 } 67 76 68 77 // Do specific configuration 
69 78 logger.info("Configuring GS2LuceneRetrieve..."); 70 71 // TODO - is there anything we need to do here? 72 // set up XML parser?? 73 79 80 text_dir = GSFile.collectionIndexDir(this.site_home, this.cluster_name) + File.separatorChar+"text"+File.separatorChar; 81 if (!(new File(text_dir).isDirectory())) { 82 logger.error("Text directory "+text_dir+" does not exist, will be unable to retrieve text for "+cluster_name); 83 text_available = false; 84 return true; // return true so that we still get the other services for the collection 85 } 86 // Get the default level out of <defaultLevel> (buildConfig.xml) 87 Element def = (Element) GSXML.getChildByTagName(info, DEFAULT_LEVEL_ELEM); 88 if (def != null) { 89 this.default_level = def.getAttribute(GSXML.NAME_ATT); 90 } 91 if (this.default_level == null || this.default_level.equals("")) { 92 logger.error("Default level not specified for "+this.cluster_name+", assuming "+DOC_LEVEL); 93 this.default_level = DOC_LEVEL; 94 } 95 74 96 return true; 75 97 76 98 } 77 99 … … 80 102 * <nodeContent>text content or other elements</nodeContent> 81 103 */ 82 protected Element getNodeContent(String doc_id) throws GSException { 83 104 protected Element getNodeContent(String doc_id, String lang) throws GSException { 105 String doc_content = getTextString("TextRetrievalError", lang); 106 try { 107 if (!text_available) { 108 throw new Exception("No text directory available"); 109 } 110 111 long doc_num = this.gdbm_src.OID2Docnum(doc_id); 112 if (doc_num == -1) { 113 throw new Exception("OID "+doc_id +" couldn't be converted to lucene doc num"); 114 } 115 116 DBInfo info=this.gdbm_src.getInfo(OID.getTop(doc_id)); 117 if (info == null) { 118 throw new Exception("Couldn't get GDBM database entry for "+OID.getTop(doc_id)); 119 } 120 121 String archivedir=info.getInfo("archivedir"); 122 File doc_xml_file = new File(text_dir+archivedir+File.separatorChar+"doc.xml"); 123 if (!doc_xml_file.isFile()) { 124 throw new Exception("Doc XML file 
"+doc_xml_file.getPath()+" does not exist"); 125 } 126 Document doc_xml_doc = this.converter.getDOM(doc_xml_file); 127 if (doc_xml_doc == null) { 128 throw new Exception("Couldn't parse file "+doc_xml_file.getPath()); 129 } 130 Element full_document = doc_xml_doc.getDocumentElement(); 131 if (full_document == null) { 132 throw new Exception("Couldn't parse file "+doc_xml_file.getPath()); 133 } 134 Element current_section = null; 135 if (default_level.equals(DOC_LEVEL)) { 136 current_section = full_document; 137 } else { 138 current_section = GSXML.getNamedElement(full_document, SEC_LEVEL, ID_ATT, String.valueOf(doc_num)); 139 } 140 if (current_section == null) { 141 throw new Exception("Couldn't find section "+ doc_num+" in file "+doc_xml_file.getPath()); 142 } 143 doc_content = GSXML.getNodeText(current_section); 144 if (doc_content == null) { 145 doc_content = ""; 146 } else { 147 doc_content = resolveTextMacros(doc_content, doc_id, lang); 148 } 149 } catch (Exception e) { 150 logger.error("Error trying to get document text for "+doc_id+" in collection "+this.cluster_name+": "+e); 151 } 84 152 85 153 Element content_node = this.doc.createElement(GSXML.NODE_CONTENT_ELEM); 86 87 String doc_content = "";88 try {89 // TODO get the doc content from the index text dir90 } catch (Exception e) {91 logger.info("exception happended getting doc content for " + doc_id);92 doc_content = "dummy content for doc "+doc_id +"\n";93 94 }95 96 154 Text t = this.doc.createTextNode(doc_content); 97 155 content_node.appendChild(t); 98 156 return content_node; 99 157 } 100 101 102 158 } -
trunk/gsdl3/src/java/org/greenstone/gsdl3/service/GS2LuceneSearch.java
r13270 r13576 25 25 import org.w3c.dom.Element; 26 26 import org.w3c.dom.NodeList; 27 27 import org.w3c.dom.Document; 28 28 // java classes 29 29 import java.util.ArrayList; 30 30 import java.util.HashMap; 31 import java.io.File; 32 import java.util.Iterator; 33 import java.util.Set; 34 import java.util.Map; 35 import java.util.Vector; 31 36 32 37 // Logging 33 38 import org.apache.log4j.Logger; 39 40 import org.nzdl.gsdl.LuceneWrap.GS2LuceneQuery; 41 import org.nzdl.gsdl.LuceneWrap.LuceneQueryResult; 34 42 35 43 public class GS2LuceneSearch 36 44 extends AbstractGS2FieldSearch 37 45 { 38 39 // TODO: lucene query object 40 46 protected static final String RANK_PARAM_RANK_VALUE = "rank"; 47 41 48 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.service.GS2LuceneSearch.class.getName()); 42 49 50 private GS2LuceneQuery lucene_src=null; 51 43 52 public GS2LuceneSearch() 44 53 { 45 // TODO: create new query object 46 54 this.lucene_src = new GS2LuceneQuery(); 47 55 // Lucene uses double operators, not single 48 56 AND_OPERATOR = "&&"; 49 57 OR_OPERATOR = "||"; 50 58 51 } 52 59 does_paging = true; 60 does_chunking = true; 61 } 62 53 63 public void cleanUp() { 54 64 super.cleanUp(); 55 // TODO: clean up query object56 } 57 58 65 this.lucene_src.cleanUp(); 66 } 67 68 /** configure this service */ 59 69 public boolean configure(Element info, Element extra_info) 60 70 { … … 68 78 does_stem = false; 69 79 does_accent = false; 70 71 // TODO: configure the query object based on info from config file72 80 73 81 return true; 74 82 } 75 83 76 84 /** add in the lucene specific params to TextQuery */ 77 85 protected void addCustomQueryParams(Element param_list, String lang) … … 80 88 /** lucenes rank param is based on fields, not ranked/not */ 81 89 createParameter(RANK_PARAM, param_list, lang); 82 } 83 90 createParameter(FIELD_ATT, param_list, lang); 91 } 92 84 93 /** create a param and add to the list */ 85 94 /** we override this to do a special rank param */ … … 90 
99 // get the fields 91 100 ArrayList fields = new ArrayList(); 92 fields.add( "rank");101 fields.add(RANK_PARAM_RANK_VALUE); 93 102 ArrayList field_names = new ArrayList(); 94 103 field_names.add(getTextString("param.sortBy.rank", lang)); … … 103 112 } 104 113 } 105 114 106 115 protected void getSortByIndexData(ArrayList index_ids, ArrayList index_names, String lang){ 107 116 // the field list - read from config file … … 116 125 } 117 126 // TODO change field so that name is the id, and full metadata name is somthing else 127 118 128 if (shortname.equals("")) { 119 129 shortname = name; 120 130 } 121 if (shortname.equals("ZZ") ) {122 // ZZ is a fake index 131 if (shortname.equals("ZZ") || shortname.equals("TX")) { 132 // ZZ is a fake index, and we don't sort by TX 123 133 continue; 124 134 } … … 132 142 133 143 /** methods to handle actually doing the query */ 144 134 145 /** do any initialisation of the query object */ 135 146 protected boolean setUpQueryer(HashMap params) { 136 // TODO 147 String indexdir = GSFile.collectionBaseDir(this.site_home, this.cluster_name) + File.separatorChar + "index"+File.separatorChar; 148 149 String index = "didx"; 150 int maxdocs = 100; 151 int hits_per_page = 20; 152 int start_page = 1; 153 // set up the query params 154 Set entries = params.entrySet(); 155 Iterator i = entries.iterator(); 156 while (i.hasNext()) { 157 Map.Entry m = (Map.Entry)i.next(); 158 String name = (String)m.getKey(); 159 String value = (String)m.getValue(); 160 161 if (name.equals(MAXDOCS_PARAM)&& !value.equals("")) { 162 maxdocs = Integer.parseInt(value); 163 } else if (name.equals(HITS_PER_PAGE_PARAM)) { 164 hits_per_page = Integer.parseInt(value); 165 } else if (name.equals(START_PAGE_PARAM)) { 166 start_page = Integer.parseInt(value); 167 168 } else if (name.equals(MATCH_PARAM)) { 169 if (value.equals(MATCH_PARAM_ALL)) { 170 this.lucene_src.setDefaultConjunctionOperator("AND"); 171 } else{ 172 this.lucene_src.setDefaultConjunctionOperator("OR"); 173 } 
174 } else if (name.equals(RANK_PARAM)) { 175 if (value.equals(RANK_PARAM_RANK_VALUE)) { 176 value = null; 177 } 178 this.lucene_src.setSortField(value); 179 } else if (name.equals(LEVEL_PARAM)) { 180 if (value.toUpperCase().equals("SEC")){ 181 index = "sidx"; 182 } 183 else { 184 index = "didx"; 185 } 186 } // ignore any others 187 } 188 // set up start and end results if necessary 189 int start_results = 1; 190 if (start_page != 1) { 191 start_results = ((start_page-1) * hits_per_page) + 1; 192 } 193 int end_results = hits_per_page * start_page; 194 this.lucene_src.setStartResults(start_results); 195 this.lucene_src.setEndResults(end_results); 196 197 this.lucene_src.setIndexDir(indexdir+index); 198 this.lucene_src.initialise(); 137 199 return true; 138 200 } 139 201 /** do the query */ 140 202 protected Object runQuery(String query) { 141 // TODO 203 try { 204 LuceneQueryResult lqr=this.lucene_src.runQuery(query); 205 return lqr; 206 } catch (Exception e) { 207 logger.error ("exception happened in run query: ", e); 208 } 209 142 210 return null; 143 211 } 144 212 /** get the total number of docs that match */ 145 213 protected long numDocsMatched(Object query_result) { 146 // TODO 147 return 0; 214 return ((LuceneQueryResult)query_result).getTotalDocs(); 215 148 216 } 149 217 /** get the list of doc ids */ 150 218 protected String [] getDocIDs(Object query_result) { 151 // TODO 152 return null; 219 Vector docs = ((LuceneQueryResult)query_result).getDocs(); 220 String [] doc_nums = new String [docs.size()]; 221 for (int d = 0; d < docs.size(); d++) { 222 String doc_num = Long.toString(((LuceneQueryResult.DocInfo) docs.elementAt(d)).num_); 223 doc_nums[d] = doc_num; 224 } 225 return doc_nums; 153 226 } 154 227 /** get the list of doc ranks */ 155 228 protected String [] getDocRanks(Object query_result) { 156 // TODO 157 return null; 229 Vector docs = ((LuceneQueryResult)query_result).getDocs(); 230 String [] doc_ranks = new String [docs.size()]; 231 for (int d = 
0; d < docs.size(); d++) { 232 doc_ranks[d] = Float.toString(((LuceneQueryResult.DocInfo) docs.elementAt(d)).rank_); 233 } 234 return doc_ranks; 158 235 } 159 236 /** add in term info if available */ 160 237 protected boolean addTermInfo(Element term_list, HashMap params, 161 238 Object query_result) { 162 // TODO 163 return true; 164 } 165 239 String query_level = (String)params.get(LEVEL_PARAM); // the current query level 240 241 Vector terms = ((LuceneQueryResult)query_result).getTerms(); 242 for (int t = 0; t < terms.size(); t++) { 243 LuceneQueryResult.TermInfo term_info = (LuceneQueryResult.TermInfo) terms.get(t); 244 245 Element term_elem = this.doc.createElement(GSXML.TERM_ELEM); 246 term_elem.setAttribute(GSXML.NAME_ATT, term_info.term_); 247 term_elem.setAttribute(FREQ_ATT, "" + term_info.term_freq_); 248 term_elem.setAttribute(NUM_DOCS_MATCH_ATT, "" + term_info.match_docs_); 249 term_elem.setAttribute(FIELD_ATT, term_info.field_); 250 term_list.appendChild(term_elem); 251 } 252 return true; 253 } 254 166 255 protected String addFieldInfo(String query, String field) { 167 256 if (field.equals("") || field.equals("ZZ")) { … … 170 259 return field+":("+query+")"; 171 260 } 172 261 173 262 protected void addQueryElem(StringBuffer s, String q, String f, String c) { 174 263 175 264 String combine=""; 176 265 if (s.length()>0) { … … 179 268 s.append(combine + addFieldInfo(q,f)); 180 269 } 181 270 182 271 /** Lucene doesn't use these options at the moment */ 183 272 protected String addStemOptions(String query, String stem,
Note: See TracChangeset for help on using the changeset viewer.