Changeset 14483
- Timestamp: 2007-09-06T11:19:44+12:00 (16 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
greenstone3/trunk/src/java/org/greenstone/gsdl3/service/PhindPhraseBrowse.java
r13270 r14483 22 22 23 23 import org.greenstone.mgpp.*; 24 import org.w3c.dom.Document; 25 import org.w3c.dom.Node; 26 import org.w3c.dom.Element; 27 import org.w3c.dom.Text; 24 import org.w3c.dom.Document; 25 import org.w3c.dom.Node; 26 import org.w3c.dom.Element; 27 import org.w3c.dom.Text; 28 28 29 29 import java.util.Vector; … … 33 33 import org.apache.log4j.*; 34 34 35 /** 35 /** 36 36 * PhindServices - the phind phrase browsing service 37 * 37 * 38 38 * @author <a href="mailto:[email protected]">Katherine Don</a> 39 39 * @version $Revision$ 40 40 */ 41 41 public class PhindPhraseBrowse 42 extends ServiceRack { 43 44 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.service.PhindPhraseBrowse.class.getName()); 45 46 // the services on offer 47 private static final String PHIND_SERVICE = "PhindApplet"; 48 49 private MGPPWrapper mgpp_src=null; 50 private String basepath = null; 51 52 private Element applet_description = null; 53 54 public PhindPhraseBrowse() { 55 this.mgpp_src = new MGPPWrapper(); 56 // set up the default params 57 this.mgpp_src.setQueryLevel("Document"); 58 this.mgpp_src.setReturnLevel("Document"); 59 this.mgpp_src.setMaxDocs(5); 60 this.mgpp_src.setStem(false); 61 this.mgpp_src.setCase(true); 62 } 63 64 public void cleanUp() { 65 super.cleanUp(); 66 this.mgpp_src.unloadIndexData(); 67 } 68 69 /** configure the service module 70 * 71 * @param info a DOM Element containing any config info for the service 72 * @return true if configured 73 */ 74 public boolean configure(Element info, Element extra_info) { 75 76 if (!super.configure(info, extra_info)){ 77 return false; 78 } 79 80 logger.info("configuring PhindPhraseBrowse"); 81 82 // set up short_service_info_ - for now just has name and type 83 Element e = this.doc.createElement(GSXML.SERVICE_ELEM); 84 e.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_APPLET); 85 e.setAttribute(GSXML.NAME_ATT, PHIND_SERVICE); 86 this.short_service_info.appendChild(e); 87 88 // set up the static applet 
description 89 90 applet_description = this.doc.createElement(GSXML.SERVICE_ELEM); 91 applet_description.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_APPLET); 92 applet_description.setAttribute(GSXML.NAME_ATT, PHIND_SERVICE); 93 94 // add in the applet info for the phind applet 95 // need to make this dynamic - library names etc 96 // change the applet params - have a single param with the library name 97 // this is left blank at this end, and must be filled in by applet action - if the library name is not needed, this param is left out 98 // phindcgi param now is not complete - library must be prepended to it. 99 String app_info = "<"+GSXML.APPLET_ELEM+" CODEBASE='applet' CODE='org.greenstone.applet.phind.Phind.class' ARCHIVE='phind.jar, xercesImpl.jar, xml-apis.jar' WIDTH='500' HEIGHT='400'><PARAM NAME='library' VALUE=''/> <PARAM NAME='phindcgi' VALUE='?"; 100 app_info += GSParams.ACTION +"=a&"+GSParams.REQUEST_TYPE +"=r&"+GSParams.SERVICE+"="+PHIND_SERVICE+"&"+GSParams.OUTPUT+"=xml&"+GSParams.RESPONSE_ONLY+"=1'/>"; 101 app_info +="<PARAM NAME='collection' VALUE='"; 102 app_info += this.cluster_name; 103 app_info += "'/> <PARAM NAME='classifier' VALUE='1'/> <PARAM NAME='orientation' VALUE='vertical'/> <PARAM NAME='depth' VALUE='2'/> <PARAM NAME='resultorder' VALUE='L,l,E,e,D,d'/> <PARAM NAME='backdrop' VALUE='interfaces/default/images/phindbg1.jpg'/><PARAM NAME='fontsize' VALUE='10'/> <PARAM NAME='blocksize' VALUE='10'/>The Phind java applet.</"+GSXML.APPLET_ELEM+">"; 104 105 Document dom = this.converter.getDOM(app_info); 106 if (dom==null) { 107 logger.error("Couldn't parse applet info"); 108 return false; 109 } 110 Element app_elem = dom.getDocumentElement(); 111 applet_description.appendChild(this.doc.importNode(app_elem, true)); 112 113 return true; 114 } 115 116 protected Element getServiceDescription(String service, String lang, String subset) { 117 if (!service.equals(PHIND_SERVICE)) { 118 return null; 119 } 120 Element describe = (Element) 
applet_description.cloneNode(true); 121 describe.appendChild(GSXML.createDisplayTextElement(this.doc, GSXML.DISPLAY_TEXT_NAME, getTextString(PHIND_SERVICE+".name", lang))); 122 describe.appendChild(GSXML.createDisplayTextElement(this.doc, GSXML.DISPLAY_TEXT_DESCRIPTION, getTextString(PHIND_SERVICE+".description", lang))); 123 return describe; 124 } 125 126 protected Element processPhindApplet(Element request) { 127 128 Element param_elem = (Element)GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER); 129 HashMap params = GSXML.extractParams(param_elem, false); 130 131 long first_e = Long.parseLong((String)params.get("pfe")); 132 long last_e = Long.parseLong((String)params.get("ple")); 133 long first_l = Long.parseLong((String)params.get("pfl")); 134 long last_l = Long.parseLong((String)params.get("pll")); 135 long first_d = Long.parseLong((String)params.get("pfd")); 136 long last_d = Long.parseLong((String)params.get("pld")); 137 138 long phrase; 139 String phrase_str = (String)params.get("ppnum"); 140 if (phrase_str == null || phrase_str.equals("")) { 141 phrase=0; 142 } else { 143 phrase = Long.parseLong(phrase_str); 144 } 145 String word = (String)params.get("pptext"); 146 String phind_index = (String)params.get("pc"); 147 // the location of the mgpp database files 148 this.basepath = GSFile.phindBaseDir(this.site_home, this.cluster_name, phind_index); 149 150 // the result element 151 Element result = this.doc.createElement(GSXML.RESPONSE_ELEM); 152 result.setAttribute(GSXML.FROM_ATT, PHIND_SERVICE); 153 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS); 154 155 // applet result info must be in appletInfo element 156 Element applet_data = this.doc.createElement(GSXML.APPLET_DATA_ELEM); 157 result.appendChild(applet_data); 158 Element phind_data = this.doc.createElement("phindData"); 159 applet_data.appendChild(phind_data); 160 161 162 // if we dont know the phrase number, look it up 163 if (phrase == 0) { 164 if (word==null || 
word.equals("")) { 165 Element error = phindError("no word or phrase"); 166 phind_data.appendChild(error); 167 return result; 168 } 169 phrase = findPhraseNumberFromWord( word); 170 } 171 if (phrase==0) { 172 // the word is not in the collection 173 // return a phind error string 174 Element error = phindError("the term "+word+" is not in the collection"); 175 phind_data.appendChild(error); 176 return result; 177 } 178 179 // get the phrase data into the phind_data node 180 getPhraseData(phind_data, phrase, first_l, last_l, 181 first_e, last_e, first_d, last_d); 182 return result; 183 184 185 }// processPhindApplet 186 187 protected long findPhraseNumberFromWord(String word) { 188 189 // set the mgpp index data - we are looking up pword 190 this.mgpp_src.loadIndexData(this.basepath+File.separatorChar+"pword"); 191 192 this.mgpp_src.runQuery(word); 193 194 MGPPQueryResult res = this.mgpp_src.getQueryResult(); 195 Vector docs = res.getDocs(); 196 if (docs.size()==0) { 197 // phrase not found 198 return 0; 199 } 200 MGPPDocInfo doc = (MGPPDocInfo)docs.firstElement(); 201 return doc.num_; 202 } 203 204 protected boolean getPhraseData(Element phind_data, 205 long phrase, long first_l, long last_l, 206 long first_e, long last_e, long first_d, 207 long last_d) { 208 209 String record = this.mgpp_src.getDocument(this.basepath+File.separatorChar+"pdata", "Document", 210 phrase); 211 if (record.equals("")) { 212 Element error = phindError("somethings gone wrong - we haven't got a record for phrase number "+phrase); 213 phind_data.appendChild(error); 214 return false; 215 } 216 217 // parse the record - its in gordons cryptic form 218 // ":word:tf:ef:df:el:dl:lf:ll" 219 // el: e,e,e 220 // dl: d;f,d;f, 221 // lf and ll may be null 222 // l: type,dest, dest; type,dest,dest 223 224 // ignore everything up to and including first colon (has 225 // <Document>3505: at the start) 226 record = record.substring(record.indexOf(':')+1); 227 228 // split on ':' 229 String [] fields = 
record.split(":"); 230 String word = fields[0]; 231 String tf = fields[1]; 232 String ef = fields[2]; 233 String df = fields[3]; 234 235 236 String expansions = fields[4]; 237 String documents = fields[5]; 238 String lf = "0"; 239 String linklist = ""; 240 if (fields.length > 7) {// have thesaurus stuff 241 lf =fields[6]; 242 linklist = fields[7]; 243 } 244 245 // the phindData attributes and phrase 246 phind_data.setAttribute("id", Long.toString(phrase)); 247 phind_data.setAttribute("df", df); 248 phind_data.setAttribute("ef", ef); 249 phind_data.setAttribute("lf", lf); 250 phind_data.setAttribute("tf", tf); 251 GSXML.createTextElement(this.doc, "phrase", word); 252 253 addExpansionList(phind_data, expansions, word, ef, first_e, last_e); 254 addDocumentList(phind_data, documents, word, df, first_d, last_d); 255 if (!lf.equals("0")) { 256 addThesaurusList(phind_data, linklist, word, lf, first_l, last_l); 257 } 258 return true; 259 } 260 261 protected boolean addExpansionList( Element phind_data, String record, 262 String word, 263 String freq, 264 long first, long last) { 265 266 Element expansion_list = this.doc.createElement("expansionList"); 267 phind_data.appendChild(expansion_list); 268 expansion_list.setAttribute("length", freq); 269 expansion_list.setAttribute("start", Long.toString(first)); 270 expansion_list.setAttribute("end", Long.toString(last)); 271 272 // get the list of strings 273 String [] expansions = record.split(","); 274 int length = expansions.length; 275 if (length < last) last = length; 276 for (long i = first; i < last; i++) { 277 long num = Long.parseLong(expansions[(int)i]); 278 Element expansion = getExpansion( num, word); 279 expansion.setAttribute("num", Long.toString(i)); 280 expansion_list.appendChild(expansion); 281 } 282 return true; 283 } 284 285 protected Element getExpansion(long phrase_num, 286 String orig_phrase) { 287 288 // look up the phrase in the pdata thingy 289 String record = 
this.mgpp_src.getDocument(this.basepath+File.separatorChar+"pdata", "Document", 290 phrase_num); 291 292 if (record ==null || record.equals("")) return null; 293 294 // ignore everything up to and including first colon 295 record = record.substring(record.indexOf(':')+1); 296 297 String [] fields = record.split(":"); 298 String phrase = fields[0]; 299 String tf = fields[1]; 300 //String ef = fields[2]; dont use this 301 String df = fields[3]; 302 303 Element expansion = this.doc.createElement("expansion"); 304 expansion.setAttribute("tf", tf); 305 expansion.setAttribute("df", df); 306 expansion.setAttribute("id", Long.toString(phrase_num)); 307 308 // get teh suffix and prefix 309 String [] ends = splitPhraseOnWord(phrase, orig_phrase); 310 if (!ends[0].equals("")) { 311 expansion.appendChild(GSXML.createTextElement(this.doc, "prefix", ends[0])); 312 } 313 if (!ends[1].equals("")) { 314 expansion.appendChild(GSXML.createTextElement(this.doc, "suffix", ends[1])); 315 } 316 317 return expansion; 318 319 } 320 321 protected boolean addDocumentList(Element phind_data, String record, 322 String word, 323 String freq, 324 long first, long last) { 325 326 Element document_list = this.doc.createElement("documentList"); 327 phind_data.appendChild(document_list); 328 document_list.setAttribute("length", freq); 329 document_list.setAttribute("start", Long.toString(first)); 330 document_list.setAttribute("end", Long.toString(last)); 331 332 // get the list of doc,freq 333 String [] doc_freqs = record.split(";"); 334 int length = doc_freqs.length; 335 if (length<last) last=length; 336 337 for (long i = first; i < last; i++) { 338 String doc_elem = doc_freqs[(int)i]; 339 int p = doc_elem.indexOf(','); 340 long doc_num; 341 String doc_freq; 342 if (p == -1) { // there is no freq in the record 343 doc_num =Long.parseLong(doc_elem); 344 doc_freq = "1"; 345 } else { 346 doc_num = Long.parseLong(doc_elem.substring(0,p)); 347 doc_freq = doc_elem.substring(p+1); 348 } 349 Element 
document = getDocument( doc_num); 350 document.setAttribute("freq", doc_freq); 351 document.setAttribute("num", Long.toString(i)); 352 document_list.appendChild(document); 353 } 354 355 356 return true; 357 } 358 359 360 protected Element getDocument(long doc_num) { 361 362 // look up the phrase in the docs thingy 363 String record = this.mgpp_src.getDocument(this.basepath+File.separatorChar+"docs", "Document", 364 doc_num); 365 366 if (record ==null || record.equals("")) return null; 367 368 // ignore everything up to and including first \t 369 record = record.substring(record.indexOf('\t')+1); 370 371 String [] fields = record.split("\t"); 372 String hash = fields[0]; 373 String title = fields[1]; 374 375 Element d = this.doc.createElement("document"); 376 d.setAttribute("hash", hash); 377 d.appendChild(GSXML.createTextElement(this.doc, "title", title)); 378 379 return d; 380 381 } 382 protected boolean addThesaurusList(Element phind_data, String record, 383 String word, 384 String freq, 385 long first, long last) { 386 387 388 Element thesaurus_list = this.doc.createElement("thesaurusList"); 389 phind_data.appendChild(thesaurus_list); 390 thesaurus_list.setAttribute("length", freq); 391 thesaurus_list.setAttribute("start", Long.toString(first)); 392 thesaurus_list.setAttribute("end", Long.toString(last)); 393 394 // get the list of type,dest,dest 395 String [] links = record.split(";"); 396 int length = links.length; 397 long index = 0; 398 for (int i = 0; i < length; i++) { // go through the entries 399 String link_info = links[(int)i]; 400 String [] items = link_info.split(","); 401 // the first entry is teh type 402 String type = items[0]; 403 for (int j = 1; j<items.length; j++, index++) { 404 if (index >= first && index < last) { // only output the ones we want 405 long phrase = Long.parseLong(items[j]); 406 Element t = getThesaurus(phrase); 407 t.setAttribute("type", type); 408 thesaurus_list.appendChild(t); 409 } 410 } 411 } 412 413 return true; 414 } 415 
416 protected Element getThesaurus(long phrase_num) { 417 418 // look up the phrase in the pdata thingy 419 String record = this.mgpp_src.getDocument(this.basepath+File.separatorChar+"pdata", "Document", 420 phrase_num); 421 422 if (record ==null || record.equals("")) return null; 423 424 // ignore everything up to and including first colon 425 record = record.substring(record.indexOf(':')+1); 426 427 String [] fields = record.split(":"); 428 String phrase = fields[0]; 429 String tf = fields[1]; 430 //String ef = fields[2]; dont use this 431 String df = fields[3]; 432 433 Element thesaurus = this.doc.createElement("thesaurus"); 434 thesaurus.setAttribute("tf", tf); 435 thesaurus.setAttribute("df", df); 436 thesaurus.setAttribute("id", Long.toString(phrase_num)); 437 thesaurus.appendChild(GSXML.createTextElement(this.doc, "phrase", phrase)); 438 return thesaurus; 439 440 } 441 442 /** returns an array of two elements - the prefix and the suffix*/ 443 protected String [] splitPhraseOnWord(String phrase, String word) { 444 445 if (word.equals("")) { 446 447 String [] res = {phrase, ""}; 448 return res; 449 } 450 // use 2 so that we only split on the first occurrance. 
trailing empty strings should be included 451 String [] result = phrase.split(word, 2); 452 return result; 453 454 } 455 456 protected Element phindError(String message) { 457 Element e = this.doc.createElement("phindError"); 458 Text t = this.doc.createTextNode(message); 459 e.appendChild(t); 460 return e; 461 } 462 42 extends ServiceRack { 43 44 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.service.PhindPhraseBrowse.class.getName()); 45 46 // the services on offer 47 private static final String PHIND_SERVICE = "PhindApplet"; 48 49 private static MGPPRetrieveWrapper mgpp_retrieve_src=null; 50 private static MGPPSearchWrapper mgpp_search_src=null; 51 private String basepath = null; 52 53 private Element applet_description = null; 54 55 public PhindPhraseBrowse() { 56 if(this.mgpp_retrieve_src == null) { 57 this.mgpp_retrieve_src = new MGPPRetrieveWrapper(); 58 } 59 if(this.mgpp_search_src == null) { 60 this.mgpp_search_src = new MGPPSearchWrapper(); 61 } 62 // set up the default params 63 this.mgpp_search_src.setQueryLevel("Document"); 64 this.mgpp_search_src.setReturnLevel("Document"); 65 this.mgpp_search_src.setMaxDocs(5); 66 this.mgpp_search_src.setStem(false); 67 this.mgpp_search_src.setCase(true); 68 } 69 70 public void cleanUp() { 71 super.cleanUp(); 72 this.mgpp_retrieve_src.unloadIndexData(); 73 this.mgpp_search_src.unloadIndexData(); 74 } 75 76 /** configure the service module 77 * 78 * @param info a DOM Element containing any config info for the service 79 * @return true if configured 80 */ 81 public boolean configure(Element info, Element extra_info) { 82 83 if (!super.configure(info, extra_info)){ 84 return false; 85 } 86 87 logger.info("configuring PhindPhraseBrowse"); 88 89 // set up short_service_info_ - for now just has name and type 90 Element e = this.doc.createElement(GSXML.SERVICE_ELEM); 91 e.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_APPLET); 92 e.setAttribute(GSXML.NAME_ATT, PHIND_SERVICE); 93 
this.short_service_info.appendChild(e); 94 95 // set up the static applet description 96 97 applet_description = this.doc.createElement(GSXML.SERVICE_ELEM); 98 applet_description.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_APPLET); 99 applet_description.setAttribute(GSXML.NAME_ATT, PHIND_SERVICE); 100 101 // add in the applet info for the phind applet 102 // need to make this dynamic - library names etc 103 // change the applet params - have a single param with the library name 104 // this is left blank at this end, and must be filled in by applet action - if the library name is not needed, this param is left out 105 // phindcgi param now is not complete - library must be prepended to it. 106 String app_info = "<"+GSXML.APPLET_ELEM+" CODEBASE='applet' CODE='org.greenstone.applet.phind.Phind.class' ARCHIVE='phind.jar, xercesImpl.jar, xml-apis.jar' WIDTH='500' HEIGHT='400'><PARAM NAME='library' VALUE=''/> <PARAM NAME='phindcgi' VALUE='?"; 107 app_info += GSParams.ACTION +"=a&"+GSParams.REQUEST_TYPE +"=r&"+GSParams.SERVICE+"="+PHIND_SERVICE+"&"+GSParams.OUTPUT+"=xml&"+GSParams.RESPONSE_ONLY+"=1'/>"; 108 app_info +="<PARAM NAME='collection' VALUE='"; 109 app_info += this.cluster_name; 110 app_info += "'/> <PARAM NAME='classifier' VALUE='1'/> <PARAM NAME='orientation' VALUE='vertical'/> <PARAM NAME='depth' VALUE='2'/> <PARAM NAME='resultorder' VALUE='L,l,E,e,D,d'/> <PARAM NAME='backdrop' VALUE='interfaces/default/images/phindbg1.jpg'/><PARAM NAME='fontsize' VALUE='10'/> <PARAM NAME='blocksize' VALUE='10'/>The Phind java applet.</"+GSXML.APPLET_ELEM+">"; 111 112 Document dom = this.converter.getDOM(app_info); 113 if (dom==null) { 114 logger.error("Couldn't parse applet info"); 115 return false; 116 } 117 Element app_elem = dom.getDocumentElement(); 118 applet_description.appendChild(this.doc.importNode(app_elem, true)); 119 120 return true; 121 } 122 123 protected Element getServiceDescription(String service, String lang, String subset) { 124 if 
(!service.equals(PHIND_SERVICE)) { 125 return null; 126 } 127 Element describe = (Element) applet_description.cloneNode(true); 128 describe.appendChild(GSXML.createDisplayTextElement(this.doc, GSXML.DISPLAY_TEXT_NAME, getTextString(PHIND_SERVICE+".name", lang))); 129 describe.appendChild(GSXML.createDisplayTextElement(this.doc, GSXML.DISPLAY_TEXT_DESCRIPTION, getTextString(PHIND_SERVICE+".description", lang))); 130 return describe; 131 } 132 133 protected Element processPhindApplet(Element request) { 134 135 Element param_elem = (Element)GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER); 136 HashMap params = GSXML.extractParams(param_elem, false); 137 138 long first_e = Long.parseLong((String)params.get("pfe")); 139 long last_e = Long.parseLong((String)params.get("ple")); 140 long first_l = Long.parseLong((String)params.get("pfl")); 141 long last_l = Long.parseLong((String)params.get("pll")); 142 long first_d = Long.parseLong((String)params.get("pfd")); 143 long last_d = Long.parseLong((String)params.get("pld")); 144 145 long phrase; 146 String phrase_str = (String)params.get("ppnum"); 147 if (phrase_str == null || phrase_str.equals("")) { 148 phrase=0; 149 } else { 150 phrase = Long.parseLong(phrase_str); 151 } 152 String word = (String)params.get("pptext"); 153 String phind_index = (String)params.get("pc"); 154 // the location of the mgpp database files 155 this.basepath = GSFile.phindBaseDir(this.site_home, this.cluster_name, phind_index); 156 157 // the result element 158 Element result = this.doc.createElement(GSXML.RESPONSE_ELEM); 159 result.setAttribute(GSXML.FROM_ATT, PHIND_SERVICE); 160 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS); 161 162 // applet result info must be in appletInfo element 163 Element applet_data = this.doc.createElement(GSXML.APPLET_DATA_ELEM); 164 result.appendChild(applet_data); 165 Element phind_data = this.doc.createElement("phindData"); 166 applet_data.appendChild(phind_data); 167 168 169 // 
if we dont know the phrase number, look it up 170 if (phrase == 0) { 171 if (word==null || word.equals("")) { 172 Element error = phindError("no word or phrase"); 173 phind_data.appendChild(error); 174 return result; 175 } 176 phrase = findPhraseNumberFromWord( word); 177 } 178 if (phrase==0) { 179 // the word is not in the collection 180 // return a phind error string 181 Element error = phindError("the term "+word+" is not in the collection"); 182 phind_data.appendChild(error); 183 return result; 184 } 185 186 // get the phrase data into the phind_data node 187 getPhraseData(phind_data, phrase, first_l, last_l, 188 first_e, last_e, first_d, last_d); 189 return result; 190 191 192 }// processPhindApplet 193 194 protected long findPhraseNumberFromWord(String word) { 195 synchronized (mgpp_search_src) { 196 // set the mgpp index data - we are looking up pword 197 mgpp_search_src.loadIndexData(this.basepath+File.separatorChar+"pword"); 198 199 mgpp_search_src.runQuery(word); 200 201 MGPPQueryResult res = mgpp_search_src.getQueryResult(); 202 Vector docs = res.getDocs(); 203 if (docs.size()==0) { 204 // phrase not found 205 return 0; 206 } 207 MGPPDocInfo doc = (MGPPDocInfo)docs.firstElement(); 208 return doc.num_; 209 } 210 } 211 212 protected boolean getPhraseData(Element phind_data, 213 long phrase, long first_l, long last_l, 214 long first_e, long last_e, long first_d, 215 long last_d) { 216 217 synchronized (mgpp_retrieve_src) { 218 String record = this.mgpp_retrieve_src.getDocument(this.basepath+File.separatorChar+"pdata", "Document", 219 phrase); 220 if (record.equals("")) { 221 Element error = phindError("somethings gone wrong - we haven't got a record for phrase number "+phrase); 222 phind_data.appendChild(error); 223 return false; 224 } 225 226 // parse the record - its in gordons cryptic form 227 // ":word:tf:ef:df:el:dl:lf:ll" 228 // el: e,e,e 229 // dl: d;f,d;f, 230 // lf and ll may be null 231 // l: type,dest, dest; type,dest,dest 232 233 // ignore 
everything up to and including first colon (has 234 // <Document>3505: at the start) 235 record = record.substring(record.indexOf(':')+1); 236 237 // split on ':' 238 String [] fields = record.split(":"); 239 String word = fields[0]; 240 String tf = fields[1]; 241 String ef = fields[2]; 242 String df = fields[3]; 243 244 245 String expansions = fields[4]; 246 String documents = fields[5]; 247 String lf = "0"; 248 String linklist = ""; 249 if (fields.length > 7) {// have thesaurus stuff 250 lf =fields[6]; 251 linklist = fields[7]; 252 } 253 254 // the phindData attributes and phrase 255 phind_data.setAttribute("id", Long.toString(phrase)); 256 phind_data.setAttribute("df", df); 257 phind_data.setAttribute("ef", ef); 258 phind_data.setAttribute("lf", lf); 259 phind_data.setAttribute("tf", tf); 260 GSXML.createTextElement(this.doc, "phrase", word); 261 262 addExpansionList(phind_data, expansions, word, ef, first_e, last_e); 263 addDocumentList(phind_data, documents, word, df, first_d, last_d); 264 if (!lf.equals("0")) { 265 addThesaurusList(phind_data, linklist, word, lf, first_l, last_l); 266 } 267 return true; 268 } 269 } 270 271 protected boolean addExpansionList( Element phind_data, String record, 272 String word, 273 String freq, 274 long first, long last) { 275 276 Element expansion_list = this.doc.createElement("expansionList"); 277 phind_data.appendChild(expansion_list); 278 expansion_list.setAttribute("length", freq); 279 expansion_list.setAttribute("start", Long.toString(first)); 280 expansion_list.setAttribute("end", Long.toString(last)); 281 282 // get the list of strings 283 String [] expansions = record.split(","); 284 int length = expansions.length; 285 if (length < last) last = length; 286 for (long i = first; i < last; i++) { 287 long num = Long.parseLong(expansions[(int)i]); 288 Element expansion = getExpansion( num, word); 289 expansion.setAttribute("num", Long.toString(i)); 290 expansion_list.appendChild(expansion); 291 } 292 return true; 293 } 294 
295 protected Element getExpansion(long phrase_num, 296 String orig_phrase) { 297 298 // look up the phrase in the pdata thingy 299 String record = this.mgpp_retrieve_src.getDocument(this.basepath+File.separatorChar+"pdata", "Document", 300 phrase_num); 301 302 if (record ==null || record.equals("")) return null; 303 304 // ignore everything up to and including first colon 305 record = record.substring(record.indexOf(':')+1); 306 307 String [] fields = record.split(":"); 308 String phrase = fields[0]; 309 String tf = fields[1]; 310 //String ef = fields[2]; dont use this 311 String df = fields[3]; 312 313 Element expansion = this.doc.createElement("expansion"); 314 expansion.setAttribute("tf", tf); 315 expansion.setAttribute("df", df); 316 expansion.setAttribute("id", Long.toString(phrase_num)); 317 318 // get teh suffix and prefix 319 String [] ends = splitPhraseOnWord(phrase, orig_phrase); 320 if (!ends[0].equals("")) { 321 expansion.appendChild(GSXML.createTextElement(this.doc, "prefix", ends[0])); 322 } 323 if (!ends[1].equals("")) { 324 expansion.appendChild(GSXML.createTextElement(this.doc, "suffix", ends[1])); 325 } 326 327 return expansion; 328 329 } 330 331 protected boolean addDocumentList(Element phind_data, String record, 332 String word, 333 String freq, 334 long first, long last) { 335 336 Element document_list = this.doc.createElement("documentList"); 337 phind_data.appendChild(document_list); 338 document_list.setAttribute("length", freq); 339 document_list.setAttribute("start", Long.toString(first)); 340 document_list.setAttribute("end", Long.toString(last)); 341 342 // get the list of doc,freq 343 String [] doc_freqs = record.split(";"); 344 int length = doc_freqs.length; 345 if (length<last) last=length; 346 347 for (long i = first; i < last; i++) { 348 String doc_elem = doc_freqs[(int)i]; 349 int p = doc_elem.indexOf(','); 350 long doc_num; 351 String doc_freq; 352 if (p == -1) { // there is no freq in the record 353 doc_num 
=Long.parseLong(doc_elem); 354 doc_freq = "1"; 355 } else { 356 doc_num = Long.parseLong(doc_elem.substring(0,p)); 357 doc_freq = doc_elem.substring(p+1); 358 } 359 Element document = getDocument( doc_num); 360 document.setAttribute("freq", doc_freq); 361 document.setAttribute("num", Long.toString(i)); 362 document_list.appendChild(document); 363 } 364 365 366 return true; 367 } 368 369 370 protected Element getDocument(long doc_num) { 371 372 // look up the phrase in the docs thingy 373 String record = this.mgpp_retrieve_src.getDocument(this.basepath+File.separatorChar+"docs", "Document", 374 doc_num); 375 376 if (record ==null || record.equals("")) return null; 377 378 // ignore everything up to and including first \t 379 record = record.substring(record.indexOf('\t')+1); 380 381 String [] fields = record.split("\t"); 382 String hash = fields[0]; 383 String title = fields[1]; 384 385 Element d = this.doc.createElement("document"); 386 d.setAttribute("hash", hash); 387 d.appendChild(GSXML.createTextElement(this.doc, "title", title)); 388 389 return d; 390 391 } 392 protected boolean addThesaurusList(Element phind_data, String record, 393 String word, 394 String freq, 395 long first, long last) { 396 397 398 Element thesaurus_list = this.doc.createElement("thesaurusList"); 399 phind_data.appendChild(thesaurus_list); 400 thesaurus_list.setAttribute("length", freq); 401 thesaurus_list.setAttribute("start", Long.toString(first)); 402 thesaurus_list.setAttribute("end", Long.toString(last)); 403 404 // get the list of type,dest,dest 405 String [] links = record.split(";"); 406 int length = links.length; 407 long index = 0; 408 for (int i = 0; i < length; i++) { // go through the entries 409 String link_info = links[(int)i]; 410 String [] items = link_info.split(","); 411 // the first entry is teh type 412 String type = items[0]; 413 for (int j = 1; j<items.length; j++, index++) { 414 if (index >= first && index < last) { // only output the ones we want 415 long phrase = 
Long.parseLong(items[j]); 416 Element t = getThesaurus(phrase); 417 t.setAttribute("type", type); 418 thesaurus_list.appendChild(t); 419 } 420 } 421 } 422 423 return true; 424 } 425 426 protected Element getThesaurus(long phrase_num) { 427 428 // look up the phrase in the pdata thingy 429 String record = this.mgpp_retrieve_src.getDocument(this.basepath+File.separatorChar+"pdata", "Document", 430 phrase_num); 431 432 if (record ==null || record.equals("")) return null; 433 434 // ignore everything up to and including first colon 435 record = record.substring(record.indexOf(':')+1); 436 437 String [] fields = record.split(":"); 438 String phrase = fields[0]; 439 String tf = fields[1]; 440 //String ef = fields[2]; dont use this 441 String df = fields[3]; 442 443 Element thesaurus = this.doc.createElement("thesaurus"); 444 thesaurus.setAttribute("tf", tf); 445 thesaurus.setAttribute("df", df); 446 thesaurus.setAttribute("id", Long.toString(phrase_num)); 447 thesaurus.appendChild(GSXML.createTextElement(this.doc, "phrase", phrase)); 448 return thesaurus; 449 450 } 451 452 /** returns an array of two elements - the prefix and the suffix*/ 453 protected String [] splitPhraseOnWord(String phrase, String word) { 454 455 if (word.equals("")) { 456 457 String [] res = {phrase, ""}; 458 return res; 459 } 460 // use 2 so that we only split on the first occurrance. trailing empty strings should be included 461 String [] result = phrase.split(word, 2); 462 return result; 463 464 } 465 466 protected Element phindError(String message) { 467 Element e = this.doc.createElement("phindError"); 468 Text t = this.doc.createTextNode(message); 469 e.appendChild(t); 470 return e; 471 } 472 463 473 } 464 474
Note:
See TracChangeset
for help on using the changeset viewer.