- Timestamp: 2002-08-23T10:10:18+12:00 (22 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl3/src/java/org/greenstone/gsdl3/service/PhindService.java
r3377 r3389 19 19 package org.greenstone.gsdl3.service; 20 20 21 import org.greenstone.gsdl3.util.*; 22 23 import org.greenstone.mgpp.*; 21 24 import org.w3c.dom.Document; 22 25 import org.w3c.dom.Node; … … 24 27 import org.w3c.dom.Text; 25 28 29 import java.util.Vector; 30 import java.util.HashMap; 26 31 /** 27 32 * PhindService - the phind phrase browsing service … … 32 37 public class PhindService 33 38 extends ServiceModule { 34 35 39 36 protected Element processService(String name, Element request) { 37 38 if (!name.equals("PhindApplet")) { 39 System.err.println("PhindService:you have asked for a non-existant service - "+name+"!"); 40 return null; 41 } 42 // create dummy response 43 Element res = doc_.createElement("response"); 44 res.setAttribute("from", "PhindApplet"); 45 Element data = doc_.createElement("content"); 46 Text t = doc_.createTextNode("this is the results for a phind request"); 47 data.appendChild(t); 48 49 res.appendChild(data); 50 51 return res; 52 40 private MGPPWrapper mgpp_src_=null; 41 private String basepath_ = null; 42 public PhindService() { 43 mgpp_src_ = new MGPPWrapper(); 44 // set up the default params 45 mgpp_src_.setQueryLevel("Document"); 46 mgpp_src_.setReturnLevel("Document"); 47 mgpp_src_.setMaxDocs(5); 48 mgpp_src_.setStem(false); 49 mgpp_src_.setCase(true); 53 50 } 54 51 /** configure the service module … … 73 70 f.setAttribute("name", "PhindApplet"); 74 71 75 // add in teh applet info for the phind applet 76 String app_info = "<applet CODEBASE='/gsdl3/lib/java' CODE='org.greenstone.applet.phind.Phind.class' ARCHIVE='phind.jar' WIDTH='500' HEIGHT='400'><PARAM NAME='library' VALUE='/gsdl3/library'/> <PARAM NAME='phindcgi' VALUE='/gsdl3/library?a=b&sa=phind'/> <PARAM NAME='collection' VALUE='mgppdemo'/> <PARAM NAME='classifier' VALUE='1'/> <PARAM NAME='orientation' VALUE='vertical'/> <PARAM NAME='depth' VALUE='2'/> <PARAM NAME='resultorder' VALUE='L,l,E,e,D,d'/> <PARAM NAME='backdrop' 
VALUE='/gsdl3/interfaces/default/images/phindbg1.jpg'/><PARAM NAME='fontsize' VALUE='10'/> <PARAM NAME='blocksize' VALUE='10'/>The Phind java applet.</applet>"; 72 // add in the applet info for the phind applet 73 // need to make this dynamic - library names etc 74 String app_info = "<applet CODEBASE='/gsdl3/lib/java' CODE='org.greenstone.applet.phind.Phind.class' ARCHIVE='phind.jar' WIDTH='500' HEIGHT='400'><PARAM NAME='library' VALUE='/gsdl3/library'/> <PARAM NAME='phindcgi' VALUE='/gsdl3/library?a=a&sa=r&sn=Phind'/> <PARAM NAME='collection' VALUE='mgppdemo'/> <PARAM NAME='classifier' VALUE='1'/> <PARAM NAME='orientation' VALUE='vertical'/> <PARAM NAME='depth' VALUE='2'/> <PARAM NAME='resultorder' VALUE='L,l,E,e,D,d'/> <PARAM NAME='backdrop' VALUE='/gsdl3/interfaces/default/images/phindbg1.jpg'/><PARAM NAME='fontsize' VALUE='10'/> <PARAM NAME='blocksize' VALUE='10'/>The Phind java applet.</applet>"; 77 75 78 76 Document dom = converter_.getDOM(app_info); … … 85 83 } 86 84 85 protected Element processService(String name, Element request) { 86 87 if (!name.equals("PhindApplet")) { 88 System.err.println("PhindService:you have asked for a non-existant service - "+name+"!"); 89 return null; 90 } 91 Element param_elem = (Element)GSXML.getChildByTagName(request, "paramList"); 92 HashMap params = GSXML.extractParams(param_elem); 93 94 long first_e = Long.parseLong((String)params.get("pfe")); 95 long last_e = Long.parseLong((String)params.get("ple")); 96 long first_l = Long.parseLong((String)params.get("pfl")); 97 long last_l = Long.parseLong((String)params.get("pll")); 98 long first_d = Long.parseLong((String)params.get("pfd")); 99 long last_d = Long.parseLong((String)params.get("pld")); 100 101 long phrase; 102 String phrase_str = (String)params.get("ppnum"); 103 if (phrase_str == null || phrase_str.equals("")) { 104 phrase=0; 105 } else { 106 phrase = Long.parseLong(phrase_str); 107 } 108 String word = (String)params.get("pptext"); 109 String phind_index = 
(String)params.get("pc"); 110 // the location of the mgpp database files 111 basepath_ = GSFile.phindBasePath(site_home_, collection_name_, phind_index); 112 113 // the result element 114 Element result = doc_.createElement("response"); 115 String from = GSPath.appendLink(collection_name_, "PhindApplet"); 116 result.setAttribute("from", from); 117 result.setAttribute("type", "query"); 118 119 // applet result info must be in appletInfo element 120 Element applet_data = doc_.createElement("appletData"); 121 result.appendChild(applet_data); 122 Element phind_data = doc_.createElement("phindData"); 123 applet_data.appendChild(phind_data); 124 125 126 // if we dont know the phrase number, look it up 127 if (phrase == 0) { 128 if (word==null || word.equals("")) { 129 Element error = phindError("no word or phrase"); 130 phind_data.appendChild(error); 131 return result; 132 } 133 phrase = findPhraseNumberFromWord( word); 134 System.out.println("phind, term number for "+word+" is "+phrase); 135 } 136 if (phrase==0) { 137 // the word is not in the collection 138 // return a phind error string 139 Element error = phindError("the term "+word+" is not in the collection"); 140 phind_data.appendChild(error); 141 return result; 142 } 143 144 // get the phrase data into the phind_data node 145 getPhraseData(phind_data, phrase, first_l, last_l, 146 first_e, last_e, first_d, last_d); 147 return result; 148 149 150 }// processService 151 152 protected long findPhraseNumberFromWord(String word) { 153 154 // set the mgpp index data - we are looking up pword 155 mgpp_src_.loadIndexData(basepath_, "pword"); 156 157 mgpp_src_.runQuery(word); 158 159 MGPPQueryResult res = mgpp_src_.getQueryResult(); 160 Vector docs = res.getDocs(); 161 if (docs.size()==0) { 162 // phrase not found 163 return 0; 164 } 165 MGPPDocInfo doc = (MGPPDocInfo)docs.firstElement(); 166 return doc.num_; 167 } 168 169 protected boolean getPhraseData(Element phind_data, 170 long phrase, long first_l, long last_l, 171 
long first_e, long last_e, long first_d, 172 long last_d) { 173 174 String record = mgpp_src_.getDocument(basepath_, "pdata", "Document", 175 phrase); 176 if (record.equals("")) { 177 Element error = phindError("somethings gone wrong - we haven't got a record for phrase number "+phrase); 178 phind_data.appendChild(error); 179 return false; 180 } 181 182 System.out.println("record="+record); 183 // parse the record - its in gordons cryptic form 184 // ":word:tf:ef:df:el:dl:lf:ll" 185 // el: e,e,e 186 // dl: d;f,d;f, 187 // lf and ll may be null 188 // l: type,dest, dest; type,dest,dest 189 190 // ignore everything up to and including first colon (has 191 // <Document>3505: at the start) 192 record = record.substring(record.indexOf(':')+1); 193 194 // split on ':' 195 String [] fields = record.split(":"); 196 String word = fields[0]; 197 String tf = fields[1]; 198 String ef = fields[2]; 199 String df = fields[3]; 200 201 202 String expansions = fields[4]; 203 String documents = fields[5]; 204 String lf = "0"; 205 String linklist = ""; 206 if (fields.length > 7) {// have thesaurus stuff 207 lf =fields[6]; 208 linklist = fields[7]; 209 } 210 211 // the phindData attributes and phrase 212 phind_data.setAttribute("id", Long.toString(phrase)); 213 phind_data.setAttribute("df", df); 214 phind_data.setAttribute("ef", ef); 215 phind_data.setAttribute("lf", lf); 216 phind_data.setAttribute("tf", tf); 217 GSXML.createTextElement(doc_, "phrase", word); 218 219 addExpansionList(phind_data, expansions, word, ef, first_e, last_e); 220 addDocumentList(phind_data, documents, word, df, first_d, last_d); 221 if (!lf.equals("0")) { 222 System.out.println("adding thesaurus stuff"); 223 addThesaurusList(phind_data, linklist, word, lf, first_l, last_l); 224 } 225 return true; 226 } 227 228 protected boolean addExpansionList( Element phind_data, String record, 229 String word, 230 String freq, 231 long first, long last) { 232 233 Element expansion_list = 
doc_.createElement("expansionList"); 234 phind_data.appendChild(expansion_list); 235 expansion_list.setAttribute("length", freq); 236 expansion_list.setAttribute("start", Long.toString(first)); 237 expansion_list.setAttribute("end", Long.toString(last)); 238 239 // get the list of strings 240 String [] expansions = record.split(","); 241 int length = expansions.length; 242 if (length < last) last = length; 243 for (long i = first; i < last; i++) { 244 long num = Long.parseLong(expansions[(int)i]); 245 Element expansion = getExpansion( num, word); 246 expansion.setAttribute("num", Long.toString(i)); 247 expansion_list.appendChild(expansion); 248 } 249 return true; 250 } 251 252 protected Element getExpansion(long phrase_num, 253 String orig_phrase) { 254 255 // look up the phrase in the pdata thingy 256 String record = mgpp_src_.getDocument(basepath_, "pdata", "Document", 257 phrase_num); 258 259 if (record ==null || record.equals("")) return null; 260 261 // ignore everything up to and including first colon 262 record = record.substring(record.indexOf(':')+1); 263 264 String [] fields = record.split(":"); 265 String phrase = fields[0]; 266 String tf = fields[1]; 267 //String ef = fields[2]; dont use this 268 String df = fields[3]; 269 270 Element expansion = doc_.createElement("expansion"); 271 expansion.setAttribute("tf", tf); 272 expansion.setAttribute("df", df); 273 expansion.setAttribute("id", Long.toString(phrase_num)); 274 275 // get teh suffix and prefix 276 String [] ends = splitPhraseOnWord(phrase, orig_phrase); 277 if (!ends[0].equals("")) { 278 expansion.appendChild(GSXML.createTextElement(doc_, "prefix", ends[0])); 279 } 280 if (!ends[1].equals("")) { 281 expansion.appendChild(GSXML.createTextElement(doc_, "suffix", ends[1])); 282 } 283 284 return expansion; 285 286 } 287 288 protected boolean addDocumentList(Element phind_data, String record, 289 String word, 290 String freq, 291 long first, long last) { 292 293 Element document_list = 
doc_.createElement("documentList"); 294 phind_data.appendChild(document_list); 295 document_list.setAttribute("length", freq); 296 document_list.setAttribute("start", Long.toString(first)); 297 document_list.setAttribute("end", Long.toString(last)); 298 299 // get the list of doc,freq 300 String [] doc_freqs = record.split(";"); 301 int length = doc_freqs.length; 302 if (length<last) last=length; 303 304 for (long i = first; i < last; i++) { 305 String doc_elem = doc_freqs[(int)i]; 306 int p = doc_elem.indexOf(','); 307 long doc_num; 308 String doc_freq; 309 if (p == -1) { // there is no freq in the record 310 doc_num =Long.parseLong(doc_elem); 311 doc_freq = "1"; 312 } else { 313 doc_num = Long.parseLong(doc_elem.substring(0,p)); 314 doc_freq = doc_elem.substring(p+1); 315 } 316 Element document = getDocument( doc_num); 317 document.setAttribute("freq", doc_freq); 318 document.setAttribute("num", Long.toString(i)); 319 document_list.appendChild(document); 320 } 321 322 323 return true; 324 } 325 326 327 protected Element getDocument(long doc_num) { 328 329 // look up the phrase in the docs thingy 330 String record = mgpp_src_.getDocument(basepath_, "docs", "Document", 331 doc_num); 332 333 if (record ==null || record.equals("")) return null; 334 System.out.println("doc record:"+record); 335 336 // ignore everything up to and including first \t 337 record = record.substring(record.indexOf('\t')+1); 338 339 String [] fields = record.split("\t"); 340 String hash = fields[0]; 341 String title = fields[1]; 342 343 Element d = doc_.createElement("document"); 344 d.setAttribute("hash", hash); 345 d.appendChild(GSXML.createTextElement(doc_, "title", title)); 346 347 return d; 348 349 } 350 protected boolean addThesaurusList(Element phind_data, String record, 351 String word, 352 String freq, 353 long first, long last) { 354 355 356 Element thesaurus_list = doc_.createElement("thesaurusList"); 357 phind_data.appendChild(thesaurus_list); 358 
thesaurus_list.setAttribute("length", freq); 359 thesaurus_list.setAttribute("start", Long.toString(first)); 360 thesaurus_list.setAttribute("end", Long.toString(last)); 361 362 System.out.println("record for thesaurus="+record); 363 364 // get the list of type,dest,dest 365 String [] links = record.split(";"); 366 int length = links.length; 367 long index = 0; 368 for (int i = 0; i < length; i++) { // go through the entries 369 String link_info = links[(int)i]; 370 String [] items = link_info.split(","); 371 // the first entry is teh type 372 String type = items[0]; 373 for (int j = 1; j<items.length; j++, index++) { 374 if (index >= first && index < last) { // only output the ones we want 375 long phrase = Long.parseLong(items[j]); 376 Element t = getThesaurus(phrase); 377 t.setAttribute("type", type); 378 thesaurus_list.appendChild(t); 379 } 380 } 381 } 382 383 return true; 384 } 385 386 protected Element getThesaurus(long phrase_num) { 387 388 // look up the phrase in the pdata thingy 389 String record = mgpp_src_.getDocument(basepath_, "pdata", "Document", 390 phrase_num); 391 392 if (record ==null || record.equals("")) return null; 393 394 // ignore everything up to and including first colon 395 record = record.substring(record.indexOf(':')+1); 396 397 String [] fields = record.split(":"); 398 String phrase = fields[0]; 399 String tf = fields[1]; 400 //String ef = fields[2]; dont use this 401 String df = fields[3]; 402 403 Element thesaurus = doc_.createElement("thesaurus"); 404 thesaurus.setAttribute("tf", tf); 405 thesaurus.setAttribute("df", df); 406 thesaurus.setAttribute("id", Long.toString(phrase_num)); 407 thesaurus.appendChild(GSXML.createTextElement(doc_, "phrase", phrase)); 408 return thesaurus; 409 410 } 411 412 /** returns an array of two elements - the prefix and the suffix*/ 413 protected String [] splitPhraseOnWord(String phrase, String word) { 414 415 if (word.equals("")) { 416 417 String [] res = {phrase, ""}; 418 return res; 419 } 420 // use 
2 so that we only split on the first occurrance. trailing empty strings should be included 421 String [] result = phrase.split(word, 2); 422 if (result.length !=2) { 423 System.out.println("didn't get two substrings!!"); 424 } 425 return result; 426 427 } 428 429 protected Element phindError(String message) { 430 Element e = doc_.createElement("phindError"); 431 Text t = doc_.createTextNode(message); 432 e.appendChild(t); 433 return e; 434 } 87 435 88 436 } 437 438 439 /* 440 // CREATE dummy response 441 Element res = doc_.createElement("response"); 442 res.setAttribute("from", "PhindApplet"); 443 Element data = doc_.createElement("service"); 444 Element app_data = doc_.createElement("appletData"); 445 data.appendChild(app_data); 446 String phind_info ="<phindData id='2507' tf='19424' ef='1632' df='1843' lf='0'><phrase>FOREST</phrase><expansionList length='1632' start='0' end='10'><expansion num='0' id='177648' tf='2162' df='519'><suffix>MANAGEMENT</suffix></expansion> <expansion num='1' id='177531' tf='1958' df='566'><suffix>PRODUCTS</suffix></expansion> <expansion num='2' id='177469' tf='1328' df='532'><suffix>RESOURCES</suffix></expansion> <expansion num='3' id='177773' tf='943' df='177'><suffix>GENETIC</suffix></expansion> <expansion num='4' id='177335' tf='736' df='258'><prefix>SUSTAINABLE</prefix></expansion> </expansionList><documentList length='1843' start='0' end='10'><document num='0' hash='HASH011fb8a7d8bf781ab3cbb087' freq='363'><title>FO-edu List of Countries 0</title></document><document num='1' hash='HASH27ae41229eb0636849a5be' freq='344' ><title>FO-edu List of Countries 1</title></document><document num='2' hash='HASH0187ef85c9dbf5bf132ea1d1' freq='263'><title>FO-edu List of Countries 2</title></document><document num='3' hash='HASH0125ec9ef67960446f471280' freq='238'><title>FO-edu List of Countries 3</title></document><document num='4' hash='HASH67087f7717eb35050ce1ac' freq='213'><title>FO-edu List of Countries 
4</title></document></documentList><thesaurusList><thesaurus num='3' id='36506' tf='0' df='0' type='RT'><phrase>FRANCOPHONE</phrase></thesaurus></thesaurusList></phindData>"; 447 448 Node t = converter_.getDOM(phind_info).getDocumentElement(); 449 app_data.appendChild(doc_.importNode(t, true)); 450 451 res.appendChild(data); 452 453 return res; 454 */
Note: See TracChangeset for help on using the changeset viewer.