source: trunk/gsdl3/src/java/org/greenstone/gsdl3/service/PhindPhraseBrowse.java@ 13270

Last change on this file since 13270 was 13270, checked in by shaoqun, 17 years ago

replace Category class which is deprecated with Logger class

  • Property svn:keywords set to Author Date Id Revision
File size: 15.2 KB
Line 
1/*
2 * PhindPhraseBrowse.java
3 * Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19package org.greenstone.gsdl3.service;
20
21import org.greenstone.gsdl3.util.*;
22
23import org.greenstone.mgpp.*;
24import org.w3c.dom.Document;
25import org.w3c.dom.Node;
26import org.w3c.dom.Element;
27import org.w3c.dom.Text;
28
29import java.util.Vector;
30import java.util.HashMap;
31import java.io.File;
32
33import org.apache.log4j.*;
34
35/**
36 * PhindServices - the phind phrase browsing service
37 *
38 * @author <a href="mailto:[email protected]">Katherine Don</a>
39 * @version $Revision: 13270 $
40 */
41public class PhindPhraseBrowse
42 extends ServiceRack {
43
44 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.service.PhindPhraseBrowse.class.getName());
45
46 // the services on offer
47 private static final String PHIND_SERVICE = "PhindApplet";
48
49 private MGPPWrapper mgpp_src=null;
50 private String basepath = null;
51
52 private Element applet_description = null;
53
54 public PhindPhraseBrowse() {
55 this.mgpp_src = new MGPPWrapper();
56 // set up the default params
57 this.mgpp_src.setQueryLevel("Document");
58 this.mgpp_src.setReturnLevel("Document");
59 this.mgpp_src.setMaxDocs(5);
60 this.mgpp_src.setStem(false);
61 this.mgpp_src.setCase(true);
62 }
63
64 public void cleanUp() {
65 super.cleanUp();
66 this.mgpp_src.unloadIndexData();
67 }
68
69 /** configure the service module
70 *
71 * @param info a DOM Element containing any config info for the service
72 * @return true if configured
73 */
74 public boolean configure(Element info, Element extra_info) {
75
76 if (!super.configure(info, extra_info)){
77 return false;
78 }
79
80 logger.info("configuring PhindPhraseBrowse");
81
82 // set up short_service_info_ - for now just has name and type
83 Element e = this.doc.createElement(GSXML.SERVICE_ELEM);
84 e.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_APPLET);
85 e.setAttribute(GSXML.NAME_ATT, PHIND_SERVICE);
86 this.short_service_info.appendChild(e);
87
88 // set up the static applet description
89
90 applet_description = this.doc.createElement(GSXML.SERVICE_ELEM);
91 applet_description.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_APPLET);
92 applet_description.setAttribute(GSXML.NAME_ATT, PHIND_SERVICE);
93
94 // add in the applet info for the phind applet
95 // need to make this dynamic - library names etc
96 // change the applet params - have a single param with the library name
97 // this is left blank at this end, and must be filled in by applet action - if the library name is not needed, this param is left out
98 // phindcgi param now is not complete - library must be prepended to it.
99 String app_info = "<"+GSXML.APPLET_ELEM+" CODEBASE='applet' CODE='org.greenstone.applet.phind.Phind.class' ARCHIVE='phind.jar, xercesImpl.jar, xml-apis.jar' WIDTH='500' HEIGHT='400'><PARAM NAME='library' VALUE=''/> <PARAM NAME='phindcgi' VALUE='?";
100 app_info += GSParams.ACTION +"=a&amp;"+GSParams.REQUEST_TYPE +"=r&amp;"+GSParams.SERVICE+"="+PHIND_SERVICE+"&amp;"+GSParams.OUTPUT+"=xml&amp;"+GSParams.RESPONSE_ONLY+"=1'/>";
101 app_info +="<PARAM NAME='collection' VALUE='";
102 app_info += this.cluster_name;
103 app_info += "'/> <PARAM NAME='classifier' VALUE='1'/> <PARAM NAME='orientation' VALUE='vertical'/> <PARAM NAME='depth' VALUE='2'/> <PARAM NAME='resultorder' VALUE='L,l,E,e,D,d'/> <PARAM NAME='backdrop' VALUE='interfaces/default/images/phindbg1.jpg'/><PARAM NAME='fontsize' VALUE='10'/> <PARAM NAME='blocksize' VALUE='10'/>The Phind java applet.</"+GSXML.APPLET_ELEM+">";
104
105 Document dom = this.converter.getDOM(app_info);
106 if (dom==null) {
107 logger.error("Couldn't parse applet info");
108 return false;
109 }
110 Element app_elem = dom.getDocumentElement();
111 applet_description.appendChild(this.doc.importNode(app_elem, true));
112
113 return true;
114 }
115
116 protected Element getServiceDescription(String service, String lang, String subset) {
117 if (!service.equals(PHIND_SERVICE)) {
118 return null;
119 }
120 Element describe = (Element) applet_description.cloneNode(true);
121 describe.appendChild(GSXML.createDisplayTextElement(this.doc, GSXML.DISPLAY_TEXT_NAME, getTextString(PHIND_SERVICE+".name", lang)));
122 describe.appendChild(GSXML.createDisplayTextElement(this.doc, GSXML.DISPLAY_TEXT_DESCRIPTION, getTextString(PHIND_SERVICE+".description", lang)));
123 return describe;
124 }
125
126 protected Element processPhindApplet(Element request) {
127
128 Element param_elem = (Element)GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
129 HashMap params = GSXML.extractParams(param_elem, false);
130
131 long first_e = Long.parseLong((String)params.get("pfe"));
132 long last_e = Long.parseLong((String)params.get("ple"));
133 long first_l = Long.parseLong((String)params.get("pfl"));
134 long last_l = Long.parseLong((String)params.get("pll"));
135 long first_d = Long.parseLong((String)params.get("pfd"));
136 long last_d = Long.parseLong((String)params.get("pld"));
137
138 long phrase;
139 String phrase_str = (String)params.get("ppnum");
140 if (phrase_str == null || phrase_str.equals("")) {
141 phrase=0;
142 } else {
143 phrase = Long.parseLong(phrase_str);
144 }
145 String word = (String)params.get("pptext");
146 String phind_index = (String)params.get("pc");
147 // the location of the mgpp database files
148 this.basepath = GSFile.phindBaseDir(this.site_home, this.cluster_name, phind_index);
149
150 // the result element
151 Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
152 result.setAttribute(GSXML.FROM_ATT, PHIND_SERVICE);
153 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
154
155 // applet result info must be in appletInfo element
156 Element applet_data = this.doc.createElement(GSXML.APPLET_DATA_ELEM);
157 result.appendChild(applet_data);
158 Element phind_data = this.doc.createElement("phindData");
159 applet_data.appendChild(phind_data);
160
161
162 // if we dont know the phrase number, look it up
163 if (phrase == 0) {
164 if (word==null || word.equals("")) {
165 Element error = phindError("no word or phrase");
166 phind_data.appendChild(error);
167 return result;
168 }
169 phrase = findPhraseNumberFromWord( word);
170 }
171 if (phrase==0) {
172 // the word is not in the collection
173 // return a phind error string
174 Element error = phindError("the term "+word+" is not in the collection");
175 phind_data.appendChild(error);
176 return result;
177 }
178
179 // get the phrase data into the phind_data node
180 getPhraseData(phind_data, phrase, first_l, last_l,
181 first_e, last_e, first_d, last_d);
182 return result;
183
184
185 }// processPhindApplet
186
187 protected long findPhraseNumberFromWord(String word) {
188
189 // set the mgpp index data - we are looking up pword
190 this.mgpp_src.loadIndexData(this.basepath+File.separatorChar+"pword");
191
192 this.mgpp_src.runQuery(word);
193
194 MGPPQueryResult res = this.mgpp_src.getQueryResult();
195 Vector docs = res.getDocs();
196 if (docs.size()==0) {
197 // phrase not found
198 return 0;
199 }
200 MGPPDocInfo doc = (MGPPDocInfo)docs.firstElement();
201 return doc.num_;
202 }
203
204 protected boolean getPhraseData(Element phind_data,
205 long phrase, long first_l, long last_l,
206 long first_e, long last_e, long first_d,
207 long last_d) {
208
209 String record = this.mgpp_src.getDocument(this.basepath+File.separatorChar+"pdata", "Document",
210 phrase);
211 if (record.equals("")) {
212 Element error = phindError("somethings gone wrong - we haven't got a record for phrase number "+phrase);
213 phind_data.appendChild(error);
214 return false;
215 }
216
217 // parse the record - its in gordons cryptic form
218 // ":word:tf:ef:df:el:dl:lf:ll"
219 // el: e,e,e
220 // dl: d;f,d;f,
221 // lf and ll may be null
222 // l: type,dest, dest; type,dest,dest
223
224 // ignore everything up to and including first colon (has
225 // <Document>3505: at the start)
226 record = record.substring(record.indexOf(':')+1);
227
228 // split on ':'
229 String [] fields = record.split(":");
230 String word = fields[0];
231 String tf = fields[1];
232 String ef = fields[2];
233 String df = fields[3];
234
235
236 String expansions = fields[4];
237 String documents = fields[5];
238 String lf = "0";
239 String linklist = "";
240 if (fields.length > 7) {// have thesaurus stuff
241 lf =fields[6];
242 linklist = fields[7];
243 }
244
245 // the phindData attributes and phrase
246 phind_data.setAttribute("id", Long.toString(phrase));
247 phind_data.setAttribute("df", df);
248 phind_data.setAttribute("ef", ef);
249 phind_data.setAttribute("lf", lf);
250 phind_data.setAttribute("tf", tf);
251 GSXML.createTextElement(this.doc, "phrase", word);
252
253 addExpansionList(phind_data, expansions, word, ef, first_e, last_e);
254 addDocumentList(phind_data, documents, word, df, first_d, last_d);
255 if (!lf.equals("0")) {
256 addThesaurusList(phind_data, linklist, word, lf, first_l, last_l);
257 }
258 return true;
259 }
260
261 protected boolean addExpansionList( Element phind_data, String record,
262 String word,
263 String freq,
264 long first, long last) {
265
266 Element expansion_list = this.doc.createElement("expansionList");
267 phind_data.appendChild(expansion_list);
268 expansion_list.setAttribute("length", freq);
269 expansion_list.setAttribute("start", Long.toString(first));
270 expansion_list.setAttribute("end", Long.toString(last));
271
272 // get the list of strings
273 String [] expansions = record.split(",");
274 int length = expansions.length;
275 if (length < last) last = length;
276 for (long i = first; i < last; i++) {
277 long num = Long.parseLong(expansions[(int)i]);
278 Element expansion = getExpansion( num, word);
279 expansion.setAttribute("num", Long.toString(i));
280 expansion_list.appendChild(expansion);
281 }
282 return true;
283 }
284
285 protected Element getExpansion(long phrase_num,
286 String orig_phrase) {
287
288 // look up the phrase in the pdata thingy
289 String record = this.mgpp_src.getDocument(this.basepath+File.separatorChar+"pdata", "Document",
290 phrase_num);
291
292 if (record ==null || record.equals("")) return null;
293
294 // ignore everything up to and including first colon
295 record = record.substring(record.indexOf(':')+1);
296
297 String [] fields = record.split(":");
298 String phrase = fields[0];
299 String tf = fields[1];
300 //String ef = fields[2]; dont use this
301 String df = fields[3];
302
303 Element expansion = this.doc.createElement("expansion");
304 expansion.setAttribute("tf", tf);
305 expansion.setAttribute("df", df);
306 expansion.setAttribute("id", Long.toString(phrase_num));
307
308 // get teh suffix and prefix
309 String [] ends = splitPhraseOnWord(phrase, orig_phrase);
310 if (!ends[0].equals("")) {
311 expansion.appendChild(GSXML.createTextElement(this.doc, "prefix", ends[0]));
312 }
313 if (!ends[1].equals("")) {
314 expansion.appendChild(GSXML.createTextElement(this.doc, "suffix", ends[1]));
315 }
316
317 return expansion;
318
319 }
320
321 protected boolean addDocumentList(Element phind_data, String record,
322 String word,
323 String freq,
324 long first, long last) {
325
326 Element document_list = this.doc.createElement("documentList");
327 phind_data.appendChild(document_list);
328 document_list.setAttribute("length", freq);
329 document_list.setAttribute("start", Long.toString(first));
330 document_list.setAttribute("end", Long.toString(last));
331
332 // get the list of doc,freq
333 String [] doc_freqs = record.split(";");
334 int length = doc_freqs.length;
335 if (length<last) last=length;
336
337 for (long i = first; i < last; i++) {
338 String doc_elem = doc_freqs[(int)i];
339 int p = doc_elem.indexOf(',');
340 long doc_num;
341 String doc_freq;
342 if (p == -1) { // there is no freq in the record
343 doc_num =Long.parseLong(doc_elem);
344 doc_freq = "1";
345 } else {
346 doc_num = Long.parseLong(doc_elem.substring(0,p));
347 doc_freq = doc_elem.substring(p+1);
348 }
349 Element document = getDocument( doc_num);
350 document.setAttribute("freq", doc_freq);
351 document.setAttribute("num", Long.toString(i));
352 document_list.appendChild(document);
353 }
354
355
356 return true;
357 }
358
359
360 protected Element getDocument(long doc_num) {
361
362 // look up the phrase in the docs thingy
363 String record = this.mgpp_src.getDocument(this.basepath+File.separatorChar+"docs", "Document",
364 doc_num);
365
366 if (record ==null || record.equals("")) return null;
367
368 // ignore everything up to and including first \t
369 record = record.substring(record.indexOf('\t')+1);
370
371 String [] fields = record.split("\t");
372 String hash = fields[0];
373 String title = fields[1];
374
375 Element d = this.doc.createElement("document");
376 d.setAttribute("hash", hash);
377 d.appendChild(GSXML.createTextElement(this.doc, "title", title));
378
379 return d;
380
381 }
382 protected boolean addThesaurusList(Element phind_data, String record,
383 String word,
384 String freq,
385 long first, long last) {
386
387
388 Element thesaurus_list = this.doc.createElement("thesaurusList");
389 phind_data.appendChild(thesaurus_list);
390 thesaurus_list.setAttribute("length", freq);
391 thesaurus_list.setAttribute("start", Long.toString(first));
392 thesaurus_list.setAttribute("end", Long.toString(last));
393
394 // get the list of type,dest,dest
395 String [] links = record.split(";");
396 int length = links.length;
397 long index = 0;
398 for (int i = 0; i < length; i++) { // go through the entries
399 String link_info = links[(int)i];
400 String [] items = link_info.split(",");
401 // the first entry is teh type
402 String type = items[0];
403 for (int j = 1; j<items.length; j++, index++) {
404 if (index >= first && index < last) { // only output the ones we want
405 long phrase = Long.parseLong(items[j]);
406 Element t = getThesaurus(phrase);
407 t.setAttribute("type", type);
408 thesaurus_list.appendChild(t);
409 }
410 }
411 }
412
413 return true;
414 }
415
416 protected Element getThesaurus(long phrase_num) {
417
418 // look up the phrase in the pdata thingy
419 String record = this.mgpp_src.getDocument(this.basepath+File.separatorChar+"pdata", "Document",
420 phrase_num);
421
422 if (record ==null || record.equals("")) return null;
423
424 // ignore everything up to and including first colon
425 record = record.substring(record.indexOf(':')+1);
426
427 String [] fields = record.split(":");
428 String phrase = fields[0];
429 String tf = fields[1];
430 //String ef = fields[2]; dont use this
431 String df = fields[3];
432
433 Element thesaurus = this.doc.createElement("thesaurus");
434 thesaurus.setAttribute("tf", tf);
435 thesaurus.setAttribute("df", df);
436 thesaurus.setAttribute("id", Long.toString(phrase_num));
437 thesaurus.appendChild(GSXML.createTextElement(this.doc, "phrase", phrase));
438 return thesaurus;
439
440 }
441
442 /** returns an array of two elements - the prefix and the suffix*/
443 protected String [] splitPhraseOnWord(String phrase, String word) {
444
445 if (word.equals("")) {
446
447 String [] res = {phrase, ""};
448 return res;
449 }
450 // use 2 so that we only split on the first occurrance. trailing empty strings should be included
451 String [] result = phrase.split(word, 2);
452 return result;
453
454 }
455
456 protected Element phindError(String message) {
457 Element e = this.doc.createElement("phindError");
458 Text t = this.doc.createTextNode(message);
459 e.appendChild(t);
460 return e;
461 }
462
463}
464
Note: See TracBrowser for help on using the repository browser.