source: greenstone3/trunk/src/java/org/greenstone/gsdl3/service/PhindPhraseBrowse.java@ 16355

Last change on this file since 16355 was 16355, checked in by kjdon, 16 years ago

MGPPRetrieveWrapper no longer has an UnloadIndexData method

  • Property svn:keywords set to Author Date Id Revision
File size: 16.4 KB
Line 
1/*
2 * PhindServices.java
3 * Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19package org.greenstone.gsdl3.service;
20
21import org.greenstone.gsdl3.util.*;
22
23import org.greenstone.mgpp.*;
24import org.w3c.dom.Document;
25import org.w3c.dom.Node;
26import org.w3c.dom.Element;
27import org.w3c.dom.Text;
28
29import java.util.Vector;
30import java.util.HashMap;
31import java.io.File;
32
33import org.apache.log4j.*;
34
/**
 * PhindPhraseBrowse - the phind phrase browsing service
 *
 * @author <a href="mailto:[email protected]">Katherine Don</a>
 * @version $Revision: 16355 $
 */
41public class PhindPhraseBrowse
42 extends ServiceRack {
43
44 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.service.PhindPhraseBrowse.class.getName());
45
46 // the services on offer
47 private static final String PHIND_SERVICE = "PhindApplet";
48
49 private static MGPPRetrieveWrapper mgpp_retrieve_src=null;
50 private static MGPPSearchWrapper mgpp_search_src=null;
51 private String basepath = null;
52
53 private Element applet_description = null;
54
55 public PhindPhraseBrowse() {
56 if(this.mgpp_retrieve_src == null) {
57 this.mgpp_retrieve_src = new MGPPRetrieveWrapper();
58 }
59 if(this.mgpp_search_src == null) {
60 this.mgpp_search_src = new MGPPSearchWrapper();
61 }
62 // set up the default params
63 this.mgpp_search_src.setQueryLevel("Document");
64 this.mgpp_search_src.setReturnLevel("Document");
65 this.mgpp_search_src.setMaxDocs(5);
66 this.mgpp_search_src.setStem(false);
67 this.mgpp_search_src.setCase(true);
68 }
69
70 public void cleanUp() {
71 super.cleanUp();
72 this.mgpp_search_src.unloadIndexData();
73 }
74
75 /** configure the service module
76 *
77 * @param info a DOM Element containing any config info for the service
78 * @return true if configured
79 */
80 public boolean configure(Element info, Element extra_info) {
81
82 if (!super.configure(info, extra_info)){
83 return false;
84 }
85
86 logger.info("configuring PhindPhraseBrowse");
87
88 // set up short_service_info_ - for now just has name and type
89 Element e = this.doc.createElement(GSXML.SERVICE_ELEM);
90 e.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_APPLET);
91 e.setAttribute(GSXML.NAME_ATT, PHIND_SERVICE);
92 this.short_service_info.appendChild(e);
93
94 // set up the static applet description
95
96 applet_description = this.doc.createElement(GSXML.SERVICE_ELEM);
97 applet_description.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_APPLET);
98 applet_description.setAttribute(GSXML.NAME_ATT, PHIND_SERVICE);
99
100 // add in the applet info for the phind applet
101 // need to make this dynamic - library names etc
102 // change the applet params - have a single param with the library name
103 // this is left blank at this end, and must be filled in by applet action - if the library name is not needed, this param is left out
104 // phindcgi param now is not complete - library must be prepended to it.
105 String app_info = "<"+GSXML.APPLET_ELEM+" CODEBASE='applet' CODE='org.greenstone.applet.phind.Phind.class' ARCHIVE='phind.jar, xercesImpl.jar, xml-apis.jar' WIDTH='500' HEIGHT='400'><PARAM NAME='library' VALUE=''/> <PARAM NAME='phindcgi' VALUE='?";
106 app_info += GSParams.ACTION +"=a&amp;"+GSParams.REQUEST_TYPE +"=r&amp;"+GSParams.SERVICE+"="+PHIND_SERVICE+"&amp;"+GSParams.OUTPUT+"=xml&amp;"+GSParams.RESPONSE_ONLY+"=1'/>";
107 app_info +="<PARAM NAME='collection' VALUE='";
108 app_info += this.cluster_name;
109 app_info += "'/> <PARAM NAME='classifier' VALUE='1'/> <PARAM NAME='orientation' VALUE='vertical'/> <PARAM NAME='depth' VALUE='2'/> <PARAM NAME='resultorder' VALUE='L,l,E,e,D,d'/> <PARAM NAME='backdrop' VALUE='interfaces/default/images/phindbg1.jpg'/><PARAM NAME='fontsize' VALUE='10'/> <PARAM NAME='blocksize' VALUE='10'/>The Phind java applet.</"+GSXML.APPLET_ELEM+">";
110
111 Document dom = this.converter.getDOM(app_info);
112 if (dom==null) {
113 logger.error("Couldn't parse applet info");
114 return false;
115 }
116 Element app_elem = dom.getDocumentElement();
117 applet_description.appendChild(this.doc.importNode(app_elem, true));
118
119 return true;
120 }
121
122 protected Element getServiceDescription(String service, String lang, String subset) {
123 if (!service.equals(PHIND_SERVICE)) {
124 return null;
125 }
126 Element describe = (Element) applet_description.cloneNode(true);
127 describe.appendChild(GSXML.createDisplayTextElement(this.doc, GSXML.DISPLAY_TEXT_NAME, getTextString(PHIND_SERVICE+".name", lang)));
128 describe.appendChild(GSXML.createDisplayTextElement(this.doc, GSXML.DISPLAY_TEXT_DESCRIPTION, getTextString(PHIND_SERVICE+".description", lang)));
129 return describe;
130 }
131
132 protected Element processPhindApplet(Element request) {
133
134 Element param_elem = (Element)GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
135 HashMap params = GSXML.extractParams(param_elem, false);
136
137 long first_e = Long.parseLong((String)params.get("pfe"));
138 long last_e = Long.parseLong((String)params.get("ple"));
139 long first_l = Long.parseLong((String)params.get("pfl"));
140 long last_l = Long.parseLong((String)params.get("pll"));
141 long first_d = Long.parseLong((String)params.get("pfd"));
142 long last_d = Long.parseLong((String)params.get("pld"));
143
144 long phrase;
145 String phrase_str = (String)params.get("ppnum");
146 if (phrase_str == null || phrase_str.equals("")) {
147 phrase=0;
148 } else {
149 phrase = Long.parseLong(phrase_str);
150 }
151 String word = (String)params.get("pptext");
152 String phind_index = (String)params.get("pc");
153 // the location of the mgpp database files
154 this.basepath = GSFile.phindBaseDir(this.site_home, this.cluster_name, phind_index);
155
156 // the result element
157 Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
158 result.setAttribute(GSXML.FROM_ATT, PHIND_SERVICE);
159 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
160
161 // applet result info must be in appletInfo element
162 Element applet_data = this.doc.createElement(GSXML.APPLET_DATA_ELEM);
163 result.appendChild(applet_data);
164 Element phind_data = this.doc.createElement("phindData");
165 applet_data.appendChild(phind_data);
166
167
168 // if we dont know the phrase number, look it up
169 if (phrase == 0) {
170 if (word==null || word.equals("")) {
171 Element error = phindError("no word or phrase");
172 phind_data.appendChild(error);
173 return result;
174 }
175 phrase = findPhraseNumberFromWord( word);
176 }
177 if (phrase==0) {
178 // the word is not in the collection
179 // return a phind error string
180 Element error = phindError("the term "+word+" is not in the collection");
181 phind_data.appendChild(error);
182 return result;
183 }
184
185 // get the phrase data into the phind_data node
186 getPhraseData(phind_data, phrase, first_l, last_l,
187 first_e, last_e, first_d, last_d);
188 return result;
189
190
191 }// processPhindApplet
192
193 protected long findPhraseNumberFromWord(String word) {
194 synchronized (mgpp_search_src) {
195 // set the mgpp index data - we are looking up pword
196 mgpp_search_src.loadIndexData(this.basepath+File.separatorChar+"pword");
197
198 mgpp_search_src.runQuery(word);
199
200 MGPPQueryResult res = mgpp_search_src.getQueryResult();
201 Vector docs = res.getDocs();
202 if (docs.size()==0) {
203 // phrase not found
204 return 0;
205 }
206 MGPPDocInfo doc = (MGPPDocInfo)docs.firstElement();
207 return doc.num_;
208 }
209 }
210
211 protected boolean getPhraseData(Element phind_data,
212 long phrase, long first_l, long last_l,
213 long first_e, long last_e, long first_d,
214 long last_d) {
215
216 synchronized (mgpp_retrieve_src) {
217 String record = this.mgpp_retrieve_src.getDocument(this.basepath+File.separatorChar+"pdata", "Document",
218 phrase);
219 if (record.equals("")) {
220 Element error = phindError("somethings gone wrong - we haven't got a record for phrase number "+phrase);
221 phind_data.appendChild(error);
222 return false;
223 }
224
225 // parse the record - its in gordons cryptic form
226 // ":word:tf:ef:df:el:dl:lf:ll"
227 // el: e,e,e
228 // dl: d;f,d;f,
229 // lf and ll may be null
230 // l: type,dest, dest; type,dest,dest
231
232 // ignore everything up to and including first colon (has
233 // <Document>3505: at the start)
234 record = record.substring(record.indexOf(':')+1);
235
236 // split on ':'
237 String [] fields = record.split(":");
238 String word = fields[0];
239 String tf = fields[1];
240 String ef = fields[2];
241 String df = fields[3];
242
243
244 String expansions = fields[4];
245 String documents = fields[5];
246 String lf = "0";
247 String linklist = "";
248 if (fields.length > 7) {// have thesaurus stuff
249 lf =fields[6];
250 linklist = fields[7];
251 }
252
253 // the phindData attributes and phrase
254 phind_data.setAttribute("id", Long.toString(phrase));
255 phind_data.setAttribute("df", df);
256 phind_data.setAttribute("ef", ef);
257 phind_data.setAttribute("lf", lf);
258 phind_data.setAttribute("tf", tf);
259 GSXML.createTextElement(this.doc, "phrase", word);
260
261 addExpansionList(phind_data, expansions, word, ef, first_e, last_e);
262 addDocumentList(phind_data, documents, word, df, first_d, last_d);
263 if (!lf.equals("0")) {
264 addThesaurusList(phind_data, linklist, word, lf, first_l, last_l);
265 }
266 return true;
267 }
268 }
269
270 protected boolean addExpansionList( Element phind_data, String record,
271 String word,
272 String freq,
273 long first, long last) {
274
275 Element expansion_list = this.doc.createElement("expansionList");
276 phind_data.appendChild(expansion_list);
277 expansion_list.setAttribute("length", freq);
278 expansion_list.setAttribute("start", Long.toString(first));
279 expansion_list.setAttribute("end", Long.toString(last));
280
281 // get the list of strings
282 String [] expansions = record.split(",");
283 int length = expansions.length;
284 if (length < last) last = length;
285 for (long i = first; i < last; i++) {
286 long num = Long.parseLong(expansions[(int)i]);
287 Element expansion = getExpansion( num, word);
288 expansion.setAttribute("num", Long.toString(i));
289 expansion_list.appendChild(expansion);
290 }
291 return true;
292 }
293
294 protected Element getExpansion(long phrase_num,
295 String orig_phrase) {
296
297 // look up the phrase in the pdata thingy
298 String record = this.mgpp_retrieve_src.getDocument(this.basepath+File.separatorChar+"pdata", "Document",
299 phrase_num);
300
301 if (record ==null || record.equals("")) return null;
302
303 // ignore everything up to and including first colon
304 record = record.substring(record.indexOf(':')+1);
305
306 String [] fields = record.split(":");
307 String phrase = fields[0];
308 String tf = fields[1];
309 //String ef = fields[2]; dont use this
310 String df = fields[3];
311
312 Element expansion = this.doc.createElement("expansion");
313 expansion.setAttribute("tf", tf);
314 expansion.setAttribute("df", df);
315 expansion.setAttribute("id", Long.toString(phrase_num));
316
317 // get teh suffix and prefix
318 String [] ends = splitPhraseOnWord(phrase, orig_phrase);
319 if (!ends[0].equals("")) {
320 expansion.appendChild(GSXML.createTextElement(this.doc, "prefix", ends[0]));
321 }
322 if (!ends[1].equals("")) {
323 expansion.appendChild(GSXML.createTextElement(this.doc, "suffix", ends[1]));
324 }
325
326 return expansion;
327
328 }
329
330 protected boolean addDocumentList(Element phind_data, String record,
331 String word,
332 String freq,
333 long first, long last) {
334
335 Element document_list = this.doc.createElement("documentList");
336 phind_data.appendChild(document_list);
337 document_list.setAttribute("length", freq);
338 document_list.setAttribute("start", Long.toString(first));
339 document_list.setAttribute("end", Long.toString(last));
340
341 // get the list of doc,freq
342 String [] doc_freqs = record.split(";");
343 int length = doc_freqs.length;
344 if (length<last) last=length;
345
346 for (long i = first; i < last; i++) {
347 String doc_elem = doc_freqs[(int)i];
348 int p = doc_elem.indexOf(',');
349 long doc_num;
350 String doc_freq;
351 if (p == -1) { // there is no freq in the record
352 doc_num =Long.parseLong(doc_elem);
353 doc_freq = "1";
354 } else {
355 doc_num = Long.parseLong(doc_elem.substring(0,p));
356 doc_freq = doc_elem.substring(p+1);
357 }
358 Element document = getDocument( doc_num);
359 document.setAttribute("freq", doc_freq);
360 document.setAttribute("num", Long.toString(i));
361 document_list.appendChild(document);
362 }
363
364
365 return true;
366 }
367
368
369 protected Element getDocument(long doc_num) {
370
371 // look up the phrase in the docs thingy
372 String record = this.mgpp_retrieve_src.getDocument(this.basepath+File.separatorChar+"docs", "Document",
373 doc_num);
374
375 if (record ==null || record.equals("")) return null;
376
377 // ignore everything up to and including first \t
378 record = record.substring(record.indexOf('\t')+1);
379
380 String [] fields = record.split("\t");
381 String hash = fields[0];
382 String title = fields[1];
383
384 Element d = this.doc.createElement("document");
385 d.setAttribute("hash", hash);
386 d.appendChild(GSXML.createTextElement(this.doc, "title", title));
387
388 return d;
389
390 }
391 protected boolean addThesaurusList(Element phind_data, String record,
392 String word,
393 String freq,
394 long first, long last) {
395
396
397 Element thesaurus_list = this.doc.createElement("thesaurusList");
398 phind_data.appendChild(thesaurus_list);
399 thesaurus_list.setAttribute("length", freq);
400 thesaurus_list.setAttribute("start", Long.toString(first));
401 thesaurus_list.setAttribute("end", Long.toString(last));
402
403 // get the list of type,dest,dest
404 String [] links = record.split(";");
405 int length = links.length;
406 long index = 0;
407 for (int i = 0; i < length; i++) { // go through the entries
408 String link_info = links[(int)i];
409 String [] items = link_info.split(",");
410 // the first entry is teh type
411 String type = items[0];
412 for (int j = 1; j<items.length; j++, index++) {
413 if (index >= first && index < last) { // only output the ones we want
414 long phrase = Long.parseLong(items[j]);
415 Element t = getThesaurus(phrase);
416 t.setAttribute("type", type);
417 thesaurus_list.appendChild(t);
418 }
419 }
420 }
421
422 return true;
423 }
424
425 protected Element getThesaurus(long phrase_num) {
426
427 // look up the phrase in the pdata thingy
428 String record = this.mgpp_retrieve_src.getDocument(this.basepath+File.separatorChar+"pdata", "Document",
429 phrase_num);
430
431 if (record ==null || record.equals("")) return null;
432
433 // ignore everything up to and including first colon
434 record = record.substring(record.indexOf(':')+1);
435
436 String [] fields = record.split(":");
437 String phrase = fields[0];
438 String tf = fields[1];
439 //String ef = fields[2]; dont use this
440 String df = fields[3];
441
442 Element thesaurus = this.doc.createElement("thesaurus");
443 thesaurus.setAttribute("tf", tf);
444 thesaurus.setAttribute("df", df);
445 thesaurus.setAttribute("id", Long.toString(phrase_num));
446 thesaurus.appendChild(GSXML.createTextElement(this.doc, "phrase", phrase));
447 return thesaurus;
448
449 }
450
451 /** returns an array of two elements - the prefix and the suffix*/
452 protected String [] splitPhraseOnWord(String phrase, String word) {
453
454 if (word.equals("")) {
455
456 String [] res = {phrase, ""};
457 return res;
458 }
459 // use 2 so that we only split on the first occurrance. trailing empty strings should be included
460 String [] result = phrase.split(word, 2);
461 return result;
462
463 }
464
465 protected Element phindError(String message) {
466 Element e = this.doc.createElement("phindError");
467 Text t = this.doc.createTextNode(message);
468 e.appendChild(t);
469 return e;
470 }
471
472}
473
Note: See TracBrowser for help on using the repository browser.