source: branches/ant-install-branch/gsdl3/src/java/org/greenstone/gsdl3/service/PhindPhraseBrowse.java@ 9824

Last change on this file since 9824 was 9824, checked in by kjdon, 19 years ago

When a collection (using GDBM) is opened by Tomcat, Windows holds a lock on the GDBM file, so you can't rebuild it. Modified ModuleInterface to have a cleanUp method, so all modules need to implement this. The mg/mgpp and gdbm modules now unload the index data or close the connection to the database, so cleanUp should be called whenever you deactivate a module.

  • Property svn:keywords set to Author Date Id Revision
File size: 15.0 KB
Line 
1/*
2 * PhindPhraseBrowse.java
3 * Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19package org.greenstone.gsdl3.service;
20
21import org.greenstone.gsdl3.util.*;
22
23import org.greenstone.mgpp.*;
24import org.w3c.dom.Document;
25import org.w3c.dom.Node;
26import org.w3c.dom.Element;
27import org.w3c.dom.Text;
28
29import java.util.Vector;
30import java.util.HashMap;
31import java.io.File;
32/**
33 * PhindPhraseBrowse - the phind phrase browsing service
34 *
35 * @author <a href="mailto:[email protected]">Katherine Don</a>
36 * @version $Revision: 9824 $
37 */
38public class PhindPhraseBrowse
39 extends ServiceRack {
40
41 // the services on offer
42 private static final String PHIND_SERVICE = "PhindApplet";
43
44 private MGPPWrapper mgpp_src=null;
45 private String basepath = null;
46
47 private Element applet_description = null;
48
49 public PhindPhraseBrowse() {
50 this.mgpp_src = new MGPPWrapper();
51 // set up the default params
52 this.mgpp_src.setQueryLevel("Document");
53 this.mgpp_src.setReturnLevel("Document");
54 this.mgpp_src.setMaxDocs(5);
55 this.mgpp_src.setStem(false);
56 this.mgpp_src.setCase(true);
57 }
58
59 public void cleanUp() {
60 super.cleanUp();
61 this.mgpp_src.unloadIndexData();
62 }
63
64 /** configure the service module
65 *
66 * @param info a DOM Element containing any config info for the service
67 * @return true if configured
68 */
69 public boolean configure(Element info, Element extra_info) {
70
71 System.out.println("configuring PhindPhraseBrowse");
72
73 // set up short_service_info_ - for now just has name and type
74 Element e = this.doc.createElement(GSXML.SERVICE_ELEM);
75 e.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_APPLET);
76 e.setAttribute(GSXML.NAME_ATT, PHIND_SERVICE);
77 this.short_service_info.appendChild(e);
78
79 // set up the static applet description
80
81 applet_description = this.doc.createElement(GSXML.SERVICE_ELEM);
82 applet_description.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_APPLET);
83 applet_description.setAttribute(GSXML.NAME_ATT, PHIND_SERVICE);
84
85 // add in the applet info for the phind applet
86 // need to make this dynamic - library names etc
87 // change the applet params - have a single param with the library name
88 // this is left blank at this end, and must be filled in by applet action - if the library name is not needed, this param is left out
89 // phindcgi param now is not complete - library must be prepended to it.
90 String app_info = "<"+GSXML.APPLET_ELEM+" CODEBASE='lib' CODE='org.greenstone.applet.phind.Phind.class' ARCHIVE='phind.jar, xercesImpl.jar, xml-apis.jar' WIDTH='500' HEIGHT='400'><PARAM NAME='library' VALUE=''/> <PARAM NAME='phindcgi' VALUE='?";
91 app_info += GSParams.ACTION +"=a&amp;"+GSParams.REQUEST_TYPE +"=r&amp;"+GSParams.SERVICE+"="+PHIND_SERVICE+"&amp;"+GSParams.OUTPUT+"=xml&amp;"+GSParams.RESPONSE_ONLY+"=1'/>";
92 app_info +="<PARAM NAME='collection' VALUE='";
93 app_info += this.cluster_name;
94 app_info += "'/> <PARAM NAME='classifier' VALUE='1'/> <PARAM NAME='orientation' VALUE='vertical'/> <PARAM NAME='depth' VALUE='2'/> <PARAM NAME='resultorder' VALUE='L,l,E,e,D,d'/> <PARAM NAME='backdrop' VALUE='interfaces/default/images/phindbg1.jpg'/><PARAM NAME='fontsize' VALUE='10'/> <PARAM NAME='blocksize' VALUE='10'/>The Phind java applet.</"+GSXML.APPLET_ELEM+">";
95
96 Document dom = this.converter.getDOM(app_info);
97 if (dom==null) {
98 System.err.println("PhindPhraseBrowse.configure Error: Couldn't parse applet info");
99 return false;
100 }
101 Element app_elem = dom.getDocumentElement();
102 applet_description.appendChild(this.doc.importNode(app_elem, true));
103
104 return true;
105 }
106
107 protected Element getServiceDescription(String service, String lang, String subset) {
108 if (!service.equals(PHIND_SERVICE)) {
109 return null;
110 }
111 Element describe = (Element) applet_description.cloneNode(true);
112 describe.appendChild(GSXML.createDisplayTextElement(this.doc, GSXML.DISPLAY_TEXT_NAME, getTextString(PHIND_SERVICE+".name", lang)));
113 describe.appendChild(GSXML.createDisplayTextElement(this.doc, GSXML.DISPLAY_TEXT_DESCRIPTION, getTextString(PHIND_SERVICE+".description", lang)));
114 return describe;
115 }
116
117 protected Element processPhindApplet(Element request) {
118
119 Element param_elem = (Element)GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
120 HashMap params = GSXML.extractParams(param_elem, false);
121
122 long first_e = Long.parseLong((String)params.get("pfe"));
123 long last_e = Long.parseLong((String)params.get("ple"));
124 long first_l = Long.parseLong((String)params.get("pfl"));
125 long last_l = Long.parseLong((String)params.get("pll"));
126 long first_d = Long.parseLong((String)params.get("pfd"));
127 long last_d = Long.parseLong((String)params.get("pld"));
128
129 long phrase;
130 String phrase_str = (String)params.get("ppnum");
131 if (phrase_str == null || phrase_str.equals("")) {
132 phrase=0;
133 } else {
134 phrase = Long.parseLong(phrase_str);
135 }
136 String word = (String)params.get("pptext");
137 String phind_index = (String)params.get("pc");
138 // the location of the mgpp database files
139 this.basepath = GSFile.phindBaseDir(this.site_home, this.cluster_name, phind_index);
140
141 // the result element
142 Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
143 result.setAttribute(GSXML.FROM_ATT, PHIND_SERVICE);
144 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
145
146 // applet result info must be in appletInfo element
147 Element applet_data = this.doc.createElement(GSXML.APPLET_DATA_ELEM);
148 result.appendChild(applet_data);
149 Element phind_data = this.doc.createElement("phindData");
150 applet_data.appendChild(phind_data);
151
152
153 // if we dont know the phrase number, look it up
154 if (phrase == 0) {
155 if (word==null || word.equals("")) {
156 Element error = phindError("no word or phrase");
157 phind_data.appendChild(error);
158 return result;
159 }
160 phrase = findPhraseNumberFromWord( word);
161 }
162 if (phrase==0) {
163 // the word is not in the collection
164 // return a phind error string
165 Element error = phindError("the term "+word+" is not in the collection");
166 phind_data.appendChild(error);
167 return result;
168 }
169
170 // get the phrase data into the phind_data node
171 getPhraseData(phind_data, phrase, first_l, last_l,
172 first_e, last_e, first_d, last_d);
173 return result;
174
175
176 }// processPhindApplet
177
178 protected long findPhraseNumberFromWord(String word) {
179
180 // set the mgpp index data - we are looking up pword
181 this.mgpp_src.loadIndexData(this.basepath+File.separatorChar+"pword");
182
183 this.mgpp_src.runQuery(word);
184
185 MGPPQueryResult res = this.mgpp_src.getQueryResult();
186 Vector docs = res.getDocs();
187 if (docs.size()==0) {
188 // phrase not found
189 return 0;
190 }
191 MGPPDocInfo doc = (MGPPDocInfo)docs.firstElement();
192 return doc.num_;
193 }
194
195 protected boolean getPhraseData(Element phind_data,
196 long phrase, long first_l, long last_l,
197 long first_e, long last_e, long first_d,
198 long last_d) {
199
200 String record = this.mgpp_src.getDocument(this.basepath+File.separatorChar+"pdata", "Document",
201 phrase);
202 if (record.equals("")) {
203 Element error = phindError("somethings gone wrong - we haven't got a record for phrase number "+phrase);
204 phind_data.appendChild(error);
205 return false;
206 }
207
208 // parse the record - its in gordons cryptic form
209 // ":word:tf:ef:df:el:dl:lf:ll"
210 // el: e,e,e
211 // dl: d;f,d;f,
212 // lf and ll may be null
213 // l: type,dest, dest; type,dest,dest
214
215 // ignore everything up to and including first colon (has
216 // <Document>3505: at the start)
217 record = record.substring(record.indexOf(':')+1);
218
219 // split on ':'
220 String [] fields = record.split(":");
221 String word = fields[0];
222 String tf = fields[1];
223 String ef = fields[2];
224 String df = fields[3];
225
226
227 String expansions = fields[4];
228 String documents = fields[5];
229 String lf = "0";
230 String linklist = "";
231 if (fields.length > 7) {// have thesaurus stuff
232 lf =fields[6];
233 linklist = fields[7];
234 }
235
236 // the phindData attributes and phrase
237 phind_data.setAttribute("id", Long.toString(phrase));
238 phind_data.setAttribute("df", df);
239 phind_data.setAttribute("ef", ef);
240 phind_data.setAttribute("lf", lf);
241 phind_data.setAttribute("tf", tf);
242 GSXML.createTextElement(this.doc, "phrase", word);
243
244 addExpansionList(phind_data, expansions, word, ef, first_e, last_e);
245 addDocumentList(phind_data, documents, word, df, first_d, last_d);
246 if (!lf.equals("0")) {
247 addThesaurusList(phind_data, linklist, word, lf, first_l, last_l);
248 }
249 return true;
250 }
251
252 protected boolean addExpansionList( Element phind_data, String record,
253 String word,
254 String freq,
255 long first, long last) {
256
257 Element expansion_list = this.doc.createElement("expansionList");
258 phind_data.appendChild(expansion_list);
259 expansion_list.setAttribute("length", freq);
260 expansion_list.setAttribute("start", Long.toString(first));
261 expansion_list.setAttribute("end", Long.toString(last));
262
263 // get the list of strings
264 String [] expansions = record.split(",");
265 int length = expansions.length;
266 if (length < last) last = length;
267 for (long i = first; i < last; i++) {
268 long num = Long.parseLong(expansions[(int)i]);
269 Element expansion = getExpansion( num, word);
270 expansion.setAttribute("num", Long.toString(i));
271 expansion_list.appendChild(expansion);
272 }
273 return true;
274 }
275
276 protected Element getExpansion(long phrase_num,
277 String orig_phrase) {
278
279 // look up the phrase in the pdata thingy
280 String record = this.mgpp_src.getDocument(this.basepath+File.separatorChar+"pdata", "Document",
281 phrase_num);
282
283 if (record ==null || record.equals("")) return null;
284
285 // ignore everything up to and including first colon
286 record = record.substring(record.indexOf(':')+1);
287
288 String [] fields = record.split(":");
289 String phrase = fields[0];
290 String tf = fields[1];
291 //String ef = fields[2]; dont use this
292 String df = fields[3];
293
294 Element expansion = this.doc.createElement("expansion");
295 expansion.setAttribute("tf", tf);
296 expansion.setAttribute("df", df);
297 expansion.setAttribute("id", Long.toString(phrase_num));
298
299 // get teh suffix and prefix
300 String [] ends = splitPhraseOnWord(phrase, orig_phrase);
301 if (!ends[0].equals("")) {
302 expansion.appendChild(GSXML.createTextElement(this.doc, "prefix", ends[0]));
303 }
304 if (!ends[1].equals("")) {
305 expansion.appendChild(GSXML.createTextElement(this.doc, "suffix", ends[1]));
306 }
307
308 return expansion;
309
310 }
311
312 protected boolean addDocumentList(Element phind_data, String record,
313 String word,
314 String freq,
315 long first, long last) {
316
317 Element document_list = this.doc.createElement("documentList");
318 phind_data.appendChild(document_list);
319 document_list.setAttribute("length", freq);
320 document_list.setAttribute("start", Long.toString(first));
321 document_list.setAttribute("end", Long.toString(last));
322
323 // get the list of doc,freq
324 String [] doc_freqs = record.split(";");
325 int length = doc_freqs.length;
326 if (length<last) last=length;
327
328 for (long i = first; i < last; i++) {
329 String doc_elem = doc_freqs[(int)i];
330 int p = doc_elem.indexOf(',');
331 long doc_num;
332 String doc_freq;
333 if (p == -1) { // there is no freq in the record
334 doc_num =Long.parseLong(doc_elem);
335 doc_freq = "1";
336 } else {
337 doc_num = Long.parseLong(doc_elem.substring(0,p));
338 doc_freq = doc_elem.substring(p+1);
339 }
340 Element document = getDocument( doc_num);
341 document.setAttribute("freq", doc_freq);
342 document.setAttribute("num", Long.toString(i));
343 document_list.appendChild(document);
344 }
345
346
347 return true;
348 }
349
350
351 protected Element getDocument(long doc_num) {
352
353 // look up the phrase in the docs thingy
354 String record = this.mgpp_src.getDocument(this.basepath+File.separatorChar+"docs", "Document",
355 doc_num);
356
357 if (record ==null || record.equals("")) return null;
358
359 // ignore everything up to and including first \t
360 record = record.substring(record.indexOf('\t')+1);
361
362 String [] fields = record.split("\t");
363 String hash = fields[0];
364 String title = fields[1];
365
366 Element d = this.doc.createElement("document");
367 d.setAttribute("hash", hash);
368 d.appendChild(GSXML.createTextElement(this.doc, "title", title));
369
370 return d;
371
372 }
373 protected boolean addThesaurusList(Element phind_data, String record,
374 String word,
375 String freq,
376 long first, long last) {
377
378
379 Element thesaurus_list = this.doc.createElement("thesaurusList");
380 phind_data.appendChild(thesaurus_list);
381 thesaurus_list.setAttribute("length", freq);
382 thesaurus_list.setAttribute("start", Long.toString(first));
383 thesaurus_list.setAttribute("end", Long.toString(last));
384
385 // get the list of type,dest,dest
386 String [] links = record.split(";");
387 int length = links.length;
388 long index = 0;
389 for (int i = 0; i < length; i++) { // go through the entries
390 String link_info = links[(int)i];
391 String [] items = link_info.split(",");
392 // the first entry is teh type
393 String type = items[0];
394 for (int j = 1; j<items.length; j++, index++) {
395 if (index >= first && index < last) { // only output the ones we want
396 long phrase = Long.parseLong(items[j]);
397 Element t = getThesaurus(phrase);
398 t.setAttribute("type", type);
399 thesaurus_list.appendChild(t);
400 }
401 }
402 }
403
404 return true;
405 }
406
407 protected Element getThesaurus(long phrase_num) {
408
409 // look up the phrase in the pdata thingy
410 String record = this.mgpp_src.getDocument(this.basepath+File.separatorChar+"pdata", "Document",
411 phrase_num);
412
413 if (record ==null || record.equals("")) return null;
414
415 // ignore everything up to and including first colon
416 record = record.substring(record.indexOf(':')+1);
417
418 String [] fields = record.split(":");
419 String phrase = fields[0];
420 String tf = fields[1];
421 //String ef = fields[2]; dont use this
422 String df = fields[3];
423
424 Element thesaurus = this.doc.createElement("thesaurus");
425 thesaurus.setAttribute("tf", tf);
426 thesaurus.setAttribute("df", df);
427 thesaurus.setAttribute("id", Long.toString(phrase_num));
428 thesaurus.appendChild(GSXML.createTextElement(this.doc, "phrase", phrase));
429 return thesaurus;
430
431 }
432
433 /** returns an array of two elements - the prefix and the suffix*/
434 protected String [] splitPhraseOnWord(String phrase, String word) {
435
436 if (word.equals("")) {
437
438 String [] res = {phrase, ""};
439 return res;
440 }
441 // use 2 so that we only split on the first occurrance. trailing empty strings should be included
442 String [] result = phrase.split(word, 2);
443 return result;
444
445 }
446
447 protected Element phindError(String message) {
448 Element e = this.doc.createElement("phindError");
449 Text t = this.doc.createTextNode(message);
450 e.appendChild(t);
451 return e;
452 }
453
454}
455
Note: See TracBrowser for help on using the repository browser.