source: trunk/gsdl3/src/java/org/greenstone/gsdl3/service/PhindPhraseBrowse.java@ 6490

Last change on this file since 6490 was 6490, checked in by nzdl, 20 years ago

changed the applet html description to only include the jar files it needs, and codebase dir is now lib, not lib/java

  • Property svn:keywords set to Author Date Id Revision
File size: 14.8 KB
Line 
1/*
2 * PhindPhraseBrowse.java
3 * Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19package org.greenstone.gsdl3.service;
20
21import org.greenstone.gsdl3.util.*;
22
23import org.greenstone.mgpp.*;
24import org.w3c.dom.Document;
25import org.w3c.dom.Node;
26import org.w3c.dom.Element;
27import org.w3c.dom.Text;
28
29import java.util.Vector;
30import java.util.HashMap;
31import java.io.File;
32/**
33 * PhindServices - the phind phrase browsing service
34 *
35 * @author <a href="mailto:[email protected]">Katherine Don</a>
36 * @version $Revision: 6490 $
37 */
38public class PhindPhraseBrowse
39 extends ServiceRack {
40
41 // the services on offer
42 private static final String PHIND_SERVICE = "PhindApplet";
43
44 private MGPPWrapper mgpp_src=null;
45 private String basepath = null;
46
47 private Element applet_description = null;
48
49 public PhindPhraseBrowse() {
50 this.mgpp_src = new MGPPWrapper();
51 // set up the default params
52 this.mgpp_src.setQueryLevel("Document");
53 this.mgpp_src.setReturnLevel("Document");
54 this.mgpp_src.setMaxDocs(5);
55 this.mgpp_src.setStem(false);
56 this.mgpp_src.setCase(true);
57 }
58 /** configure the service module
59 *
60 * @param info a DOM Element containing any config info for the service
61 * @return true if configured
62 */
63 public boolean configure(Element info, Element extra_info) {
64
65 System.out.println("configuring PhindPhraseBrowse");
66
67 // set up short_service_info_ - for now just has name and type
68 Element e = this.doc.createElement(GSXML.SERVICE_ELEM);
69 e.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_APPLET);
70 e.setAttribute(GSXML.NAME_ATT, PHIND_SERVICE);
71 this.short_service_info.appendChild(e);
72
73 // set up the static applet description
74
75 applet_description = this.doc.createElement(GSXML.SERVICE_ELEM);
76 applet_description.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_APPLET);
77 applet_description.setAttribute(GSXML.NAME_ATT, PHIND_SERVICE);
78
79 // add in the applet info for the phind applet
80 // need to make this dynamic - library names etc
81 // change the applet params - have a single param with the library name
82 // this is left blank at this end, and must be filled in by applet action - if the library name is not needed, this param is left out
83 // phindcgi param now is not complete - library must be prepended to it.
84 String app_info = "<"+GSXML.APPLET_ELEM+" CODEBASE='lib' CODE='org.greenstone.applet.phind.Phind.class' ARCHIVE='phind.jar, xercesImpl.jar, xml-apis.jar' WIDTH='500' HEIGHT='400'><PARAM NAME='library' VALUE=''/> <PARAM NAME='phindcgi' VALUE='?";
85 app_info += GSParams.ACTION +"=a&amp;"+GSParams.REQUEST_TYPE +"=r&amp;"+GSParams.SERVICE+"="+PHIND_SERVICE+"&amp;"+GSParams.OUTPUT+"=xml&amp;"+GSParams.RESPONSE_ONLY+"=1'/>";
86 app_info +="<PARAM NAME='collection' VALUE='";
87 app_info += this.cluster_name;
88 app_info += "'/> <PARAM NAME='classifier' VALUE='1'/> <PARAM NAME='orientation' VALUE='vertical'/> <PARAM NAME='depth' VALUE='2'/> <PARAM NAME='resultorder' VALUE='L,l,E,e,D,d'/> <PARAM NAME='backdrop' VALUE='interfaces/default/images/phindbg1.jpg'/><PARAM NAME='fontsize' VALUE='10'/> <PARAM NAME='blocksize' VALUE='10'/>The Phind java applet.</"+GSXML.APPLET_ELEM+">";
89
90 Document dom = this.converter.getDOM(app_info);
91 Element app_elem = dom.getDocumentElement();
92 applet_description.appendChild(this.doc.importNode(app_elem, true));
93
94 return true;
95 }
96
97 protected Element getServiceDescription(String service, String lang, String subset) {
98 if (!service.equals(PHIND_SERVICE)) {
99 return null;
100 }
101 Element describe = (Element) applet_description.cloneNode(true);
102 describe.appendChild(GSXML.createDisplayTextElement(this.doc, GSXML.DISPLAY_TEXT_NAME, getTextString(PHIND_SERVICE+".name", lang)));
103 describe.appendChild(GSXML.createDisplayTextElement(this.doc, GSXML.DISPLAY_TEXT_DESCRIPTION, getTextString(PHIND_SERVICE+".description", lang)));
104 return describe;
105 }
106
107 protected Element processPhindApplet(Element request) {
108
109 Element param_elem = (Element)GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
110 HashMap params = GSXML.extractParams(param_elem, false);
111
112 long first_e = Long.parseLong((String)params.get("pfe"));
113 long last_e = Long.parseLong((String)params.get("ple"));
114 long first_l = Long.parseLong((String)params.get("pfl"));
115 long last_l = Long.parseLong((String)params.get("pll"));
116 long first_d = Long.parseLong((String)params.get("pfd"));
117 long last_d = Long.parseLong((String)params.get("pld"));
118
119 long phrase;
120 String phrase_str = (String)params.get("ppnum");
121 if (phrase_str == null || phrase_str.equals("")) {
122 phrase=0;
123 } else {
124 phrase = Long.parseLong(phrase_str);
125 }
126 String word = (String)params.get("pptext");
127 String phind_index = (String)params.get("pc");
128 // the location of the mgpp database files
129 this.basepath = GSFile.phindBaseDir(this.site_home, this.cluster_name, phind_index);
130
131 // the result element
132 Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
133 result.setAttribute(GSXML.FROM_ATT, PHIND_SERVICE);
134 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
135
136 // applet result info must be in appletInfo element
137 Element applet_data = this.doc.createElement(GSXML.APPLET_DATA_ELEM);
138 result.appendChild(applet_data);
139 Element phind_data = this.doc.createElement("phindData");
140 applet_data.appendChild(phind_data);
141
142
143 // if we dont know the phrase number, look it up
144 if (phrase == 0) {
145 if (word==null || word.equals("")) {
146 Element error = phindError("no word or phrase");
147 phind_data.appendChild(error);
148 return result;
149 }
150 phrase = findPhraseNumberFromWord( word);
151 }
152 if (phrase==0) {
153 // the word is not in the collection
154 // return a phind error string
155 Element error = phindError("the term "+word+" is not in the collection");
156 phind_data.appendChild(error);
157 return result;
158 }
159
160 // get the phrase data into the phind_data node
161 getPhraseData(phind_data, phrase, first_l, last_l,
162 first_e, last_e, first_d, last_d);
163 return result;
164
165
166 }// processPhindApplet
167
168 protected long findPhraseNumberFromWord(String word) {
169
170 // set the mgpp index data - we are looking up pword
171 this.mgpp_src.loadIndexData(this.basepath+File.separatorChar+"pword");
172
173 this.mgpp_src.runQuery(word);
174
175 MGPPQueryResult res = this.mgpp_src.getQueryResult();
176 Vector docs = res.getDocs();
177 if (docs.size()==0) {
178 // phrase not found
179 return 0;
180 }
181 MGPPDocInfo doc = (MGPPDocInfo)docs.firstElement();
182 return doc.num_;
183 }
184
185 protected boolean getPhraseData(Element phind_data,
186 long phrase, long first_l, long last_l,
187 long first_e, long last_e, long first_d,
188 long last_d) {
189
190 String record = this.mgpp_src.getDocument(this.basepath+File.separatorChar+"pdata", "Document",
191 phrase);
192 if (record.equals("")) {
193 Element error = phindError("somethings gone wrong - we haven't got a record for phrase number "+phrase);
194 phind_data.appendChild(error);
195 return false;
196 }
197
198 // parse the record - its in gordons cryptic form
199 // ":word:tf:ef:df:el:dl:lf:ll"
200 // el: e,e,e
201 // dl: d;f,d;f,
202 // lf and ll may be null
203 // l: type,dest, dest; type,dest,dest
204
205 // ignore everything up to and including first colon (has
206 // <Document>3505: at the start)
207 record = record.substring(record.indexOf(':')+1);
208
209 // split on ':'
210 String [] fields = record.split(":");
211 String word = fields[0];
212 String tf = fields[1];
213 String ef = fields[2];
214 String df = fields[3];
215
216
217 String expansions = fields[4];
218 String documents = fields[5];
219 String lf = "0";
220 String linklist = "";
221 if (fields.length > 7) {// have thesaurus stuff
222 lf =fields[6];
223 linklist = fields[7];
224 }
225
226 // the phindData attributes and phrase
227 phind_data.setAttribute("id", Long.toString(phrase));
228 phind_data.setAttribute("df", df);
229 phind_data.setAttribute("ef", ef);
230 phind_data.setAttribute("lf", lf);
231 phind_data.setAttribute("tf", tf);
232 GSXML.createTextElement(this.doc, "phrase", word);
233
234 addExpansionList(phind_data, expansions, word, ef, first_e, last_e);
235 addDocumentList(phind_data, documents, word, df, first_d, last_d);
236 if (!lf.equals("0")) {
237 addThesaurusList(phind_data, linklist, word, lf, first_l, last_l);
238 }
239 return true;
240 }
241
242 protected boolean addExpansionList( Element phind_data, String record,
243 String word,
244 String freq,
245 long first, long last) {
246
247 Element expansion_list = this.doc.createElement("expansionList");
248 phind_data.appendChild(expansion_list);
249 expansion_list.setAttribute("length", freq);
250 expansion_list.setAttribute("start", Long.toString(first));
251 expansion_list.setAttribute("end", Long.toString(last));
252
253 // get the list of strings
254 String [] expansions = record.split(",");
255 int length = expansions.length;
256 if (length < last) last = length;
257 for (long i = first; i < last; i++) {
258 long num = Long.parseLong(expansions[(int)i]);
259 Element expansion = getExpansion( num, word);
260 expansion.setAttribute("num", Long.toString(i));
261 expansion_list.appendChild(expansion);
262 }
263 return true;
264 }
265
266 protected Element getExpansion(long phrase_num,
267 String orig_phrase) {
268
269 // look up the phrase in the pdata thingy
270 String record = this.mgpp_src.getDocument(this.basepath+File.separatorChar+"pdata", "Document",
271 phrase_num);
272
273 if (record ==null || record.equals("")) return null;
274
275 // ignore everything up to and including first colon
276 record = record.substring(record.indexOf(':')+1);
277
278 String [] fields = record.split(":");
279 String phrase = fields[0];
280 String tf = fields[1];
281 //String ef = fields[2]; dont use this
282 String df = fields[3];
283
284 Element expansion = this.doc.createElement("expansion");
285 expansion.setAttribute("tf", tf);
286 expansion.setAttribute("df", df);
287 expansion.setAttribute("id", Long.toString(phrase_num));
288
289 // get teh suffix and prefix
290 String [] ends = splitPhraseOnWord(phrase, orig_phrase);
291 if (!ends[0].equals("")) {
292 expansion.appendChild(GSXML.createTextElement(this.doc, "prefix", ends[0]));
293 }
294 if (!ends[1].equals("")) {
295 expansion.appendChild(GSXML.createTextElement(this.doc, "suffix", ends[1]));
296 }
297
298 return expansion;
299
300 }
301
302 protected boolean addDocumentList(Element phind_data, String record,
303 String word,
304 String freq,
305 long first, long last) {
306
307 Element document_list = this.doc.createElement("documentList");
308 phind_data.appendChild(document_list);
309 document_list.setAttribute("length", freq);
310 document_list.setAttribute("start", Long.toString(first));
311 document_list.setAttribute("end", Long.toString(last));
312
313 // get the list of doc,freq
314 String [] doc_freqs = record.split(";");
315 int length = doc_freqs.length;
316 if (length<last) last=length;
317
318 for (long i = first; i < last; i++) {
319 String doc_elem = doc_freqs[(int)i];
320 int p = doc_elem.indexOf(',');
321 long doc_num;
322 String doc_freq;
323 if (p == -1) { // there is no freq in the record
324 doc_num =Long.parseLong(doc_elem);
325 doc_freq = "1";
326 } else {
327 doc_num = Long.parseLong(doc_elem.substring(0,p));
328 doc_freq = doc_elem.substring(p+1);
329 }
330 Element document = getDocument( doc_num);
331 document.setAttribute("freq", doc_freq);
332 document.setAttribute("num", Long.toString(i));
333 document_list.appendChild(document);
334 }
335
336
337 return true;
338 }
339
340
341 protected Element getDocument(long doc_num) {
342
343 // look up the phrase in the docs thingy
344 String record = this.mgpp_src.getDocument(this.basepath+File.separatorChar+"docs", "Document",
345 doc_num);
346
347 if (record ==null || record.equals("")) return null;
348
349 // ignore everything up to and including first \t
350 record = record.substring(record.indexOf('\t')+1);
351
352 String [] fields = record.split("\t");
353 String hash = fields[0];
354 String title = fields[1];
355
356 Element d = this.doc.createElement("document");
357 d.setAttribute("hash", hash);
358 d.appendChild(GSXML.createTextElement(this.doc, "title", title));
359
360 return d;
361
362 }
363 protected boolean addThesaurusList(Element phind_data, String record,
364 String word,
365 String freq,
366 long first, long last) {
367
368
369 Element thesaurus_list = this.doc.createElement("thesaurusList");
370 phind_data.appendChild(thesaurus_list);
371 thesaurus_list.setAttribute("length", freq);
372 thesaurus_list.setAttribute("start", Long.toString(first));
373 thesaurus_list.setAttribute("end", Long.toString(last));
374
375 // get the list of type,dest,dest
376 String [] links = record.split(";");
377 int length = links.length;
378 long index = 0;
379 for (int i = 0; i < length; i++) { // go through the entries
380 String link_info = links[(int)i];
381 String [] items = link_info.split(",");
382 // the first entry is teh type
383 String type = items[0];
384 for (int j = 1; j<items.length; j++, index++) {
385 if (index >= first && index < last) { // only output the ones we want
386 long phrase = Long.parseLong(items[j]);
387 Element t = getThesaurus(phrase);
388 t.setAttribute("type", type);
389 thesaurus_list.appendChild(t);
390 }
391 }
392 }
393
394 return true;
395 }
396
397 protected Element getThesaurus(long phrase_num) {
398
399 // look up the phrase in the pdata thingy
400 String record = this.mgpp_src.getDocument(this.basepath+File.separatorChar+"pdata", "Document",
401 phrase_num);
402
403 if (record ==null || record.equals("")) return null;
404
405 // ignore everything up to and including first colon
406 record = record.substring(record.indexOf(':')+1);
407
408 String [] fields = record.split(":");
409 String phrase = fields[0];
410 String tf = fields[1];
411 //String ef = fields[2]; dont use this
412 String df = fields[3];
413
414 Element thesaurus = this.doc.createElement("thesaurus");
415 thesaurus.setAttribute("tf", tf);
416 thesaurus.setAttribute("df", df);
417 thesaurus.setAttribute("id", Long.toString(phrase_num));
418 thesaurus.appendChild(GSXML.createTextElement(this.doc, "phrase", phrase));
419 return thesaurus;
420
421 }
422
423 /** returns an array of two elements - the prefix and the suffix*/
424 protected String [] splitPhraseOnWord(String phrase, String word) {
425
426 if (word.equals("")) {
427
428 String [] res = {phrase, ""};
429 return res;
430 }
431 // use 2 so that we only split on the first occurrance. trailing empty strings should be included
432 String [] result = phrase.split(word, 2);
433 return result;
434
435 }
436
437 protected Element phindError(String message) {
438 Element e = this.doc.createElement("phindError");
439 Text t = this.doc.createTextNode(message);
440 e.appendChild(t);
441 return e;
442 }
443
444}
445
Note: See TracBrowser for help on using the repository browser.