source: trunk/gsdl3/src/java/org/greenstone/gsdl3/service/PhindServices.java@ 3567

Last change on this file since 3567 was 3567, checked in by kjdon, 21 years ago

tidied up a bit, using new Dictionary stuff

  • Property svn:keywords set to Author Date Id Revision
File size: 15.0 KB
Line 
1/*
2 * PhindServices.java
3 * Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19package org.greenstone.gsdl3.service;
20
21import org.greenstone.gsdl3.util.*;
22
23import org.greenstone.mgpp.*;
24import org.w3c.dom.Document;
25import org.w3c.dom.Node;
26import org.w3c.dom.Element;
27import org.w3c.dom.Text;
28
29import java.util.Vector;
30import java.util.HashMap;
31/**
32 * PhindServices - the phind phrase browsing service
33 *
34 * @author <a href="mailto:[email protected]">Katherine Don</a>
35 * @version $Revision: 3567 $
36 */
37public class PhindServices
38 extends ServicesImpl {
39
40 // the services on offer
41 private static final String PHIND_SERVICE = "PhindApplet";
42
43 private MGPPWrapper mgpp_src_=null;
44 private String basepath_ = null;
45 public PhindServices() {
46 mgpp_src_ = new MGPPWrapper();
47 // set up the default params
48 mgpp_src_.setQueryLevel("Document");
49 mgpp_src_.setReturnLevel("Document");
50 mgpp_src_.setMaxDocs(5);
51 mgpp_src_.setStem(false);
52 mgpp_src_.setCase(true);
53 }
54 /** configure the service module
55 *
56 * @param info a DOM Element containing any config info for the service
57 * @return true if configured
58 */
59 public boolean configure(Element info) {
60
61 System.out.println("configuring PhindServices");
62
63 // set up short_service_info_ - for now just has name and type
64 Element e = doc_.createElement(GSXML.SERVICE_ELEM);
65 e.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_QUERY);
66 e.setAttribute(GSXML.NAME_ATT, PHIND_SERVICE);
67 short_service_info_.appendChild(e);
68
69 // set up service_info_map_ - we only have one element, and it has
70 // no extra info yet - we are not processing the config info
71 Element f = doc_.createElement(GSXML.SERVICE_ELEM);
72 f.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_QUERY);
73 f.setAttribute(GSXML.NAME_ATT, PHIND_SERVICE);
74
75 // add in the applet info for the phind applet
76 // need to make this dynamic - library names etc
77 // change the applet params - have a single param with the library name
78 // this is left blank at this end, and must be filled in by applet action - if the library name is not needed, this param is left out
79 // phindcgi param now is not complete - library must be prepended to it.
80 String app_info = "<"+GSXML.APPLET_ELEM+" CODEBASE='lib/java' CODE='org.greenstone.applet.phind.Phind.class' ARCHIVE='phind.jar, xercesImpl.jar, gsdl3.jar, jaxp.jar, xml-apis.jar' WIDTH='500' HEIGHT='400'><PARAM NAME='library' VALUE=''/> <PARAM NAME='phindcgi' VALUE='?a=a&amp;sa=r&amp;sn=Phind'/>";
81 app_info +="<PARAM NAME='collection' VALUE='";
82 app_info += cluster_name_;
83 app_info += "'/> <PARAM NAME='classifier' VALUE='1'/> <PARAM NAME='orientation' VALUE='vertical'/> <PARAM NAME='depth' VALUE='2'/> <PARAM NAME='resultorder' VALUE='L,l,E,e,D,d'/> <PARAM NAME='backdrop' VALUE='interfaces/default/images/phindbg1.jpg'/><PARAM NAME='fontsize' VALUE='10'/> <PARAM NAME='blocksize' VALUE='10'/>The Phind java applet.</"+GSXML.APPLET_ELEM+">";
84
85 Document dom = converter_.getDOM(app_info);
86 Element app_elem = dom.getDocumentElement();
87 f.appendChild(doc_.importNode(app_elem, true));
88
89 service_info_map_.put(PHIND_SERVICE, f);
90
91 return true;
92 }
93
94 /** creates a display element containing all the text strings needed to display the service page, in the language specified */
95 protected Element createServiceDisplay(String service, String lang) {
96 Element display = doc_.createElement(GSXML.DISPLAY_ELEM);
97 display.appendChild(GSXML.createTextElement(doc_, GSXML.DISPLAY_NAME_ELEM, getTextString(service+".name", lang)));
98 display.appendChild(GSXML.createTextElement(doc_, GSXML.DISPLAY_SUBMIT_ELEM, getTextString(service+".submit", lang)));
99
100 Element param;
101
102 return display;
103
104 }
105
106 protected Element processService(String name, Element request) {
107
108 if (!name.equals(PHIND_SERVICE)) {
109 System.err.println("PhindServices:you have asked for a non-existant service - "+name+"!");
110 return null;
111 }
112 Element param_elem = (Element)GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
113 HashMap params = GSXML.extractParams(param_elem);
114
115 long first_e = Long.parseLong((String)params.get("pfe"));
116 long last_e = Long.parseLong((String)params.get("ple"));
117 long first_l = Long.parseLong((String)params.get("pfl"));
118 long last_l = Long.parseLong((String)params.get("pll"));
119 long first_d = Long.parseLong((String)params.get("pfd"));
120 long last_d = Long.parseLong((String)params.get("pld"));
121
122 long phrase;
123 String phrase_str = (String)params.get("ppnum");
124 if (phrase_str == null || phrase_str.equals("")) {
125 phrase=0;
126 } else {
127 phrase = Long.parseLong(phrase_str);
128 }
129 String word = (String)params.get("pptext");
130 String phind_index = (String)params.get("pc");
131 // the location of the mgpp database files
132 basepath_ = GSFile.phindBaseDir(site_home_, cluster_name_, phind_index);
133
134 // the result element
135 Element result = doc_.createElement(GSXML.RESPONSE_ELEM);
136 String from = GSPath.appendLink(cluster_name_, "PhindApplet");
137 result.setAttribute(GSXML.FROM_ATT, from);
138 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_QUERY);
139
140 // applet result info must be in appletInfo element
141 Element applet_data = doc_.createElement(GSXML.APPLET_DATA_ELEM);
142 result.appendChild(applet_data);
143 Element phind_data = doc_.createElement("phindData");
144 applet_data.appendChild(phind_data);
145
146
147 // if we dont know the phrase number, look it up
148 if (phrase == 0) {
149 if (word==null || word.equals("")) {
150 Element error = phindError("no word or phrase");
151 phind_data.appendChild(error);
152 return result;
153 }
154 phrase = findPhraseNumberFromWord( word);
155 System.out.println("phind, term number for "+word+" is "+phrase);
156 }
157 if (phrase==0) {
158 // the word is not in the collection
159 // return a phind error string
160 Element error = phindError("the term "+word+" is not in the collection");
161 phind_data.appendChild(error);
162 return result;
163 }
164
165 // get the phrase data into the phind_data node
166 getPhraseData(phind_data, phrase, first_l, last_l,
167 first_e, last_e, first_d, last_d);
168 return result;
169
170
171 }// processService
172
173 protected long findPhraseNumberFromWord(String word) {
174
175 // set the mgpp index data - we are looking up pword
176 mgpp_src_.loadIndexData(basepath_, "pword");
177
178 mgpp_src_.runQuery(word);
179
180 MGPPQueryResult res = mgpp_src_.getQueryResult();
181 Vector docs = res.getDocs();
182 if (docs.size()==0) {
183 // phrase not found
184 return 0;
185 }
186 MGPPDocInfo doc = (MGPPDocInfo)docs.firstElement();
187 return doc.num_;
188 }
189
190 protected boolean getPhraseData(Element phind_data,
191 long phrase, long first_l, long last_l,
192 long first_e, long last_e, long first_d,
193 long last_d) {
194
195 String record = mgpp_src_.getDocument(basepath_, "pdata", "Document",
196 phrase);
197 if (record.equals("")) {
198 Element error = phindError("somethings gone wrong - we haven't got a record for phrase number "+phrase);
199 phind_data.appendChild(error);
200 return false;
201 }
202
203 System.out.println("record="+record);
204 // parse the record - its in gordons cryptic form
205 // ":word:tf:ef:df:el:dl:lf:ll"
206 // el: e,e,e
207 // dl: d;f,d;f,
208 // lf and ll may be null
209 // l: type,dest, dest; type,dest,dest
210
211 // ignore everything up to and including first colon (has
212 // <Document>3505: at the start)
213 record = record.substring(record.indexOf(':')+1);
214
215 // split on ':'
216 String [] fields = record.split(":");
217 String word = fields[0];
218 String tf = fields[1];
219 String ef = fields[2];
220 String df = fields[3];
221
222
223 String expansions = fields[4];
224 String documents = fields[5];
225 String lf = "0";
226 String linklist = "";
227 if (fields.length > 7) {// have thesaurus stuff
228 lf =fields[6];
229 linklist = fields[7];
230 }
231
232 // the phindData attributes and phrase
233 phind_data.setAttribute("id", Long.toString(phrase));
234 phind_data.setAttribute("df", df);
235 phind_data.setAttribute("ef", ef);
236 phind_data.setAttribute("lf", lf);
237 phind_data.setAttribute("tf", tf);
238 GSXML.createTextElement(doc_, "phrase", word);
239
240 addExpansionList(phind_data, expansions, word, ef, first_e, last_e);
241 addDocumentList(phind_data, documents, word, df, first_d, last_d);
242 if (!lf.equals("0")) {
243 System.out.println("adding thesaurus stuff");
244 addThesaurusList(phind_data, linklist, word, lf, first_l, last_l);
245 }
246 return true;
247 }
248
249 protected boolean addExpansionList( Element phind_data, String record,
250 String word,
251 String freq,
252 long first, long last) {
253
254 Element expansion_list = doc_.createElement("expansionList");
255 phind_data.appendChild(expansion_list);
256 expansion_list.setAttribute("length", freq);
257 expansion_list.setAttribute("start", Long.toString(first));
258 expansion_list.setAttribute("end", Long.toString(last));
259
260 // get the list of strings
261 String [] expansions = record.split(",");
262 int length = expansions.length;
263 if (length < last) last = length;
264 for (long i = first; i < last; i++) {
265 long num = Long.parseLong(expansions[(int)i]);
266 Element expansion = getExpansion( num, word);
267 expansion.setAttribute("num", Long.toString(i));
268 expansion_list.appendChild(expansion);
269 }
270 return true;
271 }
272
273 protected Element getExpansion(long phrase_num,
274 String orig_phrase) {
275
276 // look up the phrase in the pdata thingy
277 String record = mgpp_src_.getDocument(basepath_, "pdata", "Document",
278 phrase_num);
279
280 if (record ==null || record.equals("")) return null;
281
282 // ignore everything up to and including first colon
283 record = record.substring(record.indexOf(':')+1);
284
285 String [] fields = record.split(":");
286 String phrase = fields[0];
287 String tf = fields[1];
288 //String ef = fields[2]; dont use this
289 String df = fields[3];
290
291 Element expansion = doc_.createElement("expansion");
292 expansion.setAttribute("tf", tf);
293 expansion.setAttribute("df", df);
294 expansion.setAttribute("id", Long.toString(phrase_num));
295
296 // get teh suffix and prefix
297 String [] ends = splitPhraseOnWord(phrase, orig_phrase);
298 if (!ends[0].equals("")) {
299 expansion.appendChild(GSXML.createTextElement(doc_, "prefix", ends[0]));
300 }
301 if (!ends[1].equals("")) {
302 expansion.appendChild(GSXML.createTextElement(doc_, "suffix", ends[1]));
303 }
304
305 return expansion;
306
307 }
308
309 protected boolean addDocumentList(Element phind_data, String record,
310 String word,
311 String freq,
312 long first, long last) {
313
314 Element document_list = doc_.createElement("documentList");
315 phind_data.appendChild(document_list);
316 document_list.setAttribute("length", freq);
317 document_list.setAttribute("start", Long.toString(first));
318 document_list.setAttribute("end", Long.toString(last));
319
320 // get the list of doc,freq
321 String [] doc_freqs = record.split(";");
322 int length = doc_freqs.length;
323 if (length<last) last=length;
324
325 for (long i = first; i < last; i++) {
326 String doc_elem = doc_freqs[(int)i];
327 int p = doc_elem.indexOf(',');
328 long doc_num;
329 String doc_freq;
330 if (p == -1) { // there is no freq in the record
331 doc_num =Long.parseLong(doc_elem);
332 doc_freq = "1";
333 } else {
334 doc_num = Long.parseLong(doc_elem.substring(0,p));
335 doc_freq = doc_elem.substring(p+1);
336 }
337 Element document = getDocument( doc_num);
338 document.setAttribute("freq", doc_freq);
339 document.setAttribute("num", Long.toString(i));
340 document_list.appendChild(document);
341 }
342
343
344 return true;
345 }
346
347
348 protected Element getDocument(long doc_num) {
349
350 // look up the phrase in the docs thingy
351 String record = mgpp_src_.getDocument(basepath_, "docs", "Document",
352 doc_num);
353
354 if (record ==null || record.equals("")) return null;
355 System.out.println("doc record:"+record);
356
357 // ignore everything up to and including first \t
358 record = record.substring(record.indexOf('\t')+1);
359
360 String [] fields = record.split("\t");
361 String hash = fields[0];
362 String title = fields[1];
363
364 Element d = doc_.createElement("document");
365 d.setAttribute("hash", hash);
366 d.appendChild(GSXML.createTextElement(doc_, "title", title));
367
368 return d;
369
370 }
371 protected boolean addThesaurusList(Element phind_data, String record,
372 String word,
373 String freq,
374 long first, long last) {
375
376
377 Element thesaurus_list = doc_.createElement("thesaurusList");
378 phind_data.appendChild(thesaurus_list);
379 thesaurus_list.setAttribute("length", freq);
380 thesaurus_list.setAttribute("start", Long.toString(first));
381 thesaurus_list.setAttribute("end", Long.toString(last));
382
383 System.out.println("record for thesaurus="+record);
384
385 // get the list of type,dest,dest
386 String [] links = record.split(";");
387 int length = links.length;
388 long index = 0;
389 for (int i = 0; i < length; i++) { // go through the entries
390 String link_info = links[(int)i];
391 String [] items = link_info.split(",");
392 // the first entry is teh type
393 String type = items[0];
394 for (int j = 1; j<items.length; j++, index++) {
395 if (index >= first && index < last) { // only output the ones we want
396 long phrase = Long.parseLong(items[j]);
397 Element t = getThesaurus(phrase);
398 t.setAttribute("type", type);
399 thesaurus_list.appendChild(t);
400 }
401 }
402 }
403
404 return true;
405 }
406
407 protected Element getThesaurus(long phrase_num) {
408
409 // look up the phrase in the pdata thingy
410 String record = mgpp_src_.getDocument(basepath_, "pdata", "Document",
411 phrase_num);
412
413 if (record ==null || record.equals("")) return null;
414
415 // ignore everything up to and including first colon
416 record = record.substring(record.indexOf(':')+1);
417
418 String [] fields = record.split(":");
419 String phrase = fields[0];
420 String tf = fields[1];
421 //String ef = fields[2]; dont use this
422 String df = fields[3];
423
424 Element thesaurus = doc_.createElement("thesaurus");
425 thesaurus.setAttribute("tf", tf);
426 thesaurus.setAttribute("df", df);
427 thesaurus.setAttribute("id", Long.toString(phrase_num));
428 thesaurus.appendChild(GSXML.createTextElement(doc_, "phrase", phrase));
429 return thesaurus;
430
431 }
432
433 /** returns an array of two elements - the prefix and the suffix*/
434 protected String [] splitPhraseOnWord(String phrase, String word) {
435
436 if (word.equals("")) {
437
438 String [] res = {phrase, ""};
439 return res;
440 }
441 // use 2 so that we only split on the first occurrance. trailing empty strings should be included
442 String [] result = phrase.split(word, 2);
443 if (result.length !=2) {
444 System.out.println("didn't get two substrings!!");
445 }
446 return result;
447
448 }
449
450 protected Element phindError(String message) {
451 Element e = doc_.createElement("phindError");
452 Text t = doc_.createTextNode(message);
453 e.appendChild(t);
454 return e;
455 }
456
457}
458
Note: See TracBrowser for help on using the repository browser.