source: trunk/gsdl3/src/java/org/greenstone/gsdl3/service/PhindPhraseBrowse.java@ 4246

Last change on this file since 4246 was 4246, checked in by kjdon, 21 years ago

added response only arg to the phind url

  • Property svn:keywords set to Author Date Id Revision
File size: 14.7 KB
Line 
1/*
2 * PhindPhraseBrowse.java
3 * Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19package org.greenstone.gsdl3.service;
20
21import org.greenstone.gsdl3.util.*;
22
23import org.greenstone.mgpp.*;
24import org.w3c.dom.Document;
25import org.w3c.dom.Node;
26import org.w3c.dom.Element;
27import org.w3c.dom.Text;
28
29import java.util.Vector;
30import java.util.HashMap;
31import java.io.File;
32/**
33 * PhindPhraseBrowse - the phind phrase browsing service
34 *
35 * @author <a href="mailto:[email protected]">Katherine Don</a>
36 * @version $Revision: 4246 $
37 */
38public class PhindPhraseBrowse
39 extends ServiceRack {
40
41 // the services on offer
42 private static final String PHIND_SERVICE = "PhindApplet";
43
44 private MGPPWrapper mgpp_src_=null;
45 private String basepath_ = null;
/**
 * Creates the service rack together with its MGPP wrapper.
 * The wrapper is preset with the default parameters: "Document" as both
 * query level and return level, a maximum of 5 docs, setStem(false) and
 * setCase(true).
 */
public PhindPhraseBrowse() {
    MGPPWrapper wrapper = new MGPPWrapper();
    // default params
    wrapper.setQueryLevel("Document");
    wrapper.setReturnLevel("Document");
    wrapper.setMaxDocs(5);
    wrapper.setStem(false);
    wrapper.setCase(true);
    mgpp_src_ = wrapper;
}
55 /** configure the service module
56 *
57 * @param info a DOM Element containing any config info for the service
58 * @return true if configured
59 */
60 public boolean configure(Element info, Element extra_info) {
61
62 System.out.println("configuring PhindPhraseBrowse");
63
64 // set up short_service_info_ - for now just has name and type
65 Element e = doc_.createElement(GSXML.SERVICE_ELEM);
66 e.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_APPLET);
67 e.setAttribute(GSXML.NAME_ATT, PHIND_SERVICE);
68 short_service_info_.appendChild(e);
69
70 // set up service_info_map_ - we only have one element, and it has
71 // no extra info yet - we are not processing the config info
72 Element f = doc_.createElement(GSXML.SERVICE_ELEM);
73 f.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_APPLET);
74 f.setAttribute(GSXML.NAME_ATT, PHIND_SERVICE);
75
76 // add in the applet info for the phind applet
77 // need to make this dynamic - library names etc
78 // change the applet params - have a single param with the library name
79 // this is left blank at this end, and must be filled in by applet action - if the library name is not needed, this param is left out
80 // phindcgi param now is not complete - library must be prepended to it.
81 String app_info = "<"+GSXML.APPLET_ELEM+" CODEBASE='lib/java' CODE='org.greenstone.applet.phind.Phind.class' ARCHIVE='phind.jar, xercesImpl.jar, gsdl3.jar, jaxp.jar, xml-apis.jar' WIDTH='500' HEIGHT='400'><PARAM NAME='library' VALUE=''/> <PARAM NAME='phindcgi' VALUE='?";
82 app_info += GSCGI.ACTION_ARG +"=a&amp;"+GSCGI.REQUEST_TYPE_ARG +"=r&amp;"+GSCGI.SERVICE_ARG+"="+PHIND_SERVICE+"&amp;"+GSCGI.OUTPUT_ARG+"=xml&amp;"+GSCGI.RESPONSE_ONLY_ARG+"=1'/>";
83 app_info +="<PARAM NAME='collection' VALUE='";
84 app_info += cluster_name_;
85 app_info += "'/> <PARAM NAME='classifier' VALUE='1'/> <PARAM NAME='orientation' VALUE='vertical'/> <PARAM NAME='depth' VALUE='2'/> <PARAM NAME='resultorder' VALUE='L,l,E,e,D,d'/> <PARAM NAME='backdrop' VALUE='interfaces/default/images/phindbg1.jpg'/><PARAM NAME='fontsize' VALUE='10'/> <PARAM NAME='blocksize' VALUE='10'/>The Phind java applet.</"+GSXML.APPLET_ELEM+">";
86
87 Document dom = converter_.getDOM(app_info);
88 Element app_elem = dom.getDocumentElement();
89 f.appendChild(doc_.importNode(app_elem, true));
90
91 service_info_map_.put(PHIND_SERVICE, f);
92
93 return true;
94 }
95
96 /** creates a display element containing all the text strings needed to display the service page, in the language specified */
97 protected Element createServiceDisplay(String service, String lang) {
98 Element display = doc_.createElement(GSXML.DISPLAY_ELEM);
99 display.appendChild(GSXML.createTextElement(doc_, GSXML.DISPLAY_NAME_ELEM, getTextString(service+".name", lang)));
100 //display.appendChild(GSXML.createTextElement(doc_, GSXML.DISPLAY_SUBMIT_ELEM, getTextString(service+".submit", lang)));
101
102 return display;
103
104 }
105
106 protected Element processPhindApplet(Element request) {
107
108 Element param_elem = (Element)GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
109 HashMap params = GSXML.extractParams(param_elem, false);
110
111 long first_e = Long.parseLong((String)params.get("pfe"));
112 long last_e = Long.parseLong((String)params.get("ple"));
113 long first_l = Long.parseLong((String)params.get("pfl"));
114 long last_l = Long.parseLong((String)params.get("pll"));
115 long first_d = Long.parseLong((String)params.get("pfd"));
116 long last_d = Long.parseLong((String)params.get("pld"));
117
118 long phrase;
119 String phrase_str = (String)params.get("ppnum");
120 if (phrase_str == null || phrase_str.equals("")) {
121 phrase=0;
122 } else {
123 phrase = Long.parseLong(phrase_str);
124 }
125 String word = (String)params.get("pptext");
126 String phind_index = (String)params.get("pc");
127 // the location of the mgpp database files
128 basepath_ = GSFile.phindBaseDir(site_home_, cluster_name_, phind_index);
129
130 // the result element
131 Element result = doc_.createElement(GSXML.RESPONSE_ELEM);
132 result.setAttribute(GSXML.FROM_ATT, PHIND_SERVICE);
133 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_QUERY);
134
135 // applet result info must be in appletInfo element
136 Element applet_data = doc_.createElement(GSXML.APPLET_DATA_ELEM);
137 result.appendChild(applet_data);
138 Element phind_data = doc_.createElement("phindData");
139 applet_data.appendChild(phind_data);
140
141
142 // if we dont know the phrase number, look it up
143 if (phrase == 0) {
144 if (word==null || word.equals("")) {
145 Element error = phindError("no word or phrase");
146 phind_data.appendChild(error);
147 return result;
148 }
149 phrase = findPhraseNumberFromWord( word);
150 }
151 if (phrase==0) {
152 // the word is not in the collection
153 // return a phind error string
154 Element error = phindError("the term "+word+" is not in the collection");
155 phind_data.appendChild(error);
156 return result;
157 }
158
159 // get the phrase data into the phind_data node
160 getPhraseData(phind_data, phrase, first_l, last_l,
161 first_e, last_e, first_d, last_d);
162 return result;
163
164
165 }// processPhindApplet
166
167 protected long findPhraseNumberFromWord(String word) {
168
169 // set the mgpp index data - we are looking up pword
170 mgpp_src_.loadIndexData(basepath_+File.separatorChar+"pword");
171
172 mgpp_src_.runQuery(word);
173
174 MGPPQueryResult res = mgpp_src_.getQueryResult();
175 Vector docs = res.getDocs();
176 if (docs.size()==0) {
177 // phrase not found
178 return 0;
179 }
180 MGPPDocInfo doc = (MGPPDocInfo)docs.firstElement();
181 return doc.num_;
182 }
183
184 protected boolean getPhraseData(Element phind_data,
185 long phrase, long first_l, long last_l,
186 long first_e, long last_e, long first_d,
187 long last_d) {
188
189 String record = mgpp_src_.getDocument(basepath_+File.separatorChar+"pdata", "Document",
190 phrase);
191 if (record.equals("")) {
192 Element error = phindError("somethings gone wrong - we haven't got a record for phrase number "+phrase);
193 phind_data.appendChild(error);
194 return false;
195 }
196
197 // parse the record - its in gordons cryptic form
198 // ":word:tf:ef:df:el:dl:lf:ll"
199 // el: e,e,e
200 // dl: d;f,d;f,
201 // lf and ll may be null
202 // l: type,dest, dest; type,dest,dest
203
204 // ignore everything up to and including first colon (has
205 // <Document>3505: at the start)
206 record = record.substring(record.indexOf(':')+1);
207
208 // split on ':'
209 String [] fields = record.split(":");
210 String word = fields[0];
211 String tf = fields[1];
212 String ef = fields[2];
213 String df = fields[3];
214
215
216 String expansions = fields[4];
217 String documents = fields[5];
218 String lf = "0";
219 String linklist = "";
220 if (fields.length > 7) {// have thesaurus stuff
221 lf =fields[6];
222 linklist = fields[7];
223 }
224
225 // the phindData attributes and phrase
226 phind_data.setAttribute("id", Long.toString(phrase));
227 phind_data.setAttribute("df", df);
228 phind_data.setAttribute("ef", ef);
229 phind_data.setAttribute("lf", lf);
230 phind_data.setAttribute("tf", tf);
231 GSXML.createTextElement(doc_, "phrase", word);
232
233 addExpansionList(phind_data, expansions, word, ef, first_e, last_e);
234 addDocumentList(phind_data, documents, word, df, first_d, last_d);
235 if (!lf.equals("0")) {
236 addThesaurusList(phind_data, linklist, word, lf, first_l, last_l);
237 }
238 return true;
239 }
240
241 protected boolean addExpansionList( Element phind_data, String record,
242 String word,
243 String freq,
244 long first, long last) {
245
246 Element expansion_list = doc_.createElement("expansionList");
247 phind_data.appendChild(expansion_list);
248 expansion_list.setAttribute("length", freq);
249 expansion_list.setAttribute("start", Long.toString(first));
250 expansion_list.setAttribute("end", Long.toString(last));
251
252 // get the list of strings
253 String [] expansions = record.split(",");
254 int length = expansions.length;
255 if (length < last) last = length;
256 for (long i = first; i < last; i++) {
257 long num = Long.parseLong(expansions[(int)i]);
258 Element expansion = getExpansion( num, word);
259 expansion.setAttribute("num", Long.toString(i));
260 expansion_list.appendChild(expansion);
261 }
262 return true;
263 }
264
265 protected Element getExpansion(long phrase_num,
266 String orig_phrase) {
267
268 // look up the phrase in the pdata thingy
269 String record = mgpp_src_.getDocument(basepath_+File.separatorChar+"pdata", "Document",
270 phrase_num);
271
272 if (record ==null || record.equals("")) return null;
273
274 // ignore everything up to and including first colon
275 record = record.substring(record.indexOf(':')+1);
276
277 String [] fields = record.split(":");
278 String phrase = fields[0];
279 String tf = fields[1];
280 //String ef = fields[2]; dont use this
281 String df = fields[3];
282
283 Element expansion = doc_.createElement("expansion");
284 expansion.setAttribute("tf", tf);
285 expansion.setAttribute("df", df);
286 expansion.setAttribute("id", Long.toString(phrase_num));
287
288 // get teh suffix and prefix
289 String [] ends = splitPhraseOnWord(phrase, orig_phrase);
290 if (!ends[0].equals("")) {
291 expansion.appendChild(GSXML.createTextElement(doc_, "prefix", ends[0]));
292 }
293 if (!ends[1].equals("")) {
294 expansion.appendChild(GSXML.createTextElement(doc_, "suffix", ends[1]));
295 }
296
297 return expansion;
298
299 }
300
301 protected boolean addDocumentList(Element phind_data, String record,
302 String word,
303 String freq,
304 long first, long last) {
305
306 Element document_list = doc_.createElement("documentList");
307 phind_data.appendChild(document_list);
308 document_list.setAttribute("length", freq);
309 document_list.setAttribute("start", Long.toString(first));
310 document_list.setAttribute("end", Long.toString(last));
311
312 // get the list of doc,freq
313 String [] doc_freqs = record.split(";");
314 int length = doc_freqs.length;
315 if (length<last) last=length;
316
317 for (long i = first; i < last; i++) {
318 String doc_elem = doc_freqs[(int)i];
319 int p = doc_elem.indexOf(',');
320 long doc_num;
321 String doc_freq;
322 if (p == -1) { // there is no freq in the record
323 doc_num =Long.parseLong(doc_elem);
324 doc_freq = "1";
325 } else {
326 doc_num = Long.parseLong(doc_elem.substring(0,p));
327 doc_freq = doc_elem.substring(p+1);
328 }
329 Element document = getDocument( doc_num);
330 document.setAttribute("freq", doc_freq);
331 document.setAttribute("num", Long.toString(i));
332 document_list.appendChild(document);
333 }
334
335
336 return true;
337 }
338
339
340 protected Element getDocument(long doc_num) {
341
342 // look up the phrase in the docs thingy
343 String record = mgpp_src_.getDocument(basepath_+File.separatorChar+"docs", "Document",
344 doc_num);
345
346 if (record ==null || record.equals("")) return null;
347
348 // ignore everything up to and including first \t
349 record = record.substring(record.indexOf('\t')+1);
350
351 String [] fields = record.split("\t");
352 String hash = fields[0];
353 String title = fields[1];
354
355 Element d = doc_.createElement("document");
356 d.setAttribute("hash", hash);
357 d.appendChild(GSXML.createTextElement(doc_, "title", title));
358
359 return d;
360
361 }
362 protected boolean addThesaurusList(Element phind_data, String record,
363 String word,
364 String freq,
365 long first, long last) {
366
367
368 Element thesaurus_list = doc_.createElement("thesaurusList");
369 phind_data.appendChild(thesaurus_list);
370 thesaurus_list.setAttribute("length", freq);
371 thesaurus_list.setAttribute("start", Long.toString(first));
372 thesaurus_list.setAttribute("end", Long.toString(last));
373
374 // get the list of type,dest,dest
375 String [] links = record.split(";");
376 int length = links.length;
377 long index = 0;
378 for (int i = 0; i < length; i++) { // go through the entries
379 String link_info = links[(int)i];
380 String [] items = link_info.split(",");
381 // the first entry is teh type
382 String type = items[0];
383 for (int j = 1; j<items.length; j++, index++) {
384 if (index >= first && index < last) { // only output the ones we want
385 long phrase = Long.parseLong(items[j]);
386 Element t = getThesaurus(phrase);
387 t.setAttribute("type", type);
388 thesaurus_list.appendChild(t);
389 }
390 }
391 }
392
393 return true;
394 }
395
396 protected Element getThesaurus(long phrase_num) {
397
398 // look up the phrase in the pdata thingy
399 String record = mgpp_src_.getDocument(basepath_+File.separatorChar+"pdata", "Document",
400 phrase_num);
401
402 if (record ==null || record.equals("")) return null;
403
404 // ignore everything up to and including first colon
405 record = record.substring(record.indexOf(':')+1);
406
407 String [] fields = record.split(":");
408 String phrase = fields[0];
409 String tf = fields[1];
410 //String ef = fields[2]; dont use this
411 String df = fields[3];
412
413 Element thesaurus = doc_.createElement("thesaurus");
414 thesaurus.setAttribute("tf", tf);
415 thesaurus.setAttribute("df", df);
416 thesaurus.setAttribute("id", Long.toString(phrase_num));
417 thesaurus.appendChild(GSXML.createTextElement(doc_, "phrase", phrase));
418 return thesaurus;
419
420 }
421
422 /** returns an array of two elements - the prefix and the suffix*/
423 protected String [] splitPhraseOnWord(String phrase, String word) {
424
425 if (word.equals("")) {
426
427 String [] res = {phrase, ""};
428 return res;
429 }
430 // use 2 so that we only split on the first occurrance. trailing empty strings should be included
431 String [] result = phrase.split(word, 2);
432 return result;
433
434 }
435
436 protected Element phindError(String message) {
437 Element e = doc_.createElement("phindError");
438 Text t = doc_.createTextNode(message);
439 e.appendChild(t);
440 return e;
441 }
442
443}
444
Note: See TracBrowser for help on using the repository browser.