source: trunk/gsdl3/src/java/org/greenstone/gsdl3/service/PhindPhraseBrowse.java@ 4012

Last change on this file since 4012 was 3991, checked in by kjdon, 21 years ago

mgpp wrapper now needs to have an empty basepath, so changed all the loadIndex and getDocument calls

  • Property svn:keywords set to Author Date Id Revision
File size: 14.7 KB
Line 
1/*
2 * PhindPhraseBrowse.java
3 * Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19package org.greenstone.gsdl3.service;
20
21import org.greenstone.gsdl3.util.*;
22
23import org.greenstone.mgpp.*;
24import org.w3c.dom.Document;
25import org.w3c.dom.Node;
26import org.w3c.dom.Element;
27import org.w3c.dom.Text;
28
29import java.util.Vector;
30import java.util.HashMap;
31import java.io.File;
32/**
33 * PhindPhraseBrowse - the phind phrase browsing service
34 *
35 * @author <a href="mailto:[email protected]">Katherine Don</a>
36 * @version $Revision: 3991 $
37 */
38public class PhindPhraseBrowse
39 extends ServiceRack {
40
// the services on offer
private static final String PHIND_SERVICE = "PhindApplet";

// wrapper around the mgpp library; every index query and record lookup in
// this class goes through it
private MGPPWrapper mgpp_src_=null;
// location of the phind mgpp database files; assigned by processPhindApplet
// for each request and then read by the lookup methods
private String basepath_ = null;
/**
 * Creates the service and its mgpp wrapper, applying the default
 * query parameters used for phind lookups.
 */
public PhindPhraseBrowse() {
    MGPPWrapper wrapper = new MGPPWrapper();

    // the default params: query and return whole documents, at most 5 hits,
    // no stemming, case setting switched on
    wrapper.setQueryLevel("Document");
    wrapper.setReturnLevel("Document");
    wrapper.setMaxDocs(5);
    wrapper.setStem(false);
    wrapper.setCase(true);

    this.mgpp_src_ = wrapper;
}
55 /** configure the service module
56 *
57 * @param info a DOM Element containing any config info for the service
58 * @return true if configured
59 */
60 public boolean configure(Element info, Element extra_info) {
61
62 System.out.println("configuring PhindPhraseBrowse");
63
64 // set up short_service_info_ - for now just has name and type
65 Element e = doc_.createElement(GSXML.SERVICE_ELEM);
66 e.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_APPLET);
67 e.setAttribute(GSXML.NAME_ATT, PHIND_SERVICE);
68 short_service_info_.appendChild(e);
69
70 // set up service_info_map_ - we only have one element, and it has
71 // no extra info yet - we are not processing the config info
72 Element f = doc_.createElement(GSXML.SERVICE_ELEM);
73 f.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_APPLET);
74 f.setAttribute(GSXML.NAME_ATT, PHIND_SERVICE);
75
76 // add in the applet info for the phind applet
77 // need to make this dynamic - library names etc
78 // change the applet params - have a single param with the library name
79 // this is left blank at this end, and must be filled in by applet action - if the library name is not needed, this param is left out
80 // phindcgi param now is not complete - library must be prepended to it.
81 String app_info = "<"+GSXML.APPLET_ELEM+" CODEBASE='lib/java' CODE='org.greenstone.applet.phind.Phind.class' ARCHIVE='phind.jar, xercesImpl.jar, gsdl3.jar, jaxp.jar, xml-apis.jar' WIDTH='500' HEIGHT='400'><PARAM NAME='library' VALUE=''/> <PARAM NAME='phindcgi' VALUE='?";
82 app_info += GSCGI.ACTION_ARG +"=a&amp;"+GSCGI.REQUEST_TYPE_ARG +"=r&amp;"+GSCGI.SERVICE_ARG+"="+PHIND_SERVICE+"&amp;"+GSCGI.OUTPUT_ARG+"=xml'/>";
83 app_info +="<PARAM NAME='collection' VALUE='";
84 app_info += cluster_name_;
85 app_info += "'/> <PARAM NAME='classifier' VALUE='1'/> <PARAM NAME='orientation' VALUE='vertical'/> <PARAM NAME='depth' VALUE='2'/> <PARAM NAME='resultorder' VALUE='L,l,E,e,D,d'/> <PARAM NAME='backdrop' VALUE='interfaces/default/images/phindbg1.jpg'/><PARAM NAME='fontsize' VALUE='10'/> <PARAM NAME='blocksize' VALUE='10'/>The Phind java applet.</"+GSXML.APPLET_ELEM+">";
86
87 Document dom = converter_.getDOM(app_info);
88 Element app_elem = dom.getDocumentElement();
89 f.appendChild(doc_.importNode(app_elem, true));
90
91 service_info_map_.put(PHIND_SERVICE, f);
92
93 return true;
94 }
95
96 /** creates a display element containing all the text strings needed to display the service page, in the language specified */
97 protected Element createServiceDisplay(String service, String lang) {
98 Element display = doc_.createElement(GSXML.DISPLAY_ELEM);
99 display.appendChild(GSXML.createTextElement(doc_, GSXML.DISPLAY_NAME_ELEM, getTextString(service+".name", lang)));
100 //display.appendChild(GSXML.createTextElement(doc_, GSXML.DISPLAY_SUBMIT_ELEM, getTextString(service+".submit", lang)));
101
102 return display;
103
104 }
105
106 protected Element processPhindApplet(Element request) {
107
108 Element param_elem = (Element)GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
109 HashMap params = GSXML.extractParams(param_elem, false);
110
111 long first_e = Long.parseLong((String)params.get("pfe"));
112 long last_e = Long.parseLong((String)params.get("ple"));
113 long first_l = Long.parseLong((String)params.get("pfl"));
114 long last_l = Long.parseLong((String)params.get("pll"));
115 long first_d = Long.parseLong((String)params.get("pfd"));
116 long last_d = Long.parseLong((String)params.get("pld"));
117
118 long phrase;
119 String phrase_str = (String)params.get("ppnum");
120 if (phrase_str == null || phrase_str.equals("")) {
121 phrase=0;
122 } else {
123 phrase = Long.parseLong(phrase_str);
124 }
125 String word = (String)params.get("pptext");
126 String phind_index = (String)params.get("pc");
127 // the location of the mgpp database files
128 basepath_ = GSFile.phindBaseDir(site_home_, cluster_name_, phind_index);
129
130 // the result element
131 Element result = doc_.createElement(GSXML.RESPONSE_ELEM);
132 result.setAttribute(GSXML.FROM_ATT, PHIND_SERVICE);
133 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_QUERY);
134
135 // applet result info must be in appletInfo element
136 Element applet_data = doc_.createElement(GSXML.APPLET_DATA_ELEM);
137 result.appendChild(applet_data);
138 Element phind_data = doc_.createElement("phindData");
139 applet_data.appendChild(phind_data);
140
141
142 // if we dont know the phrase number, look it up
143 if (phrase == 0) {
144 if (word==null || word.equals("")) {
145 Element error = phindError("no word or phrase");
146 phind_data.appendChild(error);
147 return result;
148 }
149 phrase = findPhraseNumberFromWord( word);
150 }
151 if (phrase==0) {
152 // the word is not in the collection
153 // return a phind error string
154 Element error = phindError("the term "+word+" is not in the collection");
155 phind_data.appendChild(error);
156 return result;
157 }
158
159 // get the phrase data into the phind_data node
160 getPhraseData(phind_data, phrase, first_l, last_l,
161 first_e, last_e, first_d, last_d);
162 return result;
163
164
165 }// processPhindApplet
166
167 protected long findPhraseNumberFromWord(String word) {
168
169 // set the mgpp index data - we are looking up pword
170 mgpp_src_.loadIndexData("", basepath_+File.separatorChar+"pword");
171
172 mgpp_src_.runQuery(word);
173
174 MGPPQueryResult res = mgpp_src_.getQueryResult();
175 Vector docs = res.getDocs();
176 if (docs.size()==0) {
177 // phrase not found
178 return 0;
179 }
180 MGPPDocInfo doc = (MGPPDocInfo)docs.firstElement();
181 return doc.num_;
182 }
183
184 protected boolean getPhraseData(Element phind_data,
185 long phrase, long first_l, long last_l,
186 long first_e, long last_e, long first_d,
187 long last_d) {
188
189 String record = mgpp_src_.getDocument("", basepath_+File.separatorChar+"pdata", "Document",
190 phrase);
191 if (record.equals("")) {
192 Element error = phindError("somethings gone wrong - we haven't got a record for phrase number "+phrase);
193 phind_data.appendChild(error);
194 return false;
195 }
196
197 // parse the record - its in gordons cryptic form
198 // ":word:tf:ef:df:el:dl:lf:ll"
199 // el: e,e,e
200 // dl: d;f,d;f,
201 // lf and ll may be null
202 // l: type,dest, dest; type,dest,dest
203
204 // ignore everything up to and including first colon (has
205 // <Document>3505: at the start)
206 record = record.substring(record.indexOf(':')+1);
207
208 // split on ':'
209 String [] fields = record.split(":");
210 String word = fields[0];
211 String tf = fields[1];
212 String ef = fields[2];
213 String df = fields[3];
214
215
216 String expansions = fields[4];
217 String documents = fields[5];
218 String lf = "0";
219 String linklist = "";
220 if (fields.length > 7) {// have thesaurus stuff
221 lf =fields[6];
222 linklist = fields[7];
223 }
224
225 // the phindData attributes and phrase
226 phind_data.setAttribute("id", Long.toString(phrase));
227 phind_data.setAttribute("df", df);
228 phind_data.setAttribute("ef", ef);
229 phind_data.setAttribute("lf", lf);
230 phind_data.setAttribute("tf", tf);
231 GSXML.createTextElement(doc_, "phrase", word);
232
233 addExpansionList(phind_data, expansions, word, ef, first_e, last_e);
234 addDocumentList(phind_data, documents, word, df, first_d, last_d);
235 if (!lf.equals("0")) {
236 addThesaurusList(phind_data, linklist, word, lf, first_l, last_l);
237 }
238 return true;
239 }
240
241 protected boolean addExpansionList( Element phind_data, String record,
242 String word,
243 String freq,
244 long first, long last) {
245
246 Element expansion_list = doc_.createElement("expansionList");
247 phind_data.appendChild(expansion_list);
248 expansion_list.setAttribute("length", freq);
249 expansion_list.setAttribute("start", Long.toString(first));
250 expansion_list.setAttribute("end", Long.toString(last));
251
252 // get the list of strings
253 String [] expansions = record.split(",");
254 int length = expansions.length;
255 if (length < last) last = length;
256 for (long i = first; i < last; i++) {
257 long num = Long.parseLong(expansions[(int)i]);
258 Element expansion = getExpansion( num, word);
259 expansion.setAttribute("num", Long.toString(i));
260 expansion_list.appendChild(expansion);
261 }
262 return true;
263 }
264
265 protected Element getExpansion(long phrase_num,
266 String orig_phrase) {
267
268 // look up the phrase in the pdata thingy
269 String record = mgpp_src_.getDocument("", basepath_+File.separatorChar+"pdata", "Document",
270 phrase_num);
271
272 if (record ==null || record.equals("")) return null;
273
274 // ignore everything up to and including first colon
275 record = record.substring(record.indexOf(':')+1);
276
277 String [] fields = record.split(":");
278 String phrase = fields[0];
279 String tf = fields[1];
280 //String ef = fields[2]; dont use this
281 String df = fields[3];
282
283 Element expansion = doc_.createElement("expansion");
284 expansion.setAttribute("tf", tf);
285 expansion.setAttribute("df", df);
286 expansion.setAttribute("id", Long.toString(phrase_num));
287
288 // get teh suffix and prefix
289 String [] ends = splitPhraseOnWord(phrase, orig_phrase);
290 if (!ends[0].equals("")) {
291 expansion.appendChild(GSXML.createTextElement(doc_, "prefix", ends[0]));
292 }
293 if (!ends[1].equals("")) {
294 expansion.appendChild(GSXML.createTextElement(doc_, "suffix", ends[1]));
295 }
296
297 return expansion;
298
299 }
300
301 protected boolean addDocumentList(Element phind_data, String record,
302 String word,
303 String freq,
304 long first, long last) {
305
306 Element document_list = doc_.createElement("documentList");
307 phind_data.appendChild(document_list);
308 document_list.setAttribute("length", freq);
309 document_list.setAttribute("start", Long.toString(first));
310 document_list.setAttribute("end", Long.toString(last));
311
312 // get the list of doc,freq
313 String [] doc_freqs = record.split(";");
314 int length = doc_freqs.length;
315 if (length<last) last=length;
316
317 for (long i = first; i < last; i++) {
318 String doc_elem = doc_freqs[(int)i];
319 int p = doc_elem.indexOf(',');
320 long doc_num;
321 String doc_freq;
322 if (p == -1) { // there is no freq in the record
323 doc_num =Long.parseLong(doc_elem);
324 doc_freq = "1";
325 } else {
326 doc_num = Long.parseLong(doc_elem.substring(0,p));
327 doc_freq = doc_elem.substring(p+1);
328 }
329 Element document = getDocument( doc_num);
330 document.setAttribute("freq", doc_freq);
331 document.setAttribute("num", Long.toString(i));
332 document_list.appendChild(document);
333 }
334
335
336 return true;
337 }
338
339
340 protected Element getDocument(long doc_num) {
341
342 // look up the phrase in the docs thingy
343 String record = mgpp_src_.getDocument("", basepath_+File.separatorChar+"docs", "Document",
344 doc_num);
345
346 if (record ==null || record.equals("")) return null;
347
348 // ignore everything up to and including first \t
349 record = record.substring(record.indexOf('\t')+1);
350
351 String [] fields = record.split("\t");
352 String hash = fields[0];
353 String title = fields[1];
354
355 Element d = doc_.createElement("document");
356 d.setAttribute("hash", hash);
357 d.appendChild(GSXML.createTextElement(doc_, "title", title));
358
359 return d;
360
361 }
362 protected boolean addThesaurusList(Element phind_data, String record,
363 String word,
364 String freq,
365 long first, long last) {
366
367
368 Element thesaurus_list = doc_.createElement("thesaurusList");
369 phind_data.appendChild(thesaurus_list);
370 thesaurus_list.setAttribute("length", freq);
371 thesaurus_list.setAttribute("start", Long.toString(first));
372 thesaurus_list.setAttribute("end", Long.toString(last));
373
374 // get the list of type,dest,dest
375 String [] links = record.split(";");
376 int length = links.length;
377 long index = 0;
378 for (int i = 0; i < length; i++) { // go through the entries
379 String link_info = links[(int)i];
380 String [] items = link_info.split(",");
381 // the first entry is teh type
382 String type = items[0];
383 for (int j = 1; j<items.length; j++, index++) {
384 if (index >= first && index < last) { // only output the ones we want
385 long phrase = Long.parseLong(items[j]);
386 Element t = getThesaurus(phrase);
387 t.setAttribute("type", type);
388 thesaurus_list.appendChild(t);
389 }
390 }
391 }
392
393 return true;
394 }
395
396 protected Element getThesaurus(long phrase_num) {
397
398 // look up the phrase in the pdata thingy
399 String record = mgpp_src_.getDocument("", basepath_+File.separatorChar+"pdata", "Document",
400 phrase_num);
401
402 if (record ==null || record.equals("")) return null;
403
404 // ignore everything up to and including first colon
405 record = record.substring(record.indexOf(':')+1);
406
407 String [] fields = record.split(":");
408 String phrase = fields[0];
409 String tf = fields[1];
410 //String ef = fields[2]; dont use this
411 String df = fields[3];
412
413 Element thesaurus = doc_.createElement("thesaurus");
414 thesaurus.setAttribute("tf", tf);
415 thesaurus.setAttribute("df", df);
416 thesaurus.setAttribute("id", Long.toString(phrase_num));
417 thesaurus.appendChild(GSXML.createTextElement(doc_, "phrase", phrase));
418 return thesaurus;
419
420 }
421
422 /** returns an array of two elements - the prefix and the suffix*/
423 protected String [] splitPhraseOnWord(String phrase, String word) {
424
425 if (word.equals("")) {
426
427 String [] res = {phrase, ""};
428 return res;
429 }
430 // use 2 so that we only split on the first occurrance. trailing empty strings should be included
431 String [] result = phrase.split(word, 2);
432 return result;
433
434 }
435
436 protected Element phindError(String message) {
437 Element e = doc_.createElement("phindError");
438 Text t = doc_.createTextNode(message);
439 e.appendChild(t);
440 return e;
441 }
442
443}
444
Note: See TracBrowser for help on using the repository browser.