source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/service/PhindPhraseBrowse.java@ 25635

Last change on this file since 25635 was 25635, checked in by sjm84, 12 years ago

Fixing Greenstone 3's use (or lack thereof) of generics, this was done automatically so we may want to change it over time. This change will also auto-format any files that have not already been formatted.

  • Property svn:keywords set to Author Date Id Revision
File size: 16.5 KB
Line 
1/*
2 * PhindPhraseBrowse.java
3 * Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19package org.greenstone.gsdl3.service;
20
21import org.greenstone.gsdl3.util.*;
22
23import org.greenstone.mgpp.*;
24import org.w3c.dom.Document;
25import org.w3c.dom.Node;
26import org.w3c.dom.Element;
27import org.w3c.dom.Text;
28
29import java.util.Vector;
30import java.util.HashMap;
31import java.io.File;
32import java.io.Serializable;
33
34import org.apache.log4j.*;
35
36/**
37 * PhindPhraseBrowse - the phind phrase browsing service
38 *
39 * @author <a href="mailto:[email protected]">Katherine Don</a>
40 * @version $Revision: 25635 $
41 */
42public class PhindPhraseBrowse
43 extends ServiceRack {
44
45 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.service.PhindPhraseBrowse.class.getName());
46
47 // the services on offer
48 private static final String PHIND_SERVICE = "PhindApplet";
49
50 private static MGPPRetrieveWrapper mgpp_retrieve_src=null;
51 private static MGPPSearchWrapper mgpp_search_src=null;
52 private String basepath = null;
53
54 private Element applet_description = null;
55
56 public PhindPhraseBrowse() {
57 if(this.mgpp_retrieve_src == null) {
58 this.mgpp_retrieve_src = new MGPPRetrieveWrapper();
59 }
60 if(this.mgpp_search_src == null) {
61 this.mgpp_search_src = new MGPPSearchWrapper();
62 }
63 // set up the default params
64 this.mgpp_search_src.setQueryLevel("Document");
65 this.mgpp_search_src.setReturnLevel("Document");
66 this.mgpp_search_src.setMaxDocs(5);
67 this.mgpp_search_src.setStem(false);
68 this.mgpp_search_src.setCase(true);
69 }
70
71 public void cleanUp() {
72 super.cleanUp();
73 this.mgpp_search_src.unloadIndexData();
74 }
75
76 /** configure the service module
77 *
78 * @param info a DOM Element containing any config info for the service
79 * @return true if configured
80 */
81 public boolean configure(Element info, Element extra_info) {
82
83 if (!super.configure(info, extra_info)){
84 return false;
85 }
86
87 logger.info("configuring PhindPhraseBrowse");
88
89 // set up short_service_info_ - for now just has name and type
90 Element e = this.doc.createElement(GSXML.SERVICE_ELEM);
91 e.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_APPLET);
92 e.setAttribute(GSXML.NAME_ATT, PHIND_SERVICE);
93 this.short_service_info.appendChild(e);
94
95 // set up the static applet description
96
97 applet_description = this.doc.createElement(GSXML.SERVICE_ELEM);
98 applet_description.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_APPLET);
99 applet_description.setAttribute(GSXML.NAME_ATT, PHIND_SERVICE);
100
101 // add in the applet info for the phind applet
102 // need to make this dynamic - library names etc
103 // change the applet params - have a single param with the library name
104 // this is left blank at this end, and must be filled in by applet action - if the library name is not needed, this param is left out
105 // phindcgi param now is not complete - library must be prepended to it.
106 String app_info = "<"+GSXML.APPLET_ELEM+" CODEBASE='applet' CODE='org.greenstone.applet.phind.Phind.class' ARCHIVE='phind.jar, xercesImpl.jar, xml-apis.jar' WIDTH='500' HEIGHT='400'><PARAM NAME='library' VALUE=''/> <PARAM NAME='phindcgi' VALUE='?";
107 app_info += GSParams.ACTION +"=a&amp;"+GSParams.REQUEST_TYPE +"=r&amp;"+GSParams.SERVICE+"="+PHIND_SERVICE+"&amp;"+GSParams.OUTPUT+"=xml&amp;"+GSParams.RESPONSE_ONLY+"=1'/>";
108 app_info +="<PARAM NAME='collection' VALUE='";
109 app_info += this.cluster_name;
110 app_info += "'/> <PARAM NAME='classifier' VALUE='1'/> <PARAM NAME='orientation' VALUE='vertical'/> <PARAM NAME='depth' VALUE='2'/> <PARAM NAME='resultorder' VALUE='L,l,E,e,D,d'/> <PARAM NAME='backdrop' VALUE='interfaces/default/images/phindbg1.jpg'/><PARAM NAME='fontsize' VALUE='10'/> <PARAM NAME='blocksize' VALUE='10'/>The Phind java applet.</"+GSXML.APPLET_ELEM+">";
111
112 Document dom = this.converter.getDOM(app_info);
113 if (dom==null) {
114 logger.error("Couldn't parse applet info");
115 return false;
116 }
117 Element app_elem = dom.getDocumentElement();
118 applet_description.appendChild(this.doc.importNode(app_elem, true));
119
120 return true;
121 }
122
123 protected Element getServiceDescription(String service, String lang, String subset) {
124 if (!service.equals(PHIND_SERVICE)) {
125 return null;
126 }
127 Element describe = (Element) applet_description.cloneNode(true);
128 describe.appendChild(GSXML.createDisplayTextElement(this.doc, GSXML.DISPLAY_TEXT_NAME, getTextString(PHIND_SERVICE+".name", lang)));
129 describe.appendChild(GSXML.createDisplayTextElement(this.doc, GSXML.DISPLAY_TEXT_DESCRIPTION, getTextString(PHIND_SERVICE+".description", lang)));
130 return describe;
131 }
132
133 protected Element processPhindApplet(Element request) {
134
135 Element param_elem = (Element)GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
136 HashMap<String, Serializable> params = GSXML.extractParams(param_elem, false);
137
138 long first_e = Long.parseLong((String)params.get("pfe"));
139 long last_e = Long.parseLong((String)params.get("ple"));
140 long first_l = Long.parseLong((String)params.get("pfl"));
141 long last_l = Long.parseLong((String)params.get("pll"));
142 long first_d = Long.parseLong((String)params.get("pfd"));
143 long last_d = Long.parseLong((String)params.get("pld"));
144
145 long phrase;
146 String phrase_str = (String)params.get("ppnum");
147 if (phrase_str == null || phrase_str.equals("")) {
148 phrase=0;
149 } else {
150 phrase = Long.parseLong(phrase_str);
151 }
152 String word = (String)params.get("pptext");
153 String phind_index = (String)params.get("pc");
154 // the location of the mgpp database files
155 this.basepath = GSFile.phindBaseDir(this.site_home, this.cluster_name, phind_index);
156
157 // the result element
158 Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
159 result.setAttribute(GSXML.FROM_ATT, PHIND_SERVICE);
160 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
161
162 // applet result info must be in appletInfo element
163 Element applet_data = this.doc.createElement(GSXML.APPLET_DATA_ELEM);
164 result.appendChild(applet_data);
165 Element phind_data = this.doc.createElement("phindData");
166 applet_data.appendChild(phind_data);
167
168
169 // if we dont know the phrase number, look it up
170 if (phrase == 0) {
171 if (word==null || word.equals("")) {
172 Element error = phindError("no word or phrase");
173 phind_data.appendChild(error);
174 return result;
175 }
176 phrase = findPhraseNumberFromWord( word);
177 }
178 if (phrase==0) {
179 // the word is not in the collection
180 // return a phind error string
181 Element error = phindError("the term "+word+" is not in the collection");
182 phind_data.appendChild(error);
183 return result;
184 }
185
186 // get the phrase data into the phind_data node
187 getPhraseData(phind_data, phrase, first_l, last_l,
188 first_e, last_e, first_d, last_d);
189 return result;
190
191
192 }// processPhindApplet
193
194 protected long findPhraseNumberFromWord(String word) {
195 synchronized (mgpp_search_src) {
196 // set the mgpp index data - we are looking up pword
197 mgpp_search_src.loadIndexData(this.basepath+File.separatorChar+"pword");
198
199 mgpp_search_src.runQuery(word);
200
201 MGPPQueryResult res = mgpp_search_src.getQueryResult();
202 Vector docs = res.getDocs();
203 if (docs.size()==0) {
204 // phrase not found
205 return 0;
206 }
207 MGPPDocInfo doc = (MGPPDocInfo)docs.firstElement();
208 return doc.num_;
209 }
210 }
211
212 protected boolean getPhraseData(Element phind_data,
213 long phrase, long first_l, long last_l,
214 long first_e, long last_e, long first_d,
215 long last_d) {
216
217 synchronized (mgpp_retrieve_src) {
218 String record = this.mgpp_retrieve_src.getDocument(this.basepath+File.separatorChar+"pdata", "Document",
219 phrase);
220 if (record.equals("")) {
221 Element error = phindError("somethings gone wrong - we haven't got a record for phrase number "+phrase);
222 phind_data.appendChild(error);
223 return false;
224 }
225
226 // parse the record - its in gordons cryptic form
227 // ":word:tf:ef:df:el:dl:lf:ll"
228 // el: e,e,e
229 // dl: d;f,d;f,
230 // lf and ll may be null
231 // l: type,dest, dest; type,dest,dest
232
233 // ignore everything up to and including first colon (has
234 // <Document>3505: at the start)
235 record = record.substring(record.indexOf(':')+1);
236
237 // split on ':'
238 String [] fields = record.split(":");
239 String word = fields[0];
240 String tf = fields[1];
241 String ef = fields[2];
242 String df = fields[3];
243
244
245 String expansions = fields[4];
246 String documents = fields[5];
247 String lf = "0";
248 String linklist = "";
249 if (fields.length > 7) {// have thesaurus stuff
250 lf =fields[6];
251 linklist = fields[7];
252 }
253
254 // the phindData attributes and phrase
255 phind_data.setAttribute("id", Long.toString(phrase));
256 phind_data.setAttribute("df", df);
257 phind_data.setAttribute("ef", ef);
258 phind_data.setAttribute("lf", lf);
259 phind_data.setAttribute("tf", tf);
260 GSXML.createTextElement(this.doc, "phrase", word);
261
262 addExpansionList(phind_data, expansions, word, ef, first_e, last_e);
263 addDocumentList(phind_data, documents, word, df, first_d, last_d);
264 if (!lf.equals("0")) {
265 addThesaurusList(phind_data, linklist, word, lf, first_l, last_l);
266 }
267 return true;
268 }
269 }
270
271 protected boolean addExpansionList( Element phind_data, String record,
272 String word,
273 String freq,
274 long first, long last) {
275
276 Element expansion_list = this.doc.createElement("expansionList");
277 phind_data.appendChild(expansion_list);
278 expansion_list.setAttribute("length", freq);
279 expansion_list.setAttribute("start", Long.toString(first));
280 expansion_list.setAttribute("end", Long.toString(last));
281
282 // get the list of strings
283 String [] expansions = record.split(",");
284 int length = expansions.length;
285 if (length < last) last = length;
286 for (long i = first; i < last; i++) {
287 long num = Long.parseLong(expansions[(int)i]);
288 Element expansion = getExpansion( num, word);
289 expansion.setAttribute("num", Long.toString(i));
290 expansion_list.appendChild(expansion);
291 }
292 return true;
293 }
294
295 protected Element getExpansion(long phrase_num,
296 String orig_phrase) {
297
298 // look up the phrase in the pdata thingy
299 String record = this.mgpp_retrieve_src.getDocument(this.basepath+File.separatorChar+"pdata", "Document",
300 phrase_num);
301
302 if (record ==null || record.equals("")) return null;
303
304 // ignore everything up to and including first colon
305 record = record.substring(record.indexOf(':')+1);
306
307 String [] fields = record.split(":");
308 String phrase = fields[0];
309 String tf = fields[1];
310 //String ef = fields[2]; dont use this
311 String df = fields[3];
312
313 Element expansion = this.doc.createElement("expansion");
314 expansion.setAttribute("tf", tf);
315 expansion.setAttribute("df", df);
316 expansion.setAttribute("id", Long.toString(phrase_num));
317
318 // get teh suffix and prefix
319 String [] ends = splitPhraseOnWord(phrase, orig_phrase);
320 if (!ends[0].equals("")) {
321 expansion.appendChild(GSXML.createTextElement(this.doc, "prefix", ends[0]));
322 }
323 if (!ends[1].equals("")) {
324 expansion.appendChild(GSXML.createTextElement(this.doc, "suffix", ends[1]));
325 }
326
327 return expansion;
328
329 }
330
331 protected boolean addDocumentList(Element phind_data, String record,
332 String word,
333 String freq,
334 long first, long last) {
335
336 Element document_list = this.doc.createElement("documentList");
337 phind_data.appendChild(document_list);
338 document_list.setAttribute("length", freq);
339 document_list.setAttribute("start", Long.toString(first));
340 document_list.setAttribute("end", Long.toString(last));
341
342 // get the list of doc,freq
343 String [] doc_freqs = record.split(";");
344 int length = doc_freqs.length;
345 if (length<last) last=length;
346
347 for (long i = first; i < last; i++) {
348 String doc_elem = doc_freqs[(int)i];
349 int p = doc_elem.indexOf(',');
350 long doc_num;
351 String doc_freq;
352 if (p == -1) { // there is no freq in the record
353 doc_num =Long.parseLong(doc_elem);
354 doc_freq = "1";
355 } else {
356 doc_num = Long.parseLong(doc_elem.substring(0,p));
357 doc_freq = doc_elem.substring(p+1);
358 }
359 Element document = getDocument( doc_num);
360 document.setAttribute("freq", doc_freq);
361 document.setAttribute("num", Long.toString(i));
362 document_list.appendChild(document);
363 }
364
365
366 return true;
367 }
368
369
370 protected Element getDocument(long doc_num) {
371
372 // look up the phrase in the docs thingy
373 String record = this.mgpp_retrieve_src.getDocument(this.basepath+File.separatorChar+"docs", "Document",
374 doc_num);
375
376 if (record ==null || record.equals("")) return null;
377
378 // ignore everything up to and including first \t
379 record = record.substring(record.indexOf('\t')+1);
380
381 String [] fields = record.split("\t");
382 String hash = fields[0];
383 String title = fields[1];
384
385 Element d = this.doc.createElement("document");
386 d.setAttribute("hash", hash);
387 d.appendChild(GSXML.createTextElement(this.doc, "title", title));
388
389 return d;
390
391 }
392 protected boolean addThesaurusList(Element phind_data, String record,
393 String word,
394 String freq,
395 long first, long last) {
396
397
398 Element thesaurus_list = this.doc.createElement("thesaurusList");
399 phind_data.appendChild(thesaurus_list);
400 thesaurus_list.setAttribute("length", freq);
401 thesaurus_list.setAttribute("start", Long.toString(first));
402 thesaurus_list.setAttribute("end", Long.toString(last));
403
404 // get the list of type,dest,dest
405 String [] links = record.split(";");
406 int length = links.length;
407 long index = 0;
408 for (int i = 0; i < length; i++) { // go through the entries
409 String link_info = links[(int)i];
410 String [] items = link_info.split(",");
411 // the first entry is teh type
412 String type = items[0];
413 for (int j = 1; j<items.length; j++, index++) {
414 if (index >= first && index < last) { // only output the ones we want
415 long phrase = Long.parseLong(items[j]);
416 Element t = getThesaurus(phrase);
417 t.setAttribute("type", type);
418 thesaurus_list.appendChild(t);
419 }
420 }
421 }
422
423 return true;
424 }
425
426 protected Element getThesaurus(long phrase_num) {
427
428 // look up the phrase in the pdata thingy
429 String record = this.mgpp_retrieve_src.getDocument(this.basepath+File.separatorChar+"pdata", "Document",
430 phrase_num);
431
432 if (record ==null || record.equals("")) return null;
433
434 // ignore everything up to and including first colon
435 record = record.substring(record.indexOf(':')+1);
436
437 String [] fields = record.split(":");
438 String phrase = fields[0];
439 String tf = fields[1];
440 //String ef = fields[2]; dont use this
441 String df = fields[3];
442
443 Element thesaurus = this.doc.createElement("thesaurus");
444 thesaurus.setAttribute("tf", tf);
445 thesaurus.setAttribute("df", df);
446 thesaurus.setAttribute("id", Long.toString(phrase_num));
447 thesaurus.appendChild(GSXML.createTextElement(this.doc, "phrase", phrase));
448 return thesaurus;
449
450 }
451
452 /** returns an array of two elements - the prefix and the suffix*/
453 protected String [] splitPhraseOnWord(String phrase, String word) {
454
455 if (word.equals("")) {
456
457 String [] res = {phrase, ""};
458 return res;
459 }
460 // use 2 so that we only split on the first occurrance. trailing empty strings should be included
461 String [] result = phrase.split(word, 2);
462 return result;
463
464 }
465
466 protected Element phindError(String message) {
467 Element e = this.doc.createElement("phindError");
468 Text t = this.doc.createTextNode(message);
469 e.appendChild(t);
470 return e;
471 }
472
473}
474
Note: See TracBrowser for help on using the repository browser.