source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/service/PhindPhraseBrowse.java@ 32453

Last change on this file since 32453 was 29309, checked in by kjdon, 10 years ago

removed my name

  • Property svn:keywords set to Author Date Id Revision
File size: 16.9 KB
Line 
1/*
2 * PhindPhraseBrowse.java
3 * Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19package org.greenstone.gsdl3.service;
20
21import org.greenstone.gsdl3.util.*;
22
23import org.greenstone.mgpp.*;
24import org.w3c.dom.Document;
25import org.w3c.dom.Node;
26import org.w3c.dom.Element;
27import org.w3c.dom.Text;
28
29import java.util.Vector;
30import java.util.HashMap;
31import java.io.File;
32import java.io.Serializable;
33
34import org.apache.log4j.*;
35
36/**
37 * PhindPhraseBrowse - the phind phrase browsing service
38 *
39 */
40public class PhindPhraseBrowse
41 extends ServiceRack {
42
43 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.service.PhindPhraseBrowse.class.getName());
44
45 // the services on offer
46 private static final String PHIND_SERVICE = "PhindApplet";
47
48 private static MGPPRetrieveWrapper mgpp_retrieve_src=null;
49 private static MGPPSearchWrapper mgpp_search_src=null;
50 private String basepath = null;
51
52 private Element applet_description = null;
53
54 public PhindPhraseBrowse() {
55 if(this.mgpp_retrieve_src == null) {
56 this.mgpp_retrieve_src = new MGPPRetrieveWrapper();
57 }
58 if(this.mgpp_search_src == null) {
59 this.mgpp_search_src = new MGPPSearchWrapper();
60 }
61 // set up the default params
62 this.mgpp_search_src.setQueryLevel("Document");
63 this.mgpp_search_src.setReturnLevel("Document");
64 this.mgpp_search_src.setMaxDocs(5);
65 this.mgpp_search_src.setStem(false);
66 this.mgpp_search_src.setCase(true);
67 }
68
69 public void cleanUp() {
70 super.cleanUp();
71 this.mgpp_search_src.unloadIndexData();
72 }
73
74 /** configure the service module
75 *
76 * @param info a DOM Element containing any config info for the service
77 * @return true if configured
78 */
79 public boolean configure(Element info, Element extra_info) {
80
81 if (!super.configure(info, extra_info)){
82 return false;
83 }
84
85 logger.info("configuring PhindPhraseBrowse");
86
87 // set up short_service_info_ - for now just has name and type
88 Element e = this.desc_doc.createElement(GSXML.SERVICE_ELEM);
89 e.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_APPLET);
90 e.setAttribute(GSXML.NAME_ATT, PHIND_SERVICE);
91 this.short_service_info.appendChild(e);
92
93 // set up the static applet description
94
95 applet_description = this.desc_doc.createElement(GSXML.SERVICE_ELEM);
96 applet_description.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_APPLET);
97 applet_description.setAttribute(GSXML.NAME_ATT, PHIND_SERVICE);
98
99 // add in the applet info for the phind applet
100 // need to make this dynamic - library names etc
101 // change the applet params - have a single param with the library name
102 // this is left blank at this end, and must be filled in by applet action - if the library name is not needed, this param is left out
103 // phindcgi param now is not complete - library must be prepended to it.
104 String app_info = "<"+GSXML.APPLET_ELEM+" CODEBASE='applet' CODE='org.greenstone.applet.phind.Phind.class' ARCHIVE='phind.jar, xercesImpl.jar, xml-apis.jar' WIDTH='500' HEIGHT='400'><PARAM NAME='library' VALUE=''/> <PARAM NAME='phindcgi' VALUE='?";
105 app_info += GSParams.ACTION +"=a&amp;"+GSParams.REQUEST_TYPE +"=r&amp;"+GSParams.SERVICE+"="+PHIND_SERVICE+"&amp;"+GSParams.OUTPUT+"=xml&amp;"+GSParams.RESPONSE_ONLY+"=1'/>";
106 app_info +="<PARAM NAME='collection' VALUE='";
107 app_info += this.cluster_name;
108 app_info += "'/> <PARAM NAME='classifier' VALUE='1'/> <PARAM NAME='orientation' VALUE='vertical'/> <PARAM NAME='depth' VALUE='2'/> <PARAM NAME='resultorder' VALUE='L,l,E,e,D,d'/> <PARAM NAME='backdrop' VALUE='interfaces/default/images/phindbg1.jpg'/><PARAM NAME='fontsize' VALUE='10'/> <PARAM NAME='blocksize' VALUE='10'/>The Phind java applet.</"+GSXML.APPLET_ELEM+">";
109
110 Document dom = this.converter.getDOM(app_info);
111 if (dom==null) {
112 logger.error("Couldn't parse applet info");
113 return false;
114 }
115 Element app_elem = dom.getDocumentElement();
116 applet_description.appendChild(this.desc_doc.importNode(app_elem, true));
117
118 return true;
119 }
120
121 protected Element getServiceDescription(Document doc, String service, String lang, String subset) {
122 if (!service.equals(PHIND_SERVICE)) {
123 return null;
124 }
125 Element describe = (Element)doc.importNode(applet_description,true);
126
127 Element el1 = GSXML.createDisplayTextElement(doc, GSXML.DISPLAY_TEXT_NAME, getTextString(PHIND_SERVICE+".name", lang));
128 describe.appendChild(el1);
129
130 Element el2 = GSXML.createDisplayTextElement(doc, GSXML.DISPLAY_TEXT_DESCRIPTION, getTextString(PHIND_SERVICE+".description", lang));
131 describe.appendChild(el2);
132
133 return describe;
134 }
135
136 protected Element processPhindApplet(Element request) {
137 Document result_doc = XMLConverter.newDOM();
138 Element param_elem = (Element)GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
139 HashMap<String, Serializable> params = GSXML.extractParams(param_elem, false);
140
141 long first_e = Long.parseLong((String)params.get("pfe"));
142 long last_e = Long.parseLong((String)params.get("ple"));
143 long first_l = Long.parseLong((String)params.get("pfl"));
144 long last_l = Long.parseLong((String)params.get("pll"));
145 long first_d = Long.parseLong((String)params.get("pfd"));
146 long last_d = Long.parseLong((String)params.get("pld"));
147
148 long phrase;
149 String phrase_str = (String)params.get("ppnum");
150 if (phrase_str == null || phrase_str.equals("")) {
151 phrase=0;
152 } else {
153 phrase = Long.parseLong(phrase_str);
154 }
155 String word = (String)params.get("pptext");
156 String phind_index = (String)params.get("pc");
157 // the location of the mgpp database files
158 this.basepath = GSFile.phindBaseDir(this.site_home, this.cluster_name, phind_index);
159
160 // the result element
161 Element result = result_doc.createElement(GSXML.RESPONSE_ELEM);
162 result.setAttribute(GSXML.FROM_ATT, PHIND_SERVICE);
163 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
164
165 // applet result info must be in appletInfo element
166 Element applet_data = result_doc.createElement(GSXML.APPLET_DATA_ELEM);
167 result.appendChild(applet_data);
168 Element phind_data = result_doc.createElement("phindData");
169 applet_data.appendChild(phind_data);
170
171
172 // if we dont know the phrase number, look it up
173 if (phrase == 0) {
174 if (word==null || word.equals("")) {
175 Element error = phindError(result_doc, "no word or phrase");
176 phind_data.appendChild(error);
177 return result;
178 }
179 phrase = findPhraseNumberFromWord( word);
180 }
181 if (phrase==0) {
182 // the word is not in the collection
183 // return a phind error string
184 Element error = phindError(result_doc, "the term "+word+" is not in the collection");
185 phind_data.appendChild(error);
186 return result;
187 }
188
189 // get the phrase data into the phind_data node
190 getPhraseData(phind_data, phrase, first_l, last_l,
191 first_e, last_e, first_d, last_d);
192 return result;
193
194
195 }// processPhindApplet
196
197 protected long findPhraseNumberFromWord(String word) {
198 synchronized (mgpp_search_src) {
199 // set the mgpp index data - we are looking up pword
200 mgpp_search_src.loadIndexData(this.basepath+File.separatorChar+"pword");
201
202 mgpp_search_src.runQuery(word);
203
204 MGPPQueryResult res = mgpp_search_src.getQueryResult();
205 Vector docs = res.getDocs();
206 if (docs.size()==0) {
207 // phrase not found
208 return 0;
209 }
210 MGPPDocInfo doc = (MGPPDocInfo)docs.firstElement();
211 return doc.num_;
212 }
213 }
214
215 protected boolean getPhraseData(Element phind_data,
216 long phrase, long first_l, long last_l,
217 long first_e, long last_e, long first_d,
218 long last_d) {
219
220 synchronized (mgpp_retrieve_src) {
221 String record = this.mgpp_retrieve_src.getDocument(this.basepath+File.separatorChar+"pdata", "Document",
222 phrase);
223 if (record.equals("")) {
224 Element error = phindError(phind_data.getOwnerDocument(), "somethings gone wrong - we haven't got a record for phrase number "+phrase);
225 phind_data.appendChild(error);
226 return false;
227 }
228
229 // parse the record - its in gordons cryptic form
230 // ":word:tf:ef:df:el:dl:lf:ll"
231 // el: e,e,e
232 // dl: d;f,d;f,
233 // lf and ll may be null
234 // l: type,dest, dest; type,dest,dest
235
236 // ignore everything up to and including first colon (has
237 // <Document>3505: at the start)
238 record = record.substring(record.indexOf(':')+1);
239
240 // split on ':'
241 String [] fields = record.split(":");
242 String word = fields[0];
243 String tf = fields[1];
244 String ef = fields[2];
245 String df = fields[3];
246
247
248 String expansions = fields[4];
249 String documents = fields[5];
250 String lf = "0";
251 String linklist = "";
252 if (fields.length > 7) {// have thesaurus stuff
253 lf =fields[6];
254 linklist = fields[7];
255 }
256
257 // the phindData attributes and phrase
258 phind_data.setAttribute("id", Long.toString(phrase));
259 phind_data.setAttribute("df", df);
260 phind_data.setAttribute("ef", ef);
261 phind_data.setAttribute("lf", lf);
262 phind_data.setAttribute("tf", tf);
263 // GSXML.createTextElement(result_doc, "phrase", word); ??? - this needs to be appended somewhere????
264
265 addExpansionList(phind_data, expansions, word, ef, first_e, last_e);
266 addDocumentList(phind_data, documents, word, df, first_d, last_d);
267 if (!lf.equals("0")) {
268 addThesaurusList(phind_data, linklist, word, lf, first_l, last_l);
269 }
270 return true;
271 } // end of synchronized (mgpp_retrieve_src)
272 }
273
274 protected boolean addExpansionList( Element phind_data, String record,
275 String word,
276 String freq,
277 long first, long last) {
278 Document phind_doc = phind_data.getOwnerDocument();
279 Element expansion_list = phind_doc.createElement("expansionList");
280 phind_data.appendChild(expansion_list);
281 expansion_list.setAttribute("length", freq);
282 expansion_list.setAttribute("start", Long.toString(first));
283 expansion_list.setAttribute("end", Long.toString(last));
284
285 // get the list of strings
286 String [] expansions = record.split(",");
287 int length = expansions.length;
288 if (length < last) last = length;
289 for (long i = first; i < last; i++) {
290 long num = Long.parseLong(expansions[(int)i]);
291 Element expansion = getExpansion(phind_doc, num, word);
292 expansion.setAttribute("num", Long.toString(i));
293 expansion_list.appendChild(expansion);
294 }
295 return true;
296 }
297
298 protected Element getExpansion(Document phind_doc, long phrase_num,
299 String orig_phrase) {
300
301 // look up the phrase in the pdata thingy
302 String record = this.mgpp_retrieve_src.getDocument(this.basepath+File.separatorChar+"pdata", "Document",
303 phrase_num);
304
305 if (record ==null || record.equals("")) return null;
306
307 // ignore everything up to and including first colon
308 record = record.substring(record.indexOf(':')+1);
309
310 String [] fields = record.split(":");
311 String phrase = fields[0];
312 String tf = fields[1];
313 //String ef = fields[2]; dont use this
314 String df = fields[3];
315
316 Element expansion = phind_doc.createElement("expansion");
317 expansion.setAttribute("tf", tf);
318 expansion.setAttribute("df", df);
319 expansion.setAttribute("id", Long.toString(phrase_num));
320
321 // get teh suffix and prefix
322 String [] ends = splitPhraseOnWord(phrase, orig_phrase);
323 if (!ends[0].equals("")) {
324 expansion.appendChild(GSXML.createTextElement(phind_doc, "prefix", ends[0]));
325 }
326 if (!ends[1].equals("")) {
327 expansion.appendChild(GSXML.createTextElement(phind_doc, "suffix", ends[1]));
328 }
329
330 return expansion;
331
332 }
333
334 protected boolean addDocumentList(Element phind_data, String record,
335 String word,
336 String freq,
337 long first, long last) {
338 Document phind_doc = phind_data.getOwnerDocument();
339 Element document_list = phind_doc.createElement("documentList");
340 phind_data.appendChild(document_list);
341 document_list.setAttribute("length", freq);
342 document_list.setAttribute("start", Long.toString(first));
343 document_list.setAttribute("end", Long.toString(last));
344
345 // get the list of doc,freq
346 String [] doc_freqs = record.split(";");
347 int length = doc_freqs.length;
348 if (length<last) last=length;
349
350 for (long i = first; i < last; i++) {
351 String doc_elem = doc_freqs[(int)i];
352 int p = doc_elem.indexOf(',');
353 long doc_num;
354 String doc_freq;
355 if (p == -1) { // there is no freq in the record
356 doc_num =Long.parseLong(doc_elem);
357 doc_freq = "1";
358 } else {
359 doc_num = Long.parseLong(doc_elem.substring(0,p));
360 doc_freq = doc_elem.substring(p+1);
361 }
362 Element document = getDocument(phind_doc, doc_num);
363 document.setAttribute("freq", doc_freq);
364 document.setAttribute("num", Long.toString(i));
365 document_list.appendChild(document);
366 }
367
368
369 return true;
370 }
371
372
373 protected Element getDocument(Document phind_doc, long doc_num) {
374
375 // look up the phrase in the docs thingy
376 String record = this.mgpp_retrieve_src.getDocument(this.basepath+File.separatorChar+"docs", "Document",
377 doc_num);
378
379 if (record ==null || record.equals("")) return null;
380
381 // ignore everything up to and including first \t
382 record = record.substring(record.indexOf('\t')+1);
383
384 String [] fields = record.split("\t");
385 String hash = fields[0];
386 String title = fields[1];
387
388 Element d = phind_doc.createElement("document");
389 d.setAttribute("hash", hash);
390 d.appendChild(GSXML.createTextElement(phind_doc, "title", title));
391
392 return d;
393
394 }
395 protected boolean addThesaurusList(Element phind_data, String record,
396 String word,
397 String freq,
398 long first, long last) {
399
400 Document phind_doc = phind_data.getOwnerDocument();
401 Element thesaurus_list = phind_doc.createElement("thesaurusList");
402 phind_data.appendChild(thesaurus_list);
403 thesaurus_list.setAttribute("length", freq);
404 thesaurus_list.setAttribute("start", Long.toString(first));
405 thesaurus_list.setAttribute("end", Long.toString(last));
406
407 // get the list of type,dest,dest
408 String [] links = record.split(";");
409 int length = links.length;
410 long index = 0;
411 for (int i = 0; i < length; i++) { // go through the entries
412 String link_info = links[(int)i];
413 String [] items = link_info.split(",");
414 // the first entry is teh type
415 String type = items[0];
416 for (int j = 1; j<items.length; j++, index++) {
417 if (index >= first && index < last) { // only output the ones we want
418 long phrase = Long.parseLong(items[j]);
419 Element t = getThesaurus(phind_doc, phrase);
420 t.setAttribute("type", type);
421 thesaurus_list.appendChild(t);
422 }
423 }
424 }
425
426 return true;
427 }
428
429 protected Element getThesaurus(Document phind_doc, long phrase_num) {
430
431 // look up the phrase in the pdata thingy
432 String record = this.mgpp_retrieve_src.getDocument(this.basepath+File.separatorChar+"pdata", "Document",
433 phrase_num);
434
435 if (record ==null || record.equals("")) return null;
436
437 // ignore everything up to and including first colon
438 record = record.substring(record.indexOf(':')+1);
439
440 String [] fields = record.split(":");
441 String phrase = fields[0];
442 String tf = fields[1];
443 //String ef = fields[2]; dont use this
444 String df = fields[3];
445
446 Element thesaurus = phind_doc.createElement("thesaurus");
447 thesaurus.setAttribute("tf", tf);
448 thesaurus.setAttribute("df", df);
449 thesaurus.setAttribute("id", Long.toString(phrase_num));
450 thesaurus.appendChild(GSXML.createTextElement(phind_doc, "phrase", phrase));
451 return thesaurus;
452
453 }
454
455 /** returns an array of two elements - the prefix and the suffix*/
456 protected String [] splitPhraseOnWord(String phrase, String word) {
457
458 if (word.equals("")) {
459
460 String [] res = {phrase, ""};
461 return res;
462 }
463 // use 2 so that we only split on the first occurrance. trailing empty strings should be included
464 String [] result = phrase.split(word, 2);
465 return result;
466
467 }
468
469 protected Element phindError(Document phind_doc, String message) {
470 Element e = phind_doc.createElement("phindError");
471 Text t = phind_doc.createTextNode(message);
472 e.appendChild(t);
473 return e;
474 }
475
476}
477
Note: See TracBrowser for help on using the repository browser.