/*
* PhindPhraseBrowse.java
* Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
package org.greenstone.gsdl3.service;
import org.greenstone.gsdl3.util.*;
import org.greenstone.mgpp.*;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.Element;
import org.w3c.dom.Text;
import java.util.Vector;
import java.util.HashMap;
import java.io.File;
/**
* PhindPhraseBrowse - the phind phrase browsing service
*
* @author Katherine Don
* @version $Revision: 9798 $
*/
public class PhindPhraseBrowse
extends ServiceRack {
// the services on offer
// PHIND_SERVICE is the single service name this rack advertises and answers to
private static final String PHIND_SERVICE = "PhindApplet";
// wrapper used for all mgpp lookups; created and given default query settings in the constructor
private MGPPWrapper mgpp_src=null;
// the location of the mgpp database files for the index being browsed; set by processPhindApplet
private String basepath = null;
// static description of the applet service; built in configure and cloned by getServiceDescription
private Element applet_description = null;
/**
 * Creates the service and its mgpp wrapper, and applies the default
 * query settings used for phind lookups.
 */
public PhindPhraseBrowse() {
    MGPPWrapper wrapper = new MGPPWrapper();
    this.mgpp_src = wrapper;

    // default params: document-level query and return, at most 5 docs,
    // stemming off, case option on
    wrapper.setQueryLevel("Document");
    wrapper.setReturnLevel("Document");
    wrapper.setMaxDocs(5);
    wrapper.setStem(false);
    wrapper.setCase(true);
}
/**
 * Configures the service module.
 *
 * Registers the PhindApplet service in the short service info list and
 * builds the static applet description that getServiceDescription clones.
 *
 * @param info a DOM Element containing any config info for the service
 *             (not read by this method)
 * @param extra_info additional config info (not read by this method)
 * @return true if configured, false if the applet markup could not be parsed
 */
public boolean configure(Element info, Element extra_info) {
    System.out.println("configuring PhindPhraseBrowse");

    // set up short_service_info_ - for now just has name and type
    Element e = this.doc.createElement(GSXML.SERVICE_ELEM);
    e.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_APPLET);
    e.setAttribute(GSXML.NAME_ATT, PHIND_SERVICE);
    this.short_service_info.appendChild(e);

    // set up the static applet description
    applet_description = this.doc.createElement(GSXML.SERVICE_ELEM);
    applet_description.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_APPLET);
    applet_description.setAttribute(GSXML.NAME_ATT, PHIND_SERVICE);

    // add in the applet info for the phind applet
    // need to make this dynamic - library names etc
    // change the applet params - have a single param with the library name
    // this is left blank at this end, and must be filled in by applet action - if the library name is not needed, this param is left out
    // phindcgi param now is not complete - library must be prepended to it.
    // NOTE(review): the comments above describe applet params, but no param
    // children are emitted in the markup built below - confirm whether they
    // are still required.
    String app_info = "<"+GSXML.APPLET_ELEM+" CODEBASE='lib' CODE='org.greenstone.applet.phind.Phind.class' ARCHIVE='phind.jar, xercesImpl.jar, xml-apis.jar' WIDTH='500' HEIGHT='400'> ";
    // the element must be closed with a proper end tag, otherwise the
    // markup is not well-formed and cannot be parsed into a DOM
    app_info +=" The Phind java applet.</"+GSXML.APPLET_ELEM+">";

    Document dom = this.converter.getDOM(app_info);
    if (dom==null) {
        System.err.println("PhindPhraseBrowse.configure Error: Couldn't parse applet info");
        return false;
    }

    // copy the parsed applet element into our own document
    Element app_elem = dom.getDocumentElement();
    applet_description.appendChild(this.doc.importNode(app_elem, true));
    return true;
}
/**
 * Builds the description for the phind applet service.
 *
 * @param service the name of the service being described
 * @param lang the language passed to getTextString for the display text
 * @param subset not used by this method
 * @return a copy of the static applet description with name and
 *         description display text added, or null if service is not the
 *         phind applet service
 */
protected Element getServiceDescription(String service, String lang, String subset) {
    if (service.equals(PHIND_SERVICE)) {
        // work on a deep copy so the static description stays untouched
        Element description = (Element) applet_description.cloneNode(true);

        String name_text = getTextString(PHIND_SERVICE+".name", lang);
        description.appendChild(GSXML.createDisplayTextElement(this.doc, GSXML.DISPLAY_TEXT_NAME, name_text));

        String description_text = getTextString(PHIND_SERVICE+".description", lang);
        description.appendChild(GSXML.createDisplayTextElement(this.doc, GSXML.DISPLAY_TEXT_DESCRIPTION, description_text));

        return description;
    }
    return null;
}
/**
 * Processes a request to the phind applet service.
 *
 * Reads these params from the request's param list:
 * pfe/ple (expansion range), pfl/pll (thesaurus link range),
 * pfd/pld (document range), ppnum (phrase number, optional),
 * pptext (phrase text, used when ppnum is absent or 0) and
 * pc (the phind index, used to locate the database files).
 *
 * @param request the request element
 * @return a response element whose phindData child holds either the
 *         phrase data or a phindError element describing the problem
 */
protected Element processPhindApplet(Element request) {
    // the result element
    Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
    result.setAttribute(GSXML.FROM_ATT, PHIND_SERVICE);
    result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);

    // applet result info must be wrapped in the applet data element
    Element applet_data = this.doc.createElement(GSXML.APPLET_DATA_ELEM);
    result.appendChild(applet_data);
    Element phind_data = this.doc.createElement("phindData");
    applet_data.appendChild(phind_data);

    Element param_elem = (Element)GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
    HashMap params = GSXML.extractParams(param_elem, false);

    long first_e;
    long last_e;
    long first_l;
    long last_l;
    long first_d;
    long last_d;
    long phrase;
    try {
        first_e = Long.parseLong((String)params.get("pfe"));
        last_e = Long.parseLong((String)params.get("ple"));
        first_l = Long.parseLong((String)params.get("pfl"));
        last_l = Long.parseLong((String)params.get("pll"));
        first_d = Long.parseLong((String)params.get("pfd"));
        last_d = Long.parseLong((String)params.get("pld"));

        String phrase_str = (String)params.get("ppnum");
        if (phrase_str == null || phrase_str.equals("")) {
            phrase=0;
        } else {
            phrase = Long.parseLong(phrase_str);
        }
    } catch (NumberFormatException nfe) {
        // the params come straight from the request, so a missing or
        // non-numeric value is reported like any other phind error
        // instead of escaping as an exception
        Element error = phindError("invalid or missing numeric parameter ("+nfe.getMessage()+")");
        phind_data.appendChild(error);
        return result;
    }

    String word = (String)params.get("pptext");
    String phind_index = (String)params.get("pc");

    // the location of the mgpp database files
    this.basepath = GSFile.phindBaseDir(this.site_home, this.cluster_name, phind_index);

    // if we dont know the phrase number, look it up
    if (phrase == 0) {
        if (word==null || word.equals("")) {
            Element error = phindError("no word or phrase");
            phind_data.appendChild(error);
            return result;
        }
        phrase = findPhraseNumberFromWord( word);
    }

    if (phrase==0) {
        // the word is not in the collection
        // return a phind error string
        Element error = phindError("the term "+word+" is not in the collection");
        phind_data.appendChild(error);
        return result;
    }

    // get the phrase data into the phind_data node
    // (on failure getPhraseData adds its own phindError to phind_data)
    getPhraseData(phind_data, phrase, first_l, last_l,
                  first_e, last_e, first_d, last_d);
    return result;
}// processPhindApplet
/**
 * Looks up the phrase number for a word by querying the pword index
 * under the current basepath.
 *
 * @param word the word to look up
 * @return the number of the first matching entry, or 0 if the query
 *         returned no matches
 */
protected long findPhraseNumberFromWord(String word) {
    // point mgpp at the pword index, then query it for the word
    String word_index = this.basepath+File.separatorChar+"pword";
    this.mgpp_src.loadIndexData(word_index);
    this.mgpp_src.runQuery(word);

    MGPPQueryResult query_result = this.mgpp_src.getQueryResult();
    Vector matches = query_result.getDocs();
    if (matches.size() > 0) {
        MGPPDocInfo first_match = (MGPPDocInfo)matches.firstElement();
        return first_match.num_;
    }
    // phrase not found
    return 0;
}
/**
 * Fetches the pdata record for a phrase and fills in the phindData
 * element: the id/df/ef/lf/tf attributes, the phrase text, the
 * expansion list, the document list and (when lf is not "0") the
 * thesaurus list.
 *
 * @param phind_data the phindData element to fill in
 * @param phrase the phrase number to look up
 * @param first_l start of the thesaurus link range
 * @param last_l end of the thesaurus link range
 * @param first_e start of the expansion range
 * @param last_e end of the expansion range
 * @param first_d start of the document range
 * @param last_d end of the document range
 * @return true on success; false if the record is missing or incomplete,
 *         in which case a phindError element is appended to phind_data
 */
protected boolean getPhraseData(Element phind_data,
                                long phrase, long first_l, long last_l,
                                long first_e, long last_e, long first_d,
                                long last_d) {
    String record = this.mgpp_src.getDocument(this.basepath+File.separatorChar+"pdata", "Document",
                                              phrase);
    // treat a null record like an empty one, as getExpansion and getThesaurus do
    if (record == null || record.equals("")) {
        Element error = phindError("somethings gone wrong - we haven't got a record for phrase number "+phrase);
        phind_data.appendChild(error);
        return false;
    }

    // parse the record - its in gordons cryptic form
    // ":word:tf:ef:df:el:dl:lf:ll"
    // el: e,e,e
    // dl: d;f,d;f,
    // lf and ll may be null
    // l: type,dest, dest; type,dest,dest
    // ignore everything up to and including first colon (has
    // 3505: at the start)
    record = record.substring(record.indexOf(':')+1);
    // split on ':'
    String [] fields = record.split(":");
    if (fields.length < 6) {
        // word, tf, ef, df, el and dl are all read below; report a short
        // record instead of failing with an index error
        Element error = phindError("somethings gone wrong - the record for phrase number "+phrase+" is incomplete");
        phind_data.appendChild(error);
        return false;
    }
    String word = fields[0];
    String tf = fields[1];
    String ef = fields[2];
    String df = fields[3];
    String expansions = fields[4];
    String documents = fields[5];
    String lf = "0";
    String linklist = "";
    if (fields.length > 7) {// have thesaurus stuff
        lf =fields[6];
        linklist = fields[7];
    }

    // the phindData attributes and phrase
    phind_data.setAttribute("id", Long.toString(phrase));
    phind_data.setAttribute("df", df);
    phind_data.setAttribute("ef", ef);
    phind_data.setAttribute("lf", lf);
    phind_data.setAttribute("tf", tf);
    // the phrase element was previously created and then dropped; attach it
    phind_data.appendChild(GSXML.createTextElement(this.doc, "phrase", word));

    addExpansionList(phind_data, expansions, word, ef, first_e, last_e);
    addDocumentList(phind_data, documents, word, df, first_d, last_d);
    if (!lf.equals("0")) {
        addThesaurusList(phind_data, linklist, word, lf, first_l, last_l);
    }
    return true;
}
/**
 * Appends an expansionList element to phind_data and fills it with one
 * expansion element per requested entry of the comma separated record.
 *
 * The list's length/start/end attributes carry freq, first and last as
 * given. Entries whose phrase record cannot be found are skipped.
 *
 * @param phind_data the element the list is appended to
 * @param record comma separated phrase numbers of the expansions
 * @param word the original phrase, used to split each expansion into
 *             prefix and suffix
 * @param freq the value for the list's length attribute
 * @param first index of the first entry to output (inclusive)
 * @param last index after the last entry to output (exclusive)
 * @return always true
 */
protected boolean addExpansionList( Element phind_data, String record,
                                    String word,
                                    String freq,
                                    long first, long last) {
    Element expansion_list = this.doc.createElement("expansionList");
    phind_data.appendChild(expansion_list);
    expansion_list.setAttribute("length", freq);
    expansion_list.setAttribute("start", Long.toString(first));
    expansion_list.setAttribute("end", Long.toString(last));

    // no expansions at all: splitting "" would yield a single empty
    // entry that cannot be parsed as a number
    if (record == null || record.equals("")) {
        return true;
    }

    // get the list of strings
    String [] expansions = record.split(",");

    // clamp the requested window to the entries we actually have
    long begin = Math.max(first, 0);
    long end = Math.min(last, expansions.length);

    for (long i = begin; i < end; i++) {
        long num = Long.parseLong(expansions[(int)i]);
        Element expansion = getExpansion( num, word);
        if (expansion == null) {
            // getExpansion returns null when it has no record for num
            continue;
        }
        expansion.setAttribute("num", Long.toString(i));
        expansion_list.appendChild(expansion);
    }
    return true;
}
/**
 * Builds an expansion element for a phrase from its pdata record.
 *
 * The element carries tf, df and id attributes, plus prefix and suffix
 * text children for the non-empty parts of the phrase on either side of
 * orig_phrase.
 *
 * @param phrase_num the number of the expansion phrase
 * @param orig_phrase the phrase being expanded
 * @return the expansion element, or null if there is no usable record
 *         for phrase_num
 */
protected Element getExpansion(long phrase_num,
                               String orig_phrase) {
    // look up the phrase in the pdata thingy
    String record = this.mgpp_src.getDocument(this.basepath+File.separatorChar+"pdata", "Document",
                                              phrase_num);
    if (record ==null || record.equals("")) {
        return null;
    }

    // ignore everything up to and including first colon
    record = record.substring(record.indexOf(':')+1);
    String [] fields = record.split(":");
    if (fields.length < 4) {
        // phrase, tf and df are read below; a shorter record is unusable
        return null;
    }
    String phrase = fields[0];
    String tf = fields[1];
    //String ef = fields[2]; dont use this
    String df = fields[3];

    Element expansion = this.doc.createElement("expansion");
    expansion.setAttribute("tf", tf);
    expansion.setAttribute("df", df);
    expansion.setAttribute("id", Long.toString(phrase_num));

    // get the suffix and prefix
    // the split may come back with a single element (e.g. when orig_phrase
    // does not occur in phrase), so check the length before reading each part
    String [] ends = splitPhraseOnWord(phrase, orig_phrase);
    if (ends.length > 0 && !ends[0].equals("")) {
        expansion.appendChild(GSXML.createTextElement(this.doc, "prefix", ends[0]));
    }
    if (ends.length > 1 && !ends[1].equals("")) {
        expansion.appendChild(GSXML.createTextElement(this.doc, "suffix", ends[1]));
    }
    return expansion;
}
/**
 * Appends a documentList element to phind_data, with length, start and
 * end attributes taken from freq, first and last, then walks the
 * semicolon separated record.
 *
 * NOTE(review): the text of this method appears to be damaged from the
 * "if (length= first ..." line onwards - the rest of the document loop
 * and the start of the thesaurus list code seem to be missing, and the
 * remaining statements refer to names (items, j, index, type,
 * thesaurus_list) that are not declared in the visible text. Restore
 * this span from the original source before relying on it.
 */
protected boolean addDocumentList(Element phind_data, String record,
String word,
String freq,
long first, long last) {
Element document_list = this.doc.createElement("documentList");
phind_data.appendChild(document_list);
document_list.setAttribute("length", freq);
document_list.setAttribute("start", Long.toString(first));
document_list.setAttribute("end", Long.toString(last));
// get the list of doc,freq
String [] doc_freqs = record.split(";");
int length = doc_freqs.length;
// NOTE(review): damaged line - see the note in the method comment
if (length= first && index < last) { // only output the ones we want
long phrase = Long.parseLong(items[j]);
// NOTE(review): getThesaurus can return null (no record); t is used unchecked here
Element t = getThesaurus(phrase);
t.setAttribute("type", type);
thesaurus_list.appendChild(t);
}
}
}
return true;
}
/**
 * Builds a thesaurus element for a phrase from its pdata record.
 *
 * The element carries tf, df and id attributes and a phrase text child.
 *
 * @param phrase_num the number of the linked phrase
 * @return the thesaurus element, or null if there is no usable record
 *         for phrase_num
 */
protected Element getThesaurus(long phrase_num) {
    // look up the phrase in the pdata thingy
    String record = this.mgpp_src.getDocument(this.basepath+File.separatorChar+"pdata", "Document",
                                              phrase_num);
    if (record ==null || record.equals("")) {
        return null;
    }

    // ignore everything up to and including first colon
    record = record.substring(record.indexOf(':')+1);
    String [] fields = record.split(":");
    if (fields.length < 4) {
        // phrase, tf and df are read below; treat a shorter record like a
        // missing one instead of failing with an index error
        return null;
    }
    String phrase = fields[0];
    String tf = fields[1];
    //String ef = fields[2]; dont use this
    String df = fields[3];

    Element thesaurus = this.doc.createElement("thesaurus");
    thesaurus.setAttribute("tf", tf);
    thesaurus.setAttribute("df", df);
    thesaurus.setAttribute("id", Long.toString(phrase_num));
    thesaurus.appendChild(GSXML.createTextElement(this.doc, "phrase", phrase));
    return thesaurus;
}
/**
 * Splits a phrase around the first occurrence of a word.
 *
 * The word is matched literally, so characters that are special in
 * regular expressions (".", "(", "+" ...) are treated as plain text.
 *
 * @param phrase the phrase to split
 * @param word the word to split on
 * @return an array of exactly two elements - the prefix and the suffix.
 *         If word is empty or does not occur in phrase, the prefix is
 *         the whole phrase and the suffix is empty.
 */
protected String [] splitPhraseOnWord(String phrase, String word) {
    // only the first occurrence counts; an empty word never matches
    int pos = word.equals("") ? -1 : phrase.indexOf(word);
    if (pos == -1) {
        return new String [] {phrase, ""};
    }
    String prefix = phrase.substring(0, pos);
    String suffix = phrase.substring(pos + word.length());
    return new String [] {prefix, suffix};
}
/**
 * Wraps a message in a phindError element.
 *
 * @param message the error text
 * @return a new phindError element with the message as its text content
 */
protected Element phindError(String message) {
    Element error_elem = this.doc.createElement("phindError");
    error_elem.appendChild(this.doc.createTextNode(message));
    return error_elem;
}
}