Context Navigation

source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/service/PhindPhraseBrowse.java@ 32453

Last change on this file since 32453 was 29309, checked in by kjdon, 10 years ago
removed my name
Property svn:keywords set to `Author Date Id Revision`
File size: 16.9 KB

Line
1	/*
2	* PhindPhraseBrowse.java
3	* Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4	*
5	* This program is free software; you can redistribute it and/or modify
6	* it under the terms of the GNU General Public License as published by
7	* the Free Software Foundation; either version 2 of the License, or
8	* (at your option) any later version.
9	*
10	* This program is distributed in the hope that it will be useful,
11	* but WITHOUT ANY WARRANTY; without even the implied warranty of
12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13	* GNU General Public License for more details.
14	*
15	* You should have received a copy of the GNU General Public License
16	* along with this program; if not, write to the Free Software
17	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18	*/
19	package org.greenstone.gsdl3.service;
20
21	import org.greenstone.gsdl3.util.*;
22
23	import org.greenstone.mgpp.*;
24	import org.w3c.dom.Document;
25	import org.w3c.dom.Node;
26	import org.w3c.dom.Element;
27	import org.w3c.dom.Text;
28
29	import java.util.Vector;
30	import java.util.HashMap;
31	import java.io.File;
32	import java.io.Serializable;
33
34	import org.apache.log4j.*;
35
36	/**
37	* PhindPhraseBrowse - the phind phrase browsing service
38	*
39	*/
40	public class PhindPhraseBrowse
41	extends ServiceRack {
42
43	static Logger logger = Logger.getLogger(org.greenstone.gsdl3.service.PhindPhraseBrowse.class.getName());
44
45	// the services on offer
46	private static final String PHIND_SERVICE = "PhindApplet";
47
48	private static MGPPRetrieveWrapper mgpp_retrieve_src=null;
49	private static MGPPSearchWrapper mgpp_search_src=null;
50	private String basepath = null;
51
52	private Element applet_description = null;
53
54	public PhindPhraseBrowse() {
55	if(this.mgpp_retrieve_src == null) {
56	this.mgpp_retrieve_src = new MGPPRetrieveWrapper();
57	}
58	if(this.mgpp_search_src == null) {
59	this.mgpp_search_src = new MGPPSearchWrapper();
60	}
61	// set up the default params
62	this.mgpp_search_src.setQueryLevel("Document");
63	this.mgpp_search_src.setReturnLevel("Document");
64	this.mgpp_search_src.setMaxDocs(5);
65	this.mgpp_search_src.setStem(false);
66	this.mgpp_search_src.setCase(true);
67	}
68
69	public void cleanUp() {
70	super.cleanUp();
71	this.mgpp_search_src.unloadIndexData();
72	}
73
74	/** configure the service module
75	*
76	* @param info a DOM Element containing any config info for the service
77	* @return true if configured
78	*/
79	public boolean configure(Element info, Element extra_info) {
80
81	if (!super.configure(info, extra_info)){
82	return false;
83	}
84
85	logger.info("configuring PhindPhraseBrowse");
86
87	// set up short_service_info_ - for now just has name and type
88	Element e = this.desc_doc.createElement(GSXML.SERVICE_ELEM);
89	e.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_APPLET);
90	e.setAttribute(GSXML.NAME_ATT, PHIND_SERVICE);
91	this.short_service_info.appendChild(e);
92
93	// set up the static applet description
94
95	applet_description = this.desc_doc.createElement(GSXML.SERVICE_ELEM);
96	applet_description.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_APPLET);
97	applet_description.setAttribute(GSXML.NAME_ATT, PHIND_SERVICE);
98
99	// add in the applet info for the phind applet
100	// need to make this dynamic - library names etc
101	// change the applet params - have a single param with the library name
102	// this is left blank at this end, and must be filled in by applet action - if the library name is not needed, this param is left out
103	// phindcgi param now is not complete - library must be prepended to it.
104	String app_info = "<"+GSXML.APPLET_ELEM+" CODEBASE='applet' CODE='org.greenstone.applet.phind.Phind.class' ARCHIVE='phind.jar, xercesImpl.jar, xml-apis.jar' WIDTH='500' HEIGHT='400'><PARAM NAME='library' VALUE=''/> <PARAM NAME='phindcgi' VALUE='?";
105	app_info += GSParams.ACTION +"=a&"+GSParams.REQUEST_TYPE +"=r&"+GSParams.SERVICE+"="+PHIND_SERVICE+"&"+GSParams.OUTPUT+"=xml&"+GSParams.RESPONSE_ONLY+"=1'/>";
106	app_info +="<PARAM NAME='collection' VALUE='";
107	app_info += this.cluster_name;
108	app_info += "'/> <PARAM NAME='classifier' VALUE='1'/> <PARAM NAME='orientation' VALUE='vertical'/> <PARAM NAME='depth' VALUE='2'/> <PARAM NAME='resultorder' VALUE='L,l,E,e,D,d'/> <PARAM NAME='backdrop' VALUE='interfaces/default/images/phindbg1.jpg'/><PARAM NAME='fontsize' VALUE='10'/> <PARAM NAME='blocksize' VALUE='10'/>The Phind java applet.</"+GSXML.APPLET_ELEM+">";
109
110	Document dom = this.converter.getDOM(app_info);
111	if (dom==null) {
112	logger.error("Couldn't parse applet info");
113	return false;
114	}
115	Element app_elem = dom.getDocumentElement();
116	applet_description.appendChild(this.desc_doc.importNode(app_elem, true));
117
118	return true;
119	}
120
121	protected Element getServiceDescription(Document doc, String service, String lang, String subset) {
122	if (!service.equals(PHIND_SERVICE)) {
123	return null;
124	}
125	Element describe = (Element)doc.importNode(applet_description,true);
126
127	Element el1 = GSXML.createDisplayTextElement(doc, GSXML.DISPLAY_TEXT_NAME, getTextString(PHIND_SERVICE+".name", lang));
128	describe.appendChild(el1);
129
130	Element el2 = GSXML.createDisplayTextElement(doc, GSXML.DISPLAY_TEXT_DESCRIPTION, getTextString(PHIND_SERVICE+".description", lang));
131	describe.appendChild(el2);
132
133	return describe;
134	}
135
136	protected Element processPhindApplet(Element request) {
137	Document result_doc = XMLConverter.newDOM();
138	Element param_elem = (Element)GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
139	HashMap<String, Serializable> params = GSXML.extractParams(param_elem, false);
140
141	long first_e = Long.parseLong((String)params.get("pfe"));
142	long last_e = Long.parseLong((String)params.get("ple"));
143	long first_l = Long.parseLong((String)params.get("pfl"));
144	long last_l = Long.parseLong((String)params.get("pll"));
145	long first_d = Long.parseLong((String)params.get("pfd"));
146	long last_d = Long.parseLong((String)params.get("pld"));
147
148	long phrase;
149	String phrase_str = (String)params.get("ppnum");
150	if (phrase_str == null \|\| phrase_str.equals("")) {
151	phrase=0;
152	} else {
153	phrase = Long.parseLong(phrase_str);
154	}
155	String word = (String)params.get("pptext");
156	String phind_index = (String)params.get("pc");
157	// the location of the mgpp database files
158	this.basepath = GSFile.phindBaseDir(this.site_home, this.cluster_name, phind_index);
159
160	// the result element
161	Element result = result_doc.createElement(GSXML.RESPONSE_ELEM);
162	result.setAttribute(GSXML.FROM_ATT, PHIND_SERVICE);
163	result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
164
165	// applet result info must be in appletInfo element
166	Element applet_data = result_doc.createElement(GSXML.APPLET_DATA_ELEM);
167	result.appendChild(applet_data);
168	Element phind_data = result_doc.createElement("phindData");
169	applet_data.appendChild(phind_data);
170
171
172	// if we dont know the phrase number, look it up
173	if (phrase == 0) {
174	if (word==null \|\| word.equals("")) {
175	Element error = phindError(result_doc, "no word or phrase");
176	phind_data.appendChild(error);
177	return result;
178	}
179	phrase = findPhraseNumberFromWord( word);
180	}
181	if (phrase==0) {
182	// the word is not in the collection
183	// return a phind error string
184	Element error = phindError(result_doc, "the term "+word+" is not in the collection");
185	phind_data.appendChild(error);
186	return result;
187	}
188
189	// get the phrase data into the phind_data node
190	getPhraseData(phind_data, phrase, first_l, last_l,
191	first_e, last_e, first_d, last_d);
192	return result;
193
194
195	}// processPhindApplet
196
197	protected long findPhraseNumberFromWord(String word) {
198	synchronized (mgpp_search_src) {
199	// set the mgpp index data - we are looking up pword
200	mgpp_search_src.loadIndexData(this.basepath+File.separatorChar+"pword");
201
202	mgpp_search_src.runQuery(word);
203
204	MGPPQueryResult res = mgpp_search_src.getQueryResult();
205	Vector docs = res.getDocs();
206	if (docs.size()==0) {
207	// phrase not found
208	return 0;
209	}
210	MGPPDocInfo doc = (MGPPDocInfo)docs.firstElement();
211	return doc.num_;
212	}
213	}
214
215	protected boolean getPhraseData(Element phind_data,
216	long phrase, long first_l, long last_l,
217	long first_e, long last_e, long first_d,
218	long last_d) {
219
220	synchronized (mgpp_retrieve_src) {
221	String record = this.mgpp_retrieve_src.getDocument(this.basepath+File.separatorChar+"pdata", "Document",
222	phrase);
223	if (record.equals("")) {
224	Element error = phindError(phind_data.getOwnerDocument(), "somethings gone wrong - we haven't got a record for phrase number "+phrase);
225	phind_data.appendChild(error);
226	return false;
227	}
228
229	// parse the record - its in gordons cryptic form
230	// ":word:tf:ef:df:el:dl:lf:ll"
231	// el: e,e,e
232	// dl: d;f,d;f,
233	// lf and ll may be null
234	// l: type,dest, dest; type,dest,dest
235
236	// ignore everything up to and including first colon (has
237	// <Document>3505: at the start)
238	record = record.substring(record.indexOf(':')+1);
239
240	// split on ':'
241	String [] fields = record.split(":");
242	String word = fields[0];
243	String tf = fields[1];
244	String ef = fields[2];
245	String df = fields[3];
246
247
248	String expansions = fields[4];
249	String documents = fields[5];
250	String lf = "0";
251	String linklist = "";
252	if (fields.length > 7) {// have thesaurus stuff
253	lf =fields[6];
254	linklist = fields[7];
255	}
256
257	// the phindData attributes and phrase
258	phind_data.setAttribute("id", Long.toString(phrase));
259	phind_data.setAttribute("df", df);
260	phind_data.setAttribute("ef", ef);
261	phind_data.setAttribute("lf", lf);
262	phind_data.setAttribute("tf", tf);
263	// GSXML.createTextElement(result_doc, "phrase", word); ??? - this needs to be appended somewhere????
264
265	addExpansionList(phind_data, expansions, word, ef, first_e, last_e);
266	addDocumentList(phind_data, documents, word, df, first_d, last_d);
267	if (!lf.equals("0")) {
268	addThesaurusList(phind_data, linklist, word, lf, first_l, last_l);
269	}
270	return true;
271	} // end of synchronized (mgpp_retrieve_src)
272	}
273
274	protected boolean addExpansionList( Element phind_data, String record,
275	String word,
276	String freq,
277	long first, long last) {
278	Document phind_doc = phind_data.getOwnerDocument();
279	Element expansion_list = phind_doc.createElement("expansionList");
280	phind_data.appendChild(expansion_list);
281	expansion_list.setAttribute("length", freq);
282	expansion_list.setAttribute("start", Long.toString(first));
283	expansion_list.setAttribute("end", Long.toString(last));
284
285	// get the list of strings
286	String [] expansions = record.split(",");
287	int length = expansions.length;
288	if (length < last) last = length;
289	for (long i = first; i < last; i++) {
290	long num = Long.parseLong(expansions[(int)i]);
291	Element expansion = getExpansion(phind_doc, num, word);
292	expansion.setAttribute("num", Long.toString(i));
293	expansion_list.appendChild(expansion);
294	}
295	return true;
296	}
297
298	protected Element getExpansion(Document phind_doc, long phrase_num,
299	String orig_phrase) {
300
301	// look up the phrase in the pdata thingy
302	String record = this.mgpp_retrieve_src.getDocument(this.basepath+File.separatorChar+"pdata", "Document",
303	phrase_num);
304
305	if (record ==null \|\| record.equals("")) return null;
306
307	// ignore everything up to and including first colon
308	record = record.substring(record.indexOf(':')+1);
309
310	String [] fields = record.split(":");
311	String phrase = fields[0];
312	String tf = fields[1];
313	//String ef = fields[2]; dont use this
314	String df = fields[3];
315
316	Element expansion = phind_doc.createElement("expansion");
317	expansion.setAttribute("tf", tf);
318	expansion.setAttribute("df", df);
319	expansion.setAttribute("id", Long.toString(phrase_num));
320
321	// get teh suffix and prefix
322	String [] ends = splitPhraseOnWord(phrase, orig_phrase);
323	if (!ends[0].equals("")) {
324	expansion.appendChild(GSXML.createTextElement(phind_doc, "prefix", ends[0]));
325	}
326	if (!ends[1].equals("")) {
327	expansion.appendChild(GSXML.createTextElement(phind_doc, "suffix", ends[1]));
328	}
329
330	return expansion;
331
332	}
333
334	protected boolean addDocumentList(Element phind_data, String record,
335	String word,
336	String freq,
337	long first, long last) {
338	Document phind_doc = phind_data.getOwnerDocument();
339	Element document_list = phind_doc.createElement("documentList");
340	phind_data.appendChild(document_list);
341	document_list.setAttribute("length", freq);
342	document_list.setAttribute("start", Long.toString(first));
343	document_list.setAttribute("end", Long.toString(last));
344
345	// get the list of doc,freq
346	String [] doc_freqs = record.split(";");
347	int length = doc_freqs.length;
348	if (length<last) last=length;
349
350	for (long i = first; i < last; i++) {
351	String doc_elem = doc_freqs[(int)i];
352	int p = doc_elem.indexOf(',');
353	long doc_num;
354	String doc_freq;
355	if (p == -1) { // there is no freq in the record
356	doc_num =Long.parseLong(doc_elem);
357	doc_freq = "1";
358	} else {
359	doc_num = Long.parseLong(doc_elem.substring(0,p));
360	doc_freq = doc_elem.substring(p+1);
361	}
362	Element document = getDocument(phind_doc, doc_num);
363	document.setAttribute("freq", doc_freq);
364	document.setAttribute("num", Long.toString(i));
365	document_list.appendChild(document);
366	}
367
368
369	return true;
370	}
371
372
373	protected Element getDocument(Document phind_doc, long doc_num) {
374
375	// look up the phrase in the docs thingy
376	String record = this.mgpp_retrieve_src.getDocument(this.basepath+File.separatorChar+"docs", "Document",
377	doc_num);
378
379	if (record ==null \|\| record.equals("")) return null;
380
381	// ignore everything up to and including first \t
382	record = record.substring(record.indexOf('\t')+1);
383
384	String [] fields = record.split("\t");
385	String hash = fields[0];
386	String title = fields[1];
387
388	Element d = phind_doc.createElement("document");
389	d.setAttribute("hash", hash);
390	d.appendChild(GSXML.createTextElement(phind_doc, "title", title));
391
392	return d;
393
394	}
395	protected boolean addThesaurusList(Element phind_data, String record,
396	String word,
397	String freq,
398	long first, long last) {
399
400	Document phind_doc = phind_data.getOwnerDocument();
401	Element thesaurus_list = phind_doc.createElement("thesaurusList");
402	phind_data.appendChild(thesaurus_list);
403	thesaurus_list.setAttribute("length", freq);
404	thesaurus_list.setAttribute("start", Long.toString(first));
405	thesaurus_list.setAttribute("end", Long.toString(last));
406
407	// get the list of type,dest,dest
408	String [] links = record.split(";");
409	int length = links.length;
410	long index = 0;
411	for (int i = 0; i < length; i++) { // go through the entries
412	String link_info = links[(int)i];
413	String [] items = link_info.split(",");
414	// the first entry is teh type
415	String type = items[0];
416	for (int j = 1; j<items.length; j++, index++) {
417	if (index >= first && index < last) { // only output the ones we want
418	long phrase = Long.parseLong(items[j]);
419	Element t = getThesaurus(phind_doc, phrase);
420	t.setAttribute("type", type);
421	thesaurus_list.appendChild(t);
422	}
423	}
424	}
425
426	return true;
427	}
428
429	protected Element getThesaurus(Document phind_doc, long phrase_num) {
430
431	// look up the phrase in the pdata thingy
432	String record = this.mgpp_retrieve_src.getDocument(this.basepath+File.separatorChar+"pdata", "Document",
433	phrase_num);
434
435	if (record ==null \|\| record.equals("")) return null;
436
437	// ignore everything up to and including first colon
438	record = record.substring(record.indexOf(':')+1);
439
440	String [] fields = record.split(":");
441	String phrase = fields[0];
442	String tf = fields[1];
443	//String ef = fields[2]; dont use this
444	String df = fields[3];
445
446	Element thesaurus = phind_doc.createElement("thesaurus");
447	thesaurus.setAttribute("tf", tf);
448	thesaurus.setAttribute("df", df);
449	thesaurus.setAttribute("id", Long.toString(phrase_num));
450	thesaurus.appendChild(GSXML.createTextElement(phind_doc, "phrase", phrase));
451	return thesaurus;
452
453	}
454
455	/** returns an array of two elements - the prefix and the suffix*/
456	protected String [] splitPhraseOnWord(String phrase, String word) {
457
458	if (word.equals("")) {
459
460	String [] res = {phrase, ""};
461	return res;
462	}
463	// use 2 so that we only split on the first occurrance. trailing empty strings should be included
464	String [] result = phrase.split(word, 2);
465	return result;
466
467	}
468
469	protected Element phindError(Document phind_doc, String message) {
470	Element e = phind_doc.createElement("phindError");
471	Text t = phind_doc.createTextNode(message);
472	e.appendChild(t);
473	return e;
474	}
475
476	}
477

Note: See TracBrowser for help on using the repository browser.

Download in other formats: