Context Navigation

source: trunk/gsdl3/src/java/org/greenstone/gsdl3/service/PhindService.java@ 3471

Last change on this file since 3471 was 3471, checked in by kjdon, 22 years ago
service modules now belong to a serviceCluster or collection - collection_name has been changed to the more general cluster_name. A service module can't configure itself from a file - we no longer know where the appropriate file is - so it must be configured by passing the xml node to the configure method
Property svn:keywords set to `Author Date Id Revision`
File size: 15.9 KB

Line
1	/*
2	* PhindService.java
3	* Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4	*
5	* This program is free software; you can redistribute it and/or modify
6	* it under the terms of the GNU General Public License as published by
7	* the Free Software Foundation; either version 2 of the License, or
8	* (at your option) any later version.
9	*
10	* This program is distributed in the hope that it will be useful,
11	* but WITHOUT ANY WARRANTY; without even the implied warranty of
12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13	* GNU General Public License for more details.
14	*
15	* You should have received a copy of the GNU General Public License
16	* along with this program; if not, write to the Free Software
17	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18	*/
19	package org.greenstone.gsdl3.service;
20
21	import org.greenstone.gsdl3.util.*;
22
23	import org.greenstone.mgpp.*;
24	import org.w3c.dom.Document;
25	import org.w3c.dom.Node;
26	import org.w3c.dom.Element;
27	import org.w3c.dom.Text;
28
29	import java.util.Vector;
30	import java.util.HashMap;
31	/**
32	* PhindService - the phind phrase browsing service
33	*
34	* @author <a href="mailto:[email protected]">Katherine Don</a>
35	* @version $Revision: 3471 $
36	*/
37	public class PhindService
38	extends ServiceModule {
39
40	private MGPPWrapper mgpp_src_=null;
41	private String basepath_ = null;
/**
 * Creates the service and its MGPP wrapper, applying the default query
 * params: query and return level "Document", at most 5 docs, stem set
 * to false and case set to true.
 */
public PhindService() {
    MGPPWrapper wrapper = new MGPPWrapper();
    // set up the default params on the fresh wrapper before publishing it
    wrapper.setQueryLevel("Document");
    wrapper.setReturnLevel("Document");
    wrapper.setMaxDocs(5);
    wrapper.setStem(false);
    wrapper.setCase(true);
    this.mgpp_src_ = wrapper;
}
51	/** configure the service module
52	*
53	* @param info a DOM Element containing any config info for the service
54	* @return true if configured
55	*/
56	public boolean configure(Element info) {
57
58	System.out.println("configuring PhindService");
59
60	// set up short_service_info_ - for now just has name and type
61	Element e = doc_.createElement("service");
62	e.setAttribute("type", "query");
63	e.setAttribute("name", "PhindApplet");
64	short_service_info_.appendChild(e);
65
66	// set up service_info_map_ - we only have one element, and it has
67	// no extra info yet - we are not processing the config info
68	Element f = doc_.createElement("service");
69	f.setAttribute("type", "query");
70	f.setAttribute("name", "PhindApplet");
71
72	// add in the applet info for the phind applet
73	// need to make this dynamic - library names etc
74	// change the applet params - have a single param with the library name
75	// this is left blank at this end, and must be filled in by applet action - if the library name is not needed, this param is left out
76	// phindcgi param now is not complete - library must be prepended to it.
77	String app_info = "<applet CODEBASE='lib/java' CODE='org.greenstone.applet.phind.Phind.class' ARCHIVE='phind.jar, xercesImpl.jar, gsdl3.jar, jaxp.jar, xml-apis.jar' WIDTH='500' HEIGHT='400'><PARAM NAME='library' VALUE=''/> <PARAM NAME='phindcgi' VALUE='?a=a&sa=r&sn=Phind'/>";
78	app_info +="<PARAM NAME='collection' VALUE='";
79	app_info += cluster_name_;
80	app_info += "'/> <PARAM NAME='classifier' VALUE='1'/> <PARAM NAME='orientation' VALUE='vertical'/> <PARAM NAME='depth' VALUE='2'/> <PARAM NAME='resultorder' VALUE='L,l,E,e,D,d'/> <PARAM NAME='backdrop' VALUE='interfaces/default/images/phindbg1.jpg'/><PARAM NAME='fontsize' VALUE='10'/> <PARAM NAME='blocksize' VALUE='10'/>The Phind java applet.</applet>";
81
82	Document dom = converter_.getDOM(app_info);
83	Element app_elem = dom.getDocumentElement();
84	f.appendChild(doc_.importNode(app_elem, true));
85
86	service_info_map_.put("PhindApplet", f);
87
88	return true;
89	}
90
91	protected Element processService(String name, Element request) {
92
93	if (!name.equals("PhindApplet")) {
94	System.err.println("PhindService:you have asked for a non-existant service - "+name+"!");
95	return null;
96	}
97	Element param_elem = (Element)GSXML.getChildByTagName(request, "paramList");
98	HashMap params = GSXML.extractParams(param_elem);
99
100	long first_e = Long.parseLong((String)params.get("pfe"));
101	long last_e = Long.parseLong((String)params.get("ple"));
102	long first_l = Long.parseLong((String)params.get("pfl"));
103	long last_l = Long.parseLong((String)params.get("pll"));
104	long first_d = Long.parseLong((String)params.get("pfd"));
105	long last_d = Long.parseLong((String)params.get("pld"));
106
107	long phrase;
108	String phrase_str = (String)params.get("ppnum");
109	if (phrase_str == null \|\| phrase_str.equals("")) {
110	phrase=0;
111	} else {
112	phrase = Long.parseLong(phrase_str);
113	}
114	String word = (String)params.get("pptext");
115	String phind_index = (String)params.get("pc");
116	// the location of the mgpp database files
117	basepath_ = GSFile.phindBaseDir(site_home_, cluster_name_, phind_index);
118
119	// the result element
120	Element result = doc_.createElement("response");
121	String from = GSPath.appendLink(cluster_name_, "PhindApplet");
122	result.setAttribute("from", from);
123	result.setAttribute("type", "query");
124
125	// applet result info must be in appletInfo element
126	Element applet_data = doc_.createElement("appletData");
127	result.appendChild(applet_data);
128	Element phind_data = doc_.createElement("phindData");
129	applet_data.appendChild(phind_data);
130
131
132	// if we dont know the phrase number, look it up
133	if (phrase == 0) {
134	if (word==null \|\| word.equals("")) {
135	Element error = phindError("no word or phrase");
136	phind_data.appendChild(error);
137	return result;
138	}
139	phrase = findPhraseNumberFromWord( word);
140	System.out.println("phind, term number for "+word+" is "+phrase);
141	}
142	if (phrase==0) {
143	// the word is not in the collection
144	// return a phind error string
145	Element error = phindError("the term "+word+" is not in the collection");
146	phind_data.appendChild(error);
147	return result;
148	}
149
150	// get the phrase data into the phind_data node
151	getPhraseData(phind_data, phrase, first_l, last_l,
152	first_e, last_e, first_d, last_d);
153	return result;
154
155
156	}// processService
157
158	protected long findPhraseNumberFromWord(String word) {
159
160	// set the mgpp index data - we are looking up pword
161	mgpp_src_.loadIndexData(basepath_, "pword");
162
163	mgpp_src_.runQuery(word);
164
165	MGPPQueryResult res = mgpp_src_.getQueryResult();
166	Vector docs = res.getDocs();
167	if (docs.size()==0) {
168	// phrase not found
169	return 0;
170	}
171	MGPPDocInfo doc = (MGPPDocInfo)docs.firstElement();
172	return doc.num_;
173	}
174
175	protected boolean getPhraseData(Element phind_data,
176	long phrase, long first_l, long last_l,
177	long first_e, long last_e, long first_d,
178	long last_d) {
179
180	String record = mgpp_src_.getDocument(basepath_, "pdata", "Document",
181	phrase);
182	if (record.equals("")) {
183	Element error = phindError("somethings gone wrong - we haven't got a record for phrase number "+phrase);
184	phind_data.appendChild(error);
185	return false;
186	}
187
188	System.out.println("record="+record);
189	// parse the record - its in gordons cryptic form
190	// ":word:tf:ef:df:el:dl:lf:ll"
191	// el: e,e,e
192	// dl: d;f,d;f,
193	// lf and ll may be null
194	// l: type,dest, dest; type,dest,dest
195
196	// ignore everything up to and including first colon (has
197	// <Document>3505: at the start)
198	record = record.substring(record.indexOf(':')+1);
199
200	// split on ':'
201	String [] fields = record.split(":");
202	String word = fields[0];
203	String tf = fields[1];
204	String ef = fields[2];
205	String df = fields[3];
206
207
208	String expansions = fields[4];
209	String documents = fields[5];
210	String lf = "0";
211	String linklist = "";
212	if (fields.length > 7) {// have thesaurus stuff
213	lf =fields[6];
214	linklist = fields[7];
215	}
216
217	// the phindData attributes and phrase
218	phind_data.setAttribute("id", Long.toString(phrase));
219	phind_data.setAttribute("df", df);
220	phind_data.setAttribute("ef", ef);
221	phind_data.setAttribute("lf", lf);
222	phind_data.setAttribute("tf", tf);
223	GSXML.createTextElement(doc_, "phrase", word);
224
225	addExpansionList(phind_data, expansions, word, ef, first_e, last_e);
226	addDocumentList(phind_data, documents, word, df, first_d, last_d);
227	if (!lf.equals("0")) {
228	System.out.println("adding thesaurus stuff");
229	addThesaurusList(phind_data, linklist, word, lf, first_l, last_l);
230	}
231	return true;
232	}
233
234	protected boolean addExpansionList( Element phind_data, String record,
235	String word,
236	String freq,
237	long first, long last) {
238
239	Element expansion_list = doc_.createElement("expansionList");
240	phind_data.appendChild(expansion_list);
241	expansion_list.setAttribute("length", freq);
242	expansion_list.setAttribute("start", Long.toString(first));
243	expansion_list.setAttribute("end", Long.toString(last));
244
245	// get the list of strings
246	String [] expansions = record.split(",");
247	int length = expansions.length;
248	if (length < last) last = length;
249	for (long i = first; i < last; i++) {
250	long num = Long.parseLong(expansions[(int)i]);
251	Element expansion = getExpansion( num, word);
252	expansion.setAttribute("num", Long.toString(i));
253	expansion_list.appendChild(expansion);
254	}
255	return true;
256	}
257
258	protected Element getExpansion(long phrase_num,
259	String orig_phrase) {
260
261	// look up the phrase in the pdata thingy
262	String record = mgpp_src_.getDocument(basepath_, "pdata", "Document",
263	phrase_num);
264
265	if (record ==null \|\| record.equals("")) return null;
266
267	// ignore everything up to and including first colon
268	record = record.substring(record.indexOf(':')+1);
269
270	String [] fields = record.split(":");
271	String phrase = fields[0];
272	String tf = fields[1];
273	//String ef = fields[2]; dont use this
274	String df = fields[3];
275
276	Element expansion = doc_.createElement("expansion");
277	expansion.setAttribute("tf", tf);
278	expansion.setAttribute("df", df);
279	expansion.setAttribute("id", Long.toString(phrase_num));
280
281	// get teh suffix and prefix
282	String [] ends = splitPhraseOnWord(phrase, orig_phrase);
283	if (!ends[0].equals("")) {
284	expansion.appendChild(GSXML.createTextElement(doc_, "prefix", ends[0]));
285	}
286	if (!ends[1].equals("")) {
287	expansion.appendChild(GSXML.createTextElement(doc_, "suffix", ends[1]));
288	}
289
290	return expansion;
291
292	}
293
294	protected boolean addDocumentList(Element phind_data, String record,
295	String word,
296	String freq,
297	long first, long last) {
298
299	Element document_list = doc_.createElement("documentList");
300	phind_data.appendChild(document_list);
301	document_list.setAttribute("length", freq);
302	document_list.setAttribute("start", Long.toString(first));
303	document_list.setAttribute("end", Long.toString(last));
304
305	// get the list of doc,freq
306	String [] doc_freqs = record.split(";");
307	int length = doc_freqs.length;
308	if (length<last) last=length;
309
310	for (long i = first; i < last; i++) {
311	String doc_elem = doc_freqs[(int)i];
312	int p = doc_elem.indexOf(',');
313	long doc_num;
314	String doc_freq;
315	if (p == -1) { // there is no freq in the record
316	doc_num =Long.parseLong(doc_elem);
317	doc_freq = "1";
318	} else {
319	doc_num = Long.parseLong(doc_elem.substring(0,p));
320	doc_freq = doc_elem.substring(p+1);
321	}
322	Element document = getDocument( doc_num);
323	document.setAttribute("freq", doc_freq);
324	document.setAttribute("num", Long.toString(i));
325	document_list.appendChild(document);
326	}
327
328
329	return true;
330	}
331
332
333	protected Element getDocument(long doc_num) {
334
335	// look up the phrase in the docs thingy
336	String record = mgpp_src_.getDocument(basepath_, "docs", "Document",
337	doc_num);
338
339	if (record ==null \|\| record.equals("")) return null;
340	System.out.println("doc record:"+record);
341
342	// ignore everything up to and including first \t
343	record = record.substring(record.indexOf('\t')+1);
344
345	String [] fields = record.split("\t");
346	String hash = fields[0];
347	String title = fields[1];
348
349	Element d = doc_.createElement("document");
350	d.setAttribute("hash", hash);
351	d.appendChild(GSXML.createTextElement(doc_, "title", title));
352
353	return d;
354
355	}
356	protected boolean addThesaurusList(Element phind_data, String record,
357	String word,
358	String freq,
359	long first, long last) {
360
361
362	Element thesaurus_list = doc_.createElement("thesaurusList");
363	phind_data.appendChild(thesaurus_list);
364	thesaurus_list.setAttribute("length", freq);
365	thesaurus_list.setAttribute("start", Long.toString(first));
366	thesaurus_list.setAttribute("end", Long.toString(last));
367
368	System.out.println("record for thesaurus="+record);
369
370	// get the list of type,dest,dest
371	String [] links = record.split(";");
372	int length = links.length;
373	long index = 0;
374	for (int i = 0; i < length; i++) { // go through the entries
375	String link_info = links[(int)i];
376	String [] items = link_info.split(",");
377	// the first entry is teh type
378	String type = items[0];
379	for (int j = 1; j<items.length; j++, index++) {
380	if (index >= first && index < last) { // only output the ones we want
381	long phrase = Long.parseLong(items[j]);
382	Element t = getThesaurus(phrase);
383	t.setAttribute("type", type);
384	thesaurus_list.appendChild(t);
385	}
386	}
387	}
388
389	return true;
390	}
391
392	protected Element getThesaurus(long phrase_num) {
393
394	// look up the phrase in the pdata thingy
395	String record = mgpp_src_.getDocument(basepath_, "pdata", "Document",
396	phrase_num);
397
398	if (record ==null \|\| record.equals("")) return null;
399
400	// ignore everything up to and including first colon
401	record = record.substring(record.indexOf(':')+1);
402
403	String [] fields = record.split(":");
404	String phrase = fields[0];
405	String tf = fields[1];
406	//String ef = fields[2]; dont use this
407	String df = fields[3];
408
409	Element thesaurus = doc_.createElement("thesaurus");
410	thesaurus.setAttribute("tf", tf);
411	thesaurus.setAttribute("df", df);
412	thesaurus.setAttribute("id", Long.toString(phrase_num));
413	thesaurus.appendChild(GSXML.createTextElement(doc_, "phrase", phrase));
414	return thesaurus;
415
416	}
417
418	/** returns an array of two elements - the prefix and the suffix*/
419	protected String [] splitPhraseOnWord(String phrase, String word) {
420
421	if (word.equals("")) {
422
423	String [] res = {phrase, ""};
424	return res;
425	}
426	// use 2 so that we only split on the first occurrance. trailing empty strings should be included
427	String [] result = phrase.split(word, 2);
428	if (result.length !=2) {
429	System.out.println("didn't get two substrings!!");
430	}
431	return result;
432
433	}
434
435	protected Element phindError(String message) {
436	Element e = doc_.createElement("phindError");
437	Text t = doc_.createTextNode(message);
438	e.appendChild(t);
439	return e;
440	}
441
442	}
443
444
445	/*
446	// CREATE dummy response
447	Element res = doc_.createElement("response");
448	res.setAttribute("from", "PhindApplet");
449	Element data = doc_.createElement("service");
450	Element app_data = doc_.createElement("appletData");
451	data.appendChild(app_data);
452	String phind_info ="<phindData id='2507' tf='19424' ef='1632' df='1843' lf='0'><phrase>FOREST</phrase><expansionList length='1632' start='0' end='10'><expansion num='0' id='177648' tf='2162' df='519'><suffix>MANAGEMENT</suffix></expansion> <expansion num='1' id='177531' tf='1958' df='566'><suffix>PRODUCTS</suffix></expansion> <expansion num='2' id='177469' tf='1328' df='532'><suffix>RESOURCES</suffix></expansion> <expansion num='3' id='177773' tf='943' df='177'><suffix>GENETIC</suffix></expansion> <expansion num='4' id='177335' tf='736' df='258'><prefix>SUSTAINABLE</prefix></expansion> </expansionList><documentList length='1843' start='0' end='10'><document num='0' hash='HASH011fb8a7d8bf781ab3cbb087' freq='363'><title>FO-edu List of Countries 0</title></document><document num='1' hash='HASH27ae41229eb0636849a5be' freq='344' ><title>FO-edu List of Countries 1</title></document><document num='2' hash='HASH0187ef85c9dbf5bf132ea1d1' freq='263'><title>FO-edu List of Countries 2</title></document><document num='3' hash='HASH0125ec9ef67960446f471280' freq='238'><title>FO-edu List of Countries 3</title></document><document num='4' hash='HASH67087f7717eb35050ce1ac' freq='213'><title>FO-edu List of Countries 4</title></document></documentList><thesaurusList><thesaurus num='3' id='36506' tf='0' df='0' type='RT'><phrase>FRANCOPHONE</phrase></thesaurus></thesaurusList></phindData>";
453
454	Node t = converter_.getDOM(phind_info).getDocumentElement();
455	app_data.appendChild(doc_.importNode(t, true));
456
457	res.appendChild(data);
458
459	return res;
460	*/

Note: See TracBrowser for help on using the repository browser.

Download in other formats: