source: trunk/gsdl3/src/java/org/greenstone/gsdl3/service/IViaProxy.java@ 8158

Last change on this file since 8158 was 8158, checked in by nzdl, 20 years ago

renamed InfomineProxy to IViaProxy, and made it a bit more general - iVia address now specified by config file

  • Property svn:keywords set to Author Date Id Revision
File size: 18.2 KB
Line 
1package org.greenstone.gsdl3.service;
2
3// Greenstone classes
4import org.greenstone.gdbm.*;
5import org.greenstone.gsdl3.util.*;
6
7// XML classes
8import org.w3c.dom.Element;
9import org.w3c.dom.Document;
10import org.w3c.dom.NodeList;
11
12import java.util.HashMap;
13import java.io.File;
14import java.io.InputStream;
15import java.io.BufferedReader;
16import java.io.InputStreamReader;
17import java.io.IOException;
18import java.net.HttpURLConnection;
19import java.net.URLConnection;
20import java.net.URL;
21import java.net.Authenticator;
22import java.net.MalformedURLException;
23
24/**
25 *
26 * @author <a href="mailto:[email protected]">Katherine Don</a>
27 * @version $Revision: 8158 $
28 */
29
30public class IViaProxy
31 extends ServiceRack {
32
33 // the services on offer
34 // these strings must match what is found in the properties file
35 protected static final String TEXT_QUERY_SERVICE = "TextQuery";
36 protected static final String DOC_CONTENT_SERVICE = "DocumentContentRetrieve";
37 protected static final String DOC_META_SERVICE = "DocumentMetadataRetrieve";
38 protected static final String QUERY_PARAM = "query";
39 protected static final String FIELD_PARAM = "fields";
40 // have standard gs param names for hits per page, and start page
41 // these need to be mapped to iVia params
42 protected static final String GS_HITS_PARAM = "hitsPerPage";
43 protected static final String IM_HITS_PARAM = "no_of_records_per_page";
44 protected static final String GS_START_PAGE_PARAM = "startPage";
45 protected static final String IM_START_PAGE_PARAM = "start_page_no";
46
47 protected String ivia_server_url = null;
48
49 public boolean configure(Element info, Element extra_info) {
50
51 Element server_elem = (Element)GSXML.getChildByTagName(info, "iViaServer");
52 if (server_elem == null) {
53 System.err.println("IViaProxy.configure error: no iViaServer element found");
54 return false;
55 }
56 ivia_server_url = server_elem.getAttribute("url");
57 if (ivia_server_url.equals("")) {
58 System.err.println("IViaProxy.configure error: no url for the iViaServer element");
59 return false;
60 }
61 Element tq_service = this.doc.createElement(GSXML.SERVICE_ELEM);
62 tq_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_QUERY);
63 tq_service.setAttribute(GSXML.NAME_ATT, TEXT_QUERY_SERVICE);
64 this.short_service_info.appendChild(tq_service);
65
66 Element dc_service = this.doc.createElement(GSXML.SERVICE_ELEM);
67 dc_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
68 dc_service.setAttribute(GSXML.NAME_ATT, DOC_CONTENT_SERVICE);
69 this.short_service_info.appendChild(dc_service);
70
71 Element dm_service = this.doc.createElement(GSXML.SERVICE_ELEM);
72 dm_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
73 dm_service.setAttribute(GSXML.NAME_ATT, DOC_META_SERVICE);
74 this.short_service_info.appendChild(dm_service);
75
76 //
77 // add some format info to service map if there is any
78 String path = GSPath.appendLink(GSXML.SEARCH_ELEM, GSXML.FORMAT_ELEM);
79 Element format = (Element) GSXML.getNodeByPath(extra_info, path);
80 if (format != null) {
81 this.format_info_map.put(TEXT_QUERY_SERVICE, this.doc.importNode(format, true));
82 }
83
84
85 // look for document display format
86 path = GSPath.appendLink(GSXML.DISPLAY_ELEM, GSXML.FORMAT_ELEM);
87 Element display_format = (Element)GSXML.getNodeByPath(extra_info, path);
88 if (display_format != null) {
89 this.format_info_map.put(DOC_CONTENT_SERVICE, this.doc.importNode(display_format, true));
90 // shoudl we make a copy?
91 }
92
93 return true;
94
95 }
96
97 protected Element getServiceDescription(String service, String lang, String subset) {
98
99 if (service.equals(TEXT_QUERY_SERVICE)) {
100 Element tq_service = this.doc.createElement(GSXML.SERVICE_ELEM);
101 tq_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_QUERY);
102 tq_service.setAttribute(GSXML.NAME_ATT, TEXT_QUERY_SERVICE);
103 if (subset == null || subset.equals(GSXML.DISPLAY_TEXT_ELEM + GSXML.LIST_MODIFIER)) {
104 tq_service.appendChild(GSXML.createDisplayTextElement(this.doc, GSXML.DISPLAY_TEXT_NAME, getTextString(TEXT_QUERY_SERVICE+".name", lang)));
105 tq_service.appendChild(GSXML.createDisplayTextElement(this.doc, GSXML.DISPLAY_TEXT_SUBMIT, getTextString(TEXT_QUERY_SERVICE+".submit", lang)));
106 tq_service.appendChild(GSXML.createDisplayTextElement(this.doc, GSXML.DISPLAY_TEXT_DESCRIPTION, getTextString(TEXT_QUERY_SERVICE+".description", lang)));
107 }
108 if (subset == null || subset.equals(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER)) {
109 Element param_list = this.doc.createElement(GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
110 tq_service.appendChild(param_list);
111 Element param = GSXML.createParameterDescription(this.doc, QUERY_PARAM, getTextString("param."+QUERY_PARAM, lang), GSXML.PARAM_TYPE_STRING, null, null, null);
112 param_list.appendChild(param);
113 String [] field_ids = {"kw", "au", "su", "ti", "de", "fu"};
114 String [] field_names = {
115 getTextString("param."+FIELD_PARAM+".kw", lang),
116 getTextString("param."+FIELD_PARAM+".au", lang),
117 getTextString("param."+FIELD_PARAM+".su", lang),
118 getTextString("param."+FIELD_PARAM+".ti", lang),
119 getTextString("param."+FIELD_PARAM+".de", lang),
120 getTextString("param."+FIELD_PARAM+".fu", lang) };
121
122 param = GSXML.createParameterDescription(this.doc, FIELD_PARAM, getTextString("param."+FIELD_PARAM, lang), GSXML.PARAM_TYPE_ENUM_MULTI, "kw,au,su,ti,de,fu", field_ids, field_names);
123 param_list.appendChild(param);
124
125
126 String [] hits_options = {"10", "30", "50"};
127 param = GSXML.createParameterDescription(this.doc, GS_HITS_PARAM, getTextString("param."+GS_HITS_PARAM, lang), GSXML.PARAM_TYPE_ENUM_SINGLE, "10", hits_options, hits_options);
128 param_list.appendChild(param);
129
130 param = GSXML.createParameterDescription(this.doc, GS_START_PAGE_PARAM, "", GSXML.PARAM_TYPE_INVISIBLE, "1", null, null);
131 param_list.appendChild(param);
132 }
133 return tq_service;
134 }
135 if (service.equals(DOC_META_SERVICE)) {
136 Element dm_service = this.doc.createElement(GSXML.SERVICE_ELEM);
137 dm_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
138 dm_service.setAttribute(GSXML.NAME_ATT, DOC_META_SERVICE);
139 return dm_service;
140
141 }
142 if (service.equals(DOC_CONTENT_SERVICE)) {
143 Element dc_service = this.doc.createElement(GSXML.SERVICE_ELEM);
144 dc_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
145 dc_service.setAttribute(GSXML.NAME_ATT, DOC_CONTENT_SERVICE);
146 return dc_service;
147
148
149 }
150 return null;
151 }
152
153 /** Process a text query - implemented by concrete subclasses */
154 protected Element processTextQuery(Element request) {
155
156 // Create a new (empty) result message
157 Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
158 result.setAttribute(GSXML.FROM_ATT, TEXT_QUERY_SERVICE);
159 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
160 Element doc_node_list = this.doc.createElement(GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
161 result.appendChild(doc_node_list);
162
163
164 Element param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
165 if (param_list == null) {
166 System.err.println("IViaProxy Error:: TextQuery request had no paramList.");
167 return result; // Return the empty result
168 }
169
170 // Process the request parameters
171 HashMap params = GSXML.extractParams(param_list, false);
172
173 // Make sure a query has been specified
174 String query = (String) params.get(QUERY_PARAM);
175 if (query == null || query.equals("")) {
176 return result; // Return the empty result
177 }
178 // tidy whitespace
179 query = query.replaceAll("\\s+", "+");
180 String url_string = ivia_server_url+"/cgi-bin/canned_search?theme=gsdl3&query="+query;
181
182 // check for fields
183 String fields = (String) params.get(FIELD_PARAM);
184 if (fields != null && !fields.equals("")) {
185 url_string += "&fields="+fields;
186 }
187
188 //check for hits per page
189 String hits_per_page = (String) params.get(GS_HITS_PARAM);
190 if (hits_per_page != null && !hits_per_page.equals("")) {
191 url_string += "&"+IM_HITS_PARAM+"="+hits_per_page;
192 }
193
194 // check for start page
195 String start_page = (String) params.get(GS_START_PAGE_PARAM);
196 if (start_page != null && !start_page.equals("")) {
197 url_string += "&"+IM_START_PAGE_PARAM+"="+start_page;
198 }
199 String results_num = null;
200 String doc_ids = null;
201 try {
202 ///ystem.err.println("IViaProxy, sending "+url_string);
203 BufferedReader reader = makeConnection(url_string);
204 results_num = reader.readLine();
205 doc_ids = reader.readLine();
206
207 } catch (Exception e) {
208 System.err.println("IViaProxy.TextQuery Error: exception happened during query");
209 e.printStackTrace();
210 return result;
211 }
212
213 if (results_num.startsWith("Resources: ")) {
214 results_num = results_num.substring(11);
215 } else {
216 System.err.println("IViaProxy.TextQuery Error: badly formatted results line: "+results_num);
217 return result;
218 }
219 if (doc_ids.startsWith("Ids: ")) {
220 doc_ids = doc_ids.substring(5).trim();
221 } else {
222 System.err.println("IViaProxy.TextQuery Error: badly formatted docs line: "+doc_ids);
223 return result;
224 }
225
226 // get the num docs and add to a metadata list
227 Element metadata_list = this.doc.createElement(GSXML.METADATA_ELEM+GSXML.LIST_MODIFIER);
228 result.appendChild(metadata_list);
229
230 // Add a metadata element specifying the number of matching documents
231 long numdocs = Long.parseLong(results_num);
232 GSXML.addMetadata(this.doc, metadata_list, "numDocsMatched", ""+numdocs);
233 String [] ids = doc_ids.split(" ");
234
235 for (int d=0; d<ids.length; d++) {
236 Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
237 doc_node.setAttribute(GSXML.NODE_ID_ATT, ids[d]);
238 doc_node_list.appendChild(doc_node);
239 }
240 ///ystem.err.println("IViaProxy result:");
241 ///ystem.err.println(this.converter.getString(result));
242 return result;
243
244 }
245
246 protected Element processDocumentMetadataRetrieve(Element request) {
247 Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
248 result.setAttribute(GSXML.FROM_ATT, DOC_META_SERVICE);
249 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
250
251 // Get the parameters of the request
252 Element param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
253 if (param_list == null) {
254 System.err.println("IViaProxy.DocumentMetadataRetrieve Error: missing paramList.\n");
255 return result; // Return the empty result
256 }
257
258 // The metadata information required
259 StringBuffer field_list = new StringBuffer();
260 Element param = (Element) param_list.getFirstChild();
261 while (param != null) {
262 // Identify the metadata information desired
263 if (param.getAttribute(GSXML.NAME_ATT).equals("metadata")) {
264 String metadata = GSXML.getValue(param);
265 if (isAcceptableMetadata(metadata)) {
266 field_list.append(metadata);
267 field_list.append(",");
268 }
269 }
270 param = (Element) param.getNextSibling();
271 }
272
273 if (field_list.length()==0) {
274 System.err.println("IViaProxy.DocumentMetadataRetrieve Error: no metadata specified.\n");
275 return result;
276 }
277
278 // Get the documents
279 Element request_node_list = (Element) GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
280 if (request_node_list == null) {
281 System.err.println("IViaProxy Error: DocumentMetadataRetrieve request had no "+GSXML.DOC_NODE_ELEM+"List.\n");
282 return result;
283 }
284
285 StringBuffer record_id_list = new StringBuffer();
286
287 NodeList request_nodes = request_node_list.getChildNodes();
288 for (int i = 0; i < request_nodes.getLength(); i++) {
289 Element request_node = (Element) request_nodes.item(i);
290 String node_id = request_node.getAttribute(GSXML.NODE_ID_ATT);
291 record_id_list.append(node_id);
292 record_id_list.append(",");
293 }
294
295 // do the query to the iVia server
296 String url_string = ivia_server_url+"/cgi-bin/view_record_set?theme=gsdl3&record_id_list="+record_id_list.toString()+"&field_list="+field_list.toString();
297
298 Element node_list = this.doc.createElement(GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
299 result.appendChild(node_list);
300 try {
301 BufferedReader reader = makeConnection(url_string);
302 String line;
303 while ((line = reader.readLine()) != null) {
304 if (!line.startsWith("Record:")) {
305 continue;
306 }
307 // the first line is the record
308 line=line.substring(8);
309 Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
310 doc_node.setAttribute(GSXML.NODE_ID_ATT, line);
311 Element meta_list = this.doc.createElement(GSXML.METADATA_ELEM+GSXML.LIST_MODIFIER);
312 doc_node.appendChild(meta_list);
313 while ((line = reader.readLine()) != null) {
314 //metadata entry
315 int col_pos = line.indexOf(':');
316 if (col_pos == -1) {
317 // end of the metadata for this doc
318 break;
319 }
320 String name = line.substring(0,col_pos);
321 String value = line.substring(col_pos+2); // includes a space
322 GSXML.addMetadata(this.doc, meta_list, name, value);
323 }
324 node_list.appendChild(doc_node);
325
326 }
327 } catch (Exception e) {
328 System.err.println("IViaProxy Error:exception happened");
329 e.printStackTrace();
330 }
331 ///ystem.out.println("IViaProxy: returning result: ");
332 ///ystem.out.println(this.converter.getPrettyString(result));
333 return result;
334
335 }
336
337 protected Element processDocumentContentRetrieve(Element request) {
338 Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
339 result.setAttribute(GSXML.FROM_ATT, DOC_CONTENT_SERVICE);
340 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
341
342 // Get the request doc_list
343 Element query_doc_list = (Element) GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
344 if (query_doc_list == null) {
345 System.err.println("IViaProxy Error: DocumentContentRetrieve request specified no doc nodes.\n");
346 return result;
347 }
348
349 Element doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
350 result.appendChild(doc_list);
351
352 // Get the documents
353 String[] doc_ids = GSXML.getAttributeValuesFromList(query_doc_list,
354 GSXML.NODE_ID_ATT);
355 for (int i = 0; i < doc_ids.length; i++) {
356 String doc_id = doc_ids[i];
357 Element doc_node = getDocument(doc_id);
358 doc_list.appendChild(doc_node);
359 }
360 return result;
361
362 }
363
364
365 /** gets a document by sending a request to iVia, then processes it and creates a documentNode around the text */
366 protected Element getDocument(String doc_id) {
367
368 String url_string = ivia_server_url+"/cgi-bin/view_record?theme=gsdl3&record_id="+doc_id;
369 StringBuffer buffer = new StringBuffer();
370 try {
371 BufferedReader reader = makeConnection(url_string);
372
373 String line;
374 while((line = reader.readLine())!= null) {
375 buffer.append(line);
376 }
377
378
379 } catch (Exception e) {
380 System.err.println("IViaProxy Error:exception happened");
381 e.printStackTrace();
382 }
383
384 String node_content = buffer.toString();
385 String escaped_content = GSXML.xmlSafe(node_content);
386
387 StringBuffer processed_content = new StringBuffer(escaped_content.length());
388 processed_content.append("<nodeContent>");
389 int pos = 0;
390 int lastpos = 0;
391 while ((pos = escaped_content.indexOf("&lt;a ", lastpos))!= -1) {
392 processed_content.append(escaped_content.substring(lastpos, pos));
393 int endpos = escaped_content.indexOf("&lt;/a&gt;", pos);
394 if (endpos == -1) {
395 break;
396 }
397 String link = escaped_content.substring(pos, endpos+10);
398 link = convertLink(link);
399 processed_content.append(link);
400 lastpos = endpos+10;
401 }
402 processed_content.append(escaped_content.substring(lastpos)); // get the last bit
403 processed_content.append("</nodeContent>");
404
405 Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
406 doc_node.setAttribute(GSXML.NODE_ID_ATT, doc_id);
407
408 Element content_element = this.converter.getDOM(processed_content.toString()).getDocumentElement();
409 doc_node.appendChild(this.doc.importNode(content_element, true));
410
411 return doc_node;
412
413 }
414
415 /** converts a url from an <a> element into a greenstone suitable one */
416 protected String convertLink(String aref) {
417
418 if (aref.indexOf("href=&quot;http") != -1) {
419 return aref; // an external link
420 }
421 String type = "other";
422 if (aref.indexOf("/cgi-bin/canned_search")!=-1) {
423 type="query";
424 } else if (aref.indexOf("/cgi-bin/click_through") != -1) {
425 type = "external";
426 } else if (aref.indexOf("/cgi-bin/view_record") != -1) {
427 type="document";
428 }
429
430 int href_start = aref.indexOf("href=&quot;")+11;
431 int href_end = aref.indexOf("&gt;", href_start);
432 String href = aref.substring(href_start, href_end);
433 String link_content = aref.substring(href_end+4, aref.length()-10);
434
435 if (type.equals("external")) {
436 // the external link is everything after the http at the end.
437 String address = href.substring(href.lastIndexOf("http"));
438 address = address.replaceAll("%3[aA]", ":");
439 address = address.replaceAll("%2[fF]", "/");
440
441 return "&lt;a href=\""+address+"\"&gt;"+link_content+"&lt;/a&gt;";
442 }
443 if (type.equals("other")) {
444 return "other type of link ("+link_content+")";
445 }
446 StringBuffer result = new StringBuffer();
447 result.append("<link type='");
448 result.append(type);
449 result.append("'");
450 if (type.equals("query")) {
451 result.append(" service='TextQuery'");
452 }
453 result.append(">");
454 // add in the parameters
455 href = href.substring(href.indexOf("?")+1);
456 String [] params = href.split("&amp;");
457 for (int i=0; i<params.length; i++) {
458 String param = params[i];
459 int eq_pos = param.indexOf("=");
460 if (eq_pos != -1) {
461
462 result.append("<param name='"+param.substring(0, eq_pos)+"' value='"+param.substring(eq_pos+1)+"'/>");
463 }
464 }
465 result.append(link_content);
466 result.append("</link>");
467
468 return result.toString();
469
470 }
471
472 // iVia craps out if we ask for a metadata which is not valid. So need
473 // to make sure we only ask for acceptable fields.
474 protected boolean isAcceptableMetadata(String meta) {
475 String valid_metadata = ",title,url,ivia_description,keywords,subjects,";
476 if (valid_metadata.indexOf(","+meta+",")!=-1) {
477 return true;
478 }
479 return false;
480 }
481 protected BufferedReader makeConnection(String url_string) {
482 BufferedReader reader = null;
483 try {
484 URL url = new URL(url_string);
485 HttpURLConnection connection = (HttpURLConnection)url.openConnection();
486 InputStream input = connection.getInputStream();
487 reader = new BufferedReader(new InputStreamReader(input));
488 } catch (java.net.MalformedURLException e) {
489
490 System.err.println("IViaProxy Error: Malformed URL: "+url_string);
491 } catch (java.io.IOException e) {
492 System.err.println("IViaProxy Error: An error occurred during IO to url "+url_string);
493 }
494 return reader;
495 }
496
497}
Note: See TracBrowser for help on using the repository browser.