source: trunk/gsdl3/src/java/org/greenstone/gsdl3/service/IViaProxy.java@ 10093

Last change on this file since 10093 was 10093, checked in by kjdon, 19 years ago

The ServiceRack class's configure method is no longer abstract so all the
subclasses should call super.configure.

  • Property svn:keywords set to Author Date Id Revision
File size: 18.5 KB
Line 
1package org.greenstone.gsdl3.service;
2
// Greenstone classes
import org.greenstone.gsdl3.util.*;

// XML classes
import org.w3c.dom.Element;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;

import java.util.HashMap;
import java.io.File;
import java.io.InputStream;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URLConnection;
import java.net.URL;
import java.net.URLEncoder;
import java.net.Authenticator;
import java.net.MalformedURLException;
22
23/**
24 *
25 * @author <a href="mailto:[email protected]">Katherine Don</a>
26 * @version $Revision: 10093 $
27 */
28
29public class IViaProxy
30 extends ServiceRack {
31
32 // the services on offer
33 // these strings must match what is found in the properties file
34 protected static final String TEXT_QUERY_SERVICE = "TextQuery";
35 protected static final String DOC_CONTENT_SERVICE = "DocumentContentRetrieve";
36 protected static final String DOC_META_SERVICE = "DocumentMetadataRetrieve";
37 protected static final String QUERY_PARAM = "query";
38 protected static final String FIELD_PARAM = "fields";
39 // have standard gs param names for hits per page, and start page
40 // these need to be mapped to iVia params
41 protected static final String GS_HITS_PARAM = "hitsPerPage";
42 protected static final String IM_HITS_PARAM = "no_of_records_per_page";
43 protected static final String GS_START_PAGE_PARAM = "startPage";
44 protected static final String IM_START_PAGE_PARAM = "start_page_no";
45
46 protected String ivia_server_url = null;
47
48 public boolean configure(Element info, Element extra_info) {
49
50 if (!super.configure(info, extra_info)){
51 return false;
52 }
53
54 Element server_elem = (Element)GSXML.getChildByTagName(info, "iViaServer");
55 if (server_elem == null) {
56 System.err.println("IViaProxy.configure error: no iViaServer element found");
57 return false;
58 }
59 ivia_server_url = server_elem.getAttribute("url");
60 if (ivia_server_url.equals("")) {
61 System.err.println("IViaProxy.configure error: no url for the iViaServer element");
62 return false;
63 }
64 Element tq_service = this.doc.createElement(GSXML.SERVICE_ELEM);
65 tq_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_QUERY);
66 tq_service.setAttribute(GSXML.NAME_ATT, TEXT_QUERY_SERVICE);
67 this.short_service_info.appendChild(tq_service);
68
69 Element dc_service = this.doc.createElement(GSXML.SERVICE_ELEM);
70 dc_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
71 dc_service.setAttribute(GSXML.NAME_ATT, DOC_CONTENT_SERVICE);
72 this.short_service_info.appendChild(dc_service);
73
74 Element dm_service = this.doc.createElement(GSXML.SERVICE_ELEM);
75 dm_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
76 dm_service.setAttribute(GSXML.NAME_ATT, DOC_META_SERVICE);
77 this.short_service_info.appendChild(dm_service);
78
79 //
80 // add some format info to service map if there is any
81 String path = GSPath.appendLink(GSXML.SEARCH_ELEM, GSXML.FORMAT_ELEM);
82 Element format = (Element) GSXML.getNodeByPath(extra_info, path);
83 if (format != null) {
84 this.format_info_map.put(TEXT_QUERY_SERVICE, this.doc.importNode(format, true));
85 }
86
87
88 // look for document display format
89 path = GSPath.appendLink(GSXML.DISPLAY_ELEM, GSXML.FORMAT_ELEM);
90 Element display_format = (Element)GSXML.getNodeByPath(extra_info, path);
91 if (display_format != null) {
92 this.format_info_map.put(DOC_CONTENT_SERVICE, this.doc.importNode(display_format, true));
93 // shoudl we make a copy?
94 }
95
96 return true;
97
98 }
99
100 protected Element getServiceDescription(String service, String lang, String subset) {
101
102 if (service.equals(TEXT_QUERY_SERVICE)) {
103 Element tq_service = this.doc.createElement(GSXML.SERVICE_ELEM);
104 tq_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_QUERY);
105 tq_service.setAttribute(GSXML.NAME_ATT, TEXT_QUERY_SERVICE);
106 if (subset == null || subset.equals(GSXML.DISPLAY_TEXT_ELEM + GSXML.LIST_MODIFIER)) {
107 tq_service.appendChild(GSXML.createDisplayTextElement(this.doc, GSXML.DISPLAY_TEXT_NAME, getTextString(TEXT_QUERY_SERVICE+".name", lang)));
108 tq_service.appendChild(GSXML.createDisplayTextElement(this.doc, GSXML.DISPLAY_TEXT_SUBMIT, getTextString(TEXT_QUERY_SERVICE+".submit", lang)));
109 tq_service.appendChild(GSXML.createDisplayTextElement(this.doc, GSXML.DISPLAY_TEXT_DESCRIPTION, getTextString(TEXT_QUERY_SERVICE+".description", lang)));
110 }
111 if (subset == null || subset.equals(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER)) {
112 Element param_list = this.doc.createElement(GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
113 tq_service.appendChild(param_list);
114 Element param = GSXML.createParameterDescription(this.doc, QUERY_PARAM, getTextString("param."+QUERY_PARAM, lang), GSXML.PARAM_TYPE_STRING, null, null, null);
115 param_list.appendChild(param);
116 String [] field_ids = {"kw", "au", "su", "ti", "de", "fu"};
117 String [] field_names = {
118 getTextString("param."+FIELD_PARAM+".kw", lang),
119 getTextString("param."+FIELD_PARAM+".au", lang),
120 getTextString("param."+FIELD_PARAM+".su", lang),
121 getTextString("param."+FIELD_PARAM+".ti", lang),
122 getTextString("param."+FIELD_PARAM+".de", lang),
123 getTextString("param."+FIELD_PARAM+".fu", lang) };
124
125 param = GSXML.createParameterDescription(this.doc, FIELD_PARAM, getTextString("param."+FIELD_PARAM, lang), GSXML.PARAM_TYPE_ENUM_MULTI, "kw,au,su,ti,de,fu", field_ids, field_names);
126 param_list.appendChild(param);
127
128
129 String [] hits_options = {"10", "30", "50"};
130 param = GSXML.createParameterDescription(this.doc, GS_HITS_PARAM, getTextString("param."+GS_HITS_PARAM, lang), GSXML.PARAM_TYPE_ENUM_SINGLE, "10", hits_options, hits_options);
131 param_list.appendChild(param);
132
133 param = GSXML.createParameterDescription(this.doc, GS_START_PAGE_PARAM, "", GSXML.PARAM_TYPE_INVISIBLE, "1", null, null);
134 param_list.appendChild(param);
135 }
136 return tq_service;
137 }
138 if (service.equals(DOC_META_SERVICE)) {
139 Element dm_service = this.doc.createElement(GSXML.SERVICE_ELEM);
140 dm_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
141 dm_service.setAttribute(GSXML.NAME_ATT, DOC_META_SERVICE);
142 return dm_service;
143
144 }
145 if (service.equals(DOC_CONTENT_SERVICE)) {
146 Element dc_service = this.doc.createElement(GSXML.SERVICE_ELEM);
147 dc_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
148 dc_service.setAttribute(GSXML.NAME_ATT, DOC_CONTENT_SERVICE);
149 return dc_service;
150
151
152 }
153 return null;
154 }
155
156 /** Process a text query - implemented by concrete subclasses */
157 protected Element processTextQuery(Element request) {
158
159 // Create a new (empty) result message
160 Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
161 result.setAttribute(GSXML.FROM_ATT, TEXT_QUERY_SERVICE);
162 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
163 Element doc_node_list = this.doc.createElement(GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
164 result.appendChild(doc_node_list);
165
166
167 Element param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
168 if (param_list == null) {
169 System.err.println("IViaProxy Error:: TextQuery request had no paramList.");
170 return result; // Return the empty result
171 }
172
173 // Process the request parameters
174 HashMap params = GSXML.extractParams(param_list, false);
175
176 // Make sure a query has been specified
177 String query = (String) params.get(QUERY_PARAM);
178 if (query == null || query.equals("")) {
179 return result; // Return the empty result
180 }
181 // tidy whitespace
182 query = query.replaceAll("\\s+", "+");
183 String url_string = ivia_server_url+"/cgi-bin/canned_search?theme=gsdl3&query="+query;
184
185 // check for fields
186 String fields = (String) params.get(FIELD_PARAM);
187 if (fields != null && !fields.equals("")) {
188 url_string += "&fields="+fields;
189 }
190
191 //check for hits per page
192 String hits_per_page = (String) params.get(GS_HITS_PARAM);
193 if (hits_per_page != null && !hits_per_page.equals("")) {
194 url_string += "&"+IM_HITS_PARAM+"="+hits_per_page;
195 }
196
197 // check for start page
198 String start_page = (String) params.get(GS_START_PAGE_PARAM);
199 if (start_page != null && !start_page.equals("")) {
200 url_string += "&"+IM_START_PAGE_PARAM+"="+start_page;
201 }
202 String results_num = null;
203 String doc_ids = null;
204 try {
205 ///ystem.err.println("IViaProxy, sending "+url_string);
206 BufferedReader reader = makeConnection(url_string);
207 results_num = reader.readLine();
208 doc_ids = reader.readLine();
209
210 } catch (Exception e) {
211 System.err.println("IViaProxy.TextQuery Error: exception happened during query");
212 e.printStackTrace();
213 return result;
214 }
215
216 if (results_num.startsWith("Resources: ")) {
217 results_num = results_num.substring(11);
218 } else {
219 System.err.println("IViaProxy.TextQuery Error: badly formatted results line: "+results_num);
220 return result;
221 }
222 if (doc_ids.startsWith("Ids: ")) {
223 doc_ids = doc_ids.substring(5).trim();
224 } else {
225 System.err.println("IViaProxy.TextQuery Error: badly formatted docs line: "+doc_ids);
226 return result;
227 }
228
229 // get the num docs and add to a metadata list
230 Element metadata_list = this.doc.createElement(GSXML.METADATA_ELEM+GSXML.LIST_MODIFIER);
231 result.appendChild(metadata_list);
232
233 // Add a metadata element specifying the number of matching documents
234 long numdocs = Long.parseLong(results_num);
235 GSXML.addMetadata(this.doc, metadata_list, "numDocsMatched", ""+numdocs);
236 String [] ids = doc_ids.split(" ");
237
238 for (int d=0; d<ids.length; d++) {
239 Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
240 doc_node.setAttribute(GSXML.NODE_ID_ATT, ids[d]);
241 doc_node_list.appendChild(doc_node);
242 }
243 ///ystem.err.println("IViaProxy result:");
244 ///ystem.err.println(this.converter.getString(result));
245 return result;
246
247 }
248
249 protected Element processDocumentMetadataRetrieve(Element request) {
250 Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
251 result.setAttribute(GSXML.FROM_ATT, DOC_META_SERVICE);
252 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
253
254 // Get the parameters of the request
255 Element param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
256 if (param_list == null) {
257 System.err.println("IViaProxy.DocumentMetadataRetrieve Error: missing paramList.\n");
258 return result; // Return the empty result
259 }
260
261 // The metadata information required
262 StringBuffer field_list = new StringBuffer();
263 Element param = (Element) param_list.getFirstChild();
264 while (param != null) {
265 // Identify the metadata information desired
266 if (param.getAttribute(GSXML.NAME_ATT).equals("metadata")) {
267 String metadata = GSXML.getValue(param);
268 if (isAcceptableMetadata(metadata)) {
269 field_list.append(metadata);
270 field_list.append(",");
271 }
272 }
273 param = (Element) param.getNextSibling();
274 }
275
276 if (field_list.length()==0) {
277 System.err.println("IViaProxy.DocumentMetadataRetrieve Error: no metadata specified.\n");
278 return result;
279 }
280
281 // Get the documents
282 Element request_node_list = (Element) GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
283 if (request_node_list == null) {
284 System.err.println("IViaProxy Error: DocumentMetadataRetrieve request had no "+GSXML.DOC_NODE_ELEM+"List.\n");
285 return result;
286 }
287
288 StringBuffer record_id_list = new StringBuffer();
289
290 NodeList request_nodes = request_node_list.getChildNodes();
291 for (int i = 0; i < request_nodes.getLength(); i++) {
292 Element request_node = (Element) request_nodes.item(i);
293 String node_id = request_node.getAttribute(GSXML.NODE_ID_ATT);
294 record_id_list.append(node_id);
295 record_id_list.append(",");
296 }
297
298 // do the query to the iVia server
299 String url_string = ivia_server_url+"/cgi-bin/view_record_set?theme=gsdl3&record_id_list="+record_id_list.toString()+"&field_list="+field_list.toString();
300
301 Element node_list = this.doc.createElement(GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
302 result.appendChild(node_list);
303 try {
304 BufferedReader reader = makeConnection(url_string);
305 String line;
306 while ((line = reader.readLine()) != null) {
307 if (!line.startsWith("Record:")) {
308 continue;
309 }
310 // the first line is the record
311 line=line.substring(8);
312 Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
313 doc_node.setAttribute(GSXML.NODE_ID_ATT, line);
314 Element meta_list = this.doc.createElement(GSXML.METADATA_ELEM+GSXML.LIST_MODIFIER);
315 doc_node.appendChild(meta_list);
316 while ((line = reader.readLine()) != null) {
317 //metadata entry
318 int col_pos = line.indexOf(':');
319 if (col_pos == -1) {
320 // end of the metadata for this doc
321 break;
322 }
323 String name = line.substring(0,col_pos);
324 String value = line.substring(col_pos+2); // includes a space
325 GSXML.addMetadata(this.doc, meta_list, name, value);
326 }
327 node_list.appendChild(doc_node);
328
329 }
330 } catch (Exception e) {
331 System.err.println("IViaProxy Error:exception happened");
332 e.printStackTrace();
333 }
334 ///ystem.out.println("IViaProxy: returning result: ");
335 ///ystem.out.println(this.converter.getPrettyString(result));
336 return result;
337
338 }
339
340 protected Element processDocumentContentRetrieve(Element request) {
341 Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
342 result.setAttribute(GSXML.FROM_ATT, DOC_CONTENT_SERVICE);
343 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
344
345 // Get the request doc_list
346 Element query_doc_list = (Element) GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
347 if (query_doc_list == null) {
348 System.err.println("IViaProxy Error: DocumentContentRetrieve request specified no doc nodes.\n");
349 return result;
350 }
351
352 Element doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
353 result.appendChild(doc_list);
354
355 // Get the documents
356 String[] doc_ids = GSXML.getAttributeValuesFromList(query_doc_list,
357 GSXML.NODE_ID_ATT);
358 for (int i = 0; i < doc_ids.length; i++) {
359 String doc_id = doc_ids[i];
360 Element doc_node = getDocument(doc_id);
361 doc_list.appendChild(doc_node);
362 }
363 return result;
364
365 }
366
367
368 /** gets a document by sending a request to iVia, then processes it and creates a documentNode around the text */
369 protected Element getDocument(String doc_id) {
370
371 String url_string = ivia_server_url+"/cgi-bin/view_record?theme=gsdl3&record_id="+doc_id;
372 StringBuffer buffer = new StringBuffer();
373 try {
374 BufferedReader reader = makeConnection(url_string);
375
376 String line;
377 while((line = reader.readLine())!= null) {
378 buffer.append(line);
379 }
380
381
382 } catch (Exception e) {
383 System.err.println("IViaProxy Error:exception happened");
384 e.printStackTrace();
385 }
386
387 String node_content = buffer.toString();
388 String escaped_content = GSXML.xmlSafe(node_content);
389
390 StringBuffer processed_content = new StringBuffer(escaped_content.length());
391 processed_content.append("<nodeContent>");
392 int pos = 0;
393 int lastpos = 0;
394 while ((pos = escaped_content.indexOf("&lt;a ", lastpos))!= -1) {
395 processed_content.append(escaped_content.substring(lastpos, pos));
396 int endpos = escaped_content.indexOf("&lt;/a&gt;", pos);
397 if (endpos == -1) {
398 break;
399 }
400 String link = escaped_content.substring(pos, endpos+10);
401 link = convertLink(link);
402 processed_content.append(link);
403 lastpos = endpos+10;
404 }
405 processed_content.append(escaped_content.substring(lastpos)); // get the last bit
406 processed_content.append("</nodeContent>");
407
408 Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
409 doc_node.setAttribute(GSXML.NODE_ID_ATT, doc_id);
410
411 Document content_doc = this.converter.getDOM(processed_content.toString());
412 if (content_doc != null) {
413 Element content_element = content_doc.getDocumentElement();
414 doc_node.appendChild(this.doc.importNode(content_element, true));
415 } else {
416 System.err.println("IViaProxy.getDocument Error: Couldn't parse the node content");
417 }
418 return doc_node;
419
420 }
421
422 /** converts a url from an <a> element into a greenstone suitable one */
423 protected String convertLink(String aref) {
424
425 if (aref.indexOf("href=&quot;http") != -1) {
426 return aref; // an external link
427 }
428 String type = "other";
429 if (aref.indexOf("/cgi-bin/canned_search")!=-1) {
430 type="query";
431 } else if (aref.indexOf("/cgi-bin/click_through") != -1) {
432 type = "external";
433 } else if (aref.indexOf("/cgi-bin/view_record") != -1) {
434 type="document";
435 }
436
437 int href_start = aref.indexOf("href=&quot;")+11;
438 int href_end = aref.indexOf("&gt;", href_start);
439 String href = aref.substring(href_start, href_end);
440 String link_content = aref.substring(href_end+4, aref.length()-10);
441
442 if (type.equals("external")) {
443 // the external link is everything after the http at the end.
444 String address = href.substring(href.lastIndexOf("http"));
445 address = address.replaceAll("%3[aA]", ":");
446 address = address.replaceAll("%2[fF]", "/");
447
448 return "&lt;a href=\""+address+"\"&gt;"+link_content+"&lt;/a&gt;";
449 }
450 if (type.equals("other")) {
451 return "other type of link ("+link_content+")";
452 }
453 StringBuffer result = new StringBuffer();
454 result.append("<link type='");
455 result.append(type);
456 result.append("'");
457 if (type.equals("query")) {
458 result.append(" service='TextQuery'");
459 }
460 result.append(">");
461 // add in the parameters
462 href = href.substring(href.indexOf("?")+1);
463 String [] params = href.split("&amp;");
464 for (int i=0; i<params.length; i++) {
465 String param = params[i];
466 int eq_pos = param.indexOf("=");
467 if (eq_pos != -1) {
468
469 result.append("<param name='"+param.substring(0, eq_pos)+"' value='"+param.substring(eq_pos+1)+"'/>");
470 }
471 }
472 result.append(link_content);
473 result.append("</link>");
474
475 return result.toString();
476
477 }
478
479 // iVia craps out if we ask for a metadata which is not valid. So need
480 // to make sure we only ask for acceptable fields.
481 protected boolean isAcceptableMetadata(String meta) {
482 String valid_metadata = ",title,url,ivia_description,keywords,subjects,";
483 if (valid_metadata.indexOf(","+meta+",")!=-1) {
484 return true;
485 }
486 return false;
487 }
488 protected BufferedReader makeConnection(String url_string) {
489 BufferedReader reader = null;
490 try {
491 URL url = new URL(url_string);
492 HttpURLConnection connection = (HttpURLConnection)url.openConnection();
493 InputStream input = connection.getInputStream();
494 reader = new BufferedReader(new InputStreamReader(input));
495 } catch (java.net.MalformedURLException e) {
496
497 System.err.println("IViaProxy Error: Malformed URL: "+url_string);
498 } catch (java.io.IOException e) {
499 System.err.println("IViaProxy Error: An error occurred during IO to url "+url_string);
500 }
501 return reader;
502 }
503
504}
Note: See TracBrowser for help on using the repository browser.