source: trunk/gsdl3/src/java/org/greenstone/gsdl3/service/IViaProxy.java@ 9874

Last change on this file since 9874 was 9874, checked in by kjdon, 19 years ago

merged from branch ant-install-branch: merge 1

  • Property svn:keywords set to Author Date Id Revision
File size: 18.4 KB
Line 
1package org.greenstone.gsdl3.service;
2
// Greenstone classes
import org.greenstone.gsdl3.util.*;

// XML classes
import org.w3c.dom.Element;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

import java.util.HashMap;
import java.io.File;
import java.io.InputStream;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.URLConnection;
import java.net.URL;
import java.net.URLEncoder;
import java.net.Authenticator;
import java.net.MalformedURLException;
22
23/**
24 *
25 * @author <a href="mailto:[email protected]">Katherine Don</a>
26 * @version $Revision: 9874 $
27 */
28
29public class IViaProxy
30 extends ServiceRack {
31
32 // the services on offer
33 // these strings must match what is found in the properties file
34 protected static final String TEXT_QUERY_SERVICE = "TextQuery";
35 protected static final String DOC_CONTENT_SERVICE = "DocumentContentRetrieve";
36 protected static final String DOC_META_SERVICE = "DocumentMetadataRetrieve";
37 protected static final String QUERY_PARAM = "query";
38 protected static final String FIELD_PARAM = "fields";
39 // have standard gs param names for hits per page, and start page
40 // these need to be mapped to iVia params
41 protected static final String GS_HITS_PARAM = "hitsPerPage";
42 protected static final String IM_HITS_PARAM = "no_of_records_per_page";
43 protected static final String GS_START_PAGE_PARAM = "startPage";
44 protected static final String IM_START_PAGE_PARAM = "start_page_no";
45
46 protected String ivia_server_url = null;
47
48 public boolean configure(Element info, Element extra_info) {
49
50 Element server_elem = (Element)GSXML.getChildByTagName(info, "iViaServer");
51 if (server_elem == null) {
52 System.err.println("IViaProxy.configure error: no iViaServer element found");
53 return false;
54 }
55 ivia_server_url = server_elem.getAttribute("url");
56 if (ivia_server_url.equals("")) {
57 System.err.println("IViaProxy.configure error: no url for the iViaServer element");
58 return false;
59 }
60 Element tq_service = this.doc.createElement(GSXML.SERVICE_ELEM);
61 tq_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_QUERY);
62 tq_service.setAttribute(GSXML.NAME_ATT, TEXT_QUERY_SERVICE);
63 this.short_service_info.appendChild(tq_service);
64
65 Element dc_service = this.doc.createElement(GSXML.SERVICE_ELEM);
66 dc_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
67 dc_service.setAttribute(GSXML.NAME_ATT, DOC_CONTENT_SERVICE);
68 this.short_service_info.appendChild(dc_service);
69
70 Element dm_service = this.doc.createElement(GSXML.SERVICE_ELEM);
71 dm_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
72 dm_service.setAttribute(GSXML.NAME_ATT, DOC_META_SERVICE);
73 this.short_service_info.appendChild(dm_service);
74
75 //
76 // add some format info to service map if there is any
77 String path = GSPath.appendLink(GSXML.SEARCH_ELEM, GSXML.FORMAT_ELEM);
78 Element format = (Element) GSXML.getNodeByPath(extra_info, path);
79 if (format != null) {
80 this.format_info_map.put(TEXT_QUERY_SERVICE, this.doc.importNode(format, true));
81 }
82
83
84 // look for document display format
85 path = GSPath.appendLink(GSXML.DISPLAY_ELEM, GSXML.FORMAT_ELEM);
86 Element display_format = (Element)GSXML.getNodeByPath(extra_info, path);
87 if (display_format != null) {
88 this.format_info_map.put(DOC_CONTENT_SERVICE, this.doc.importNode(display_format, true));
89 // shoudl we make a copy?
90 }
91
92 return true;
93
94 }
95
96 protected Element getServiceDescription(String service, String lang, String subset) {
97
98 if (service.equals(TEXT_QUERY_SERVICE)) {
99 Element tq_service = this.doc.createElement(GSXML.SERVICE_ELEM);
100 tq_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_QUERY);
101 tq_service.setAttribute(GSXML.NAME_ATT, TEXT_QUERY_SERVICE);
102 if (subset == null || subset.equals(GSXML.DISPLAY_TEXT_ELEM + GSXML.LIST_MODIFIER)) {
103 tq_service.appendChild(GSXML.createDisplayTextElement(this.doc, GSXML.DISPLAY_TEXT_NAME, getTextString(TEXT_QUERY_SERVICE+".name", lang)));
104 tq_service.appendChild(GSXML.createDisplayTextElement(this.doc, GSXML.DISPLAY_TEXT_SUBMIT, getTextString(TEXT_QUERY_SERVICE+".submit", lang)));
105 tq_service.appendChild(GSXML.createDisplayTextElement(this.doc, GSXML.DISPLAY_TEXT_DESCRIPTION, getTextString(TEXT_QUERY_SERVICE+".description", lang)));
106 }
107 if (subset == null || subset.equals(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER)) {
108 Element param_list = this.doc.createElement(GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
109 tq_service.appendChild(param_list);
110 Element param = GSXML.createParameterDescription(this.doc, QUERY_PARAM, getTextString("param."+QUERY_PARAM, lang), GSXML.PARAM_TYPE_STRING, null, null, null);
111 param_list.appendChild(param);
112 String [] field_ids = {"kw", "au", "su", "ti", "de", "fu"};
113 String [] field_names = {
114 getTextString("param."+FIELD_PARAM+".kw", lang),
115 getTextString("param."+FIELD_PARAM+".au", lang),
116 getTextString("param."+FIELD_PARAM+".su", lang),
117 getTextString("param."+FIELD_PARAM+".ti", lang),
118 getTextString("param."+FIELD_PARAM+".de", lang),
119 getTextString("param."+FIELD_PARAM+".fu", lang) };
120
121 param = GSXML.createParameterDescription(this.doc, FIELD_PARAM, getTextString("param."+FIELD_PARAM, lang), GSXML.PARAM_TYPE_ENUM_MULTI, "kw,au,su,ti,de,fu", field_ids, field_names);
122 param_list.appendChild(param);
123
124
125 String [] hits_options = {"10", "30", "50"};
126 param = GSXML.createParameterDescription(this.doc, GS_HITS_PARAM, getTextString("param."+GS_HITS_PARAM, lang), GSXML.PARAM_TYPE_ENUM_SINGLE, "10", hits_options, hits_options);
127 param_list.appendChild(param);
128
129 param = GSXML.createParameterDescription(this.doc, GS_START_PAGE_PARAM, "", GSXML.PARAM_TYPE_INVISIBLE, "1", null, null);
130 param_list.appendChild(param);
131 }
132 return tq_service;
133 }
134 if (service.equals(DOC_META_SERVICE)) {
135 Element dm_service = this.doc.createElement(GSXML.SERVICE_ELEM);
136 dm_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
137 dm_service.setAttribute(GSXML.NAME_ATT, DOC_META_SERVICE);
138 return dm_service;
139
140 }
141 if (service.equals(DOC_CONTENT_SERVICE)) {
142 Element dc_service = this.doc.createElement(GSXML.SERVICE_ELEM);
143 dc_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
144 dc_service.setAttribute(GSXML.NAME_ATT, DOC_CONTENT_SERVICE);
145 return dc_service;
146
147
148 }
149 return null;
150 }
151
152 /** Process a text query - implemented by concrete subclasses */
153 protected Element processTextQuery(Element request) {
154
155 // Create a new (empty) result message
156 Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
157 result.setAttribute(GSXML.FROM_ATT, TEXT_QUERY_SERVICE);
158 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
159 Element doc_node_list = this.doc.createElement(GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
160 result.appendChild(doc_node_list);
161
162
163 Element param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
164 if (param_list == null) {
165 System.err.println("IViaProxy Error:: TextQuery request had no paramList.");
166 return result; // Return the empty result
167 }
168
169 // Process the request parameters
170 HashMap params = GSXML.extractParams(param_list, false);
171
172 // Make sure a query has been specified
173 String query = (String) params.get(QUERY_PARAM);
174 if (query == null || query.equals("")) {
175 return result; // Return the empty result
176 }
177 // tidy whitespace
178 query = query.replaceAll("\\s+", "+");
179 String url_string = ivia_server_url+"/cgi-bin/canned_search?theme=gsdl3&query="+query;
180
181 // check for fields
182 String fields = (String) params.get(FIELD_PARAM);
183 if (fields != null && !fields.equals("")) {
184 url_string += "&fields="+fields;
185 }
186
187 //check for hits per page
188 String hits_per_page = (String) params.get(GS_HITS_PARAM);
189 if (hits_per_page != null && !hits_per_page.equals("")) {
190 url_string += "&"+IM_HITS_PARAM+"="+hits_per_page;
191 }
192
193 // check for start page
194 String start_page = (String) params.get(GS_START_PAGE_PARAM);
195 if (start_page != null && !start_page.equals("")) {
196 url_string += "&"+IM_START_PAGE_PARAM+"="+start_page;
197 }
198 String results_num = null;
199 String doc_ids = null;
200 try {
201 ///ystem.err.println("IViaProxy, sending "+url_string);
202 BufferedReader reader = makeConnection(url_string);
203 results_num = reader.readLine();
204 doc_ids = reader.readLine();
205
206 } catch (Exception e) {
207 System.err.println("IViaProxy.TextQuery Error: exception happened during query");
208 e.printStackTrace();
209 return result;
210 }
211
212 if (results_num.startsWith("Resources: ")) {
213 results_num = results_num.substring(11);
214 } else {
215 System.err.println("IViaProxy.TextQuery Error: badly formatted results line: "+results_num);
216 return result;
217 }
218 if (doc_ids.startsWith("Ids: ")) {
219 doc_ids = doc_ids.substring(5).trim();
220 } else {
221 System.err.println("IViaProxy.TextQuery Error: badly formatted docs line: "+doc_ids);
222 return result;
223 }
224
225 // get the num docs and add to a metadata list
226 Element metadata_list = this.doc.createElement(GSXML.METADATA_ELEM+GSXML.LIST_MODIFIER);
227 result.appendChild(metadata_list);
228
229 // Add a metadata element specifying the number of matching documents
230 long numdocs = Long.parseLong(results_num);
231 GSXML.addMetadata(this.doc, metadata_list, "numDocsMatched", ""+numdocs);
232 String [] ids = doc_ids.split(" ");
233
234 for (int d=0; d<ids.length; d++) {
235 Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
236 doc_node.setAttribute(GSXML.NODE_ID_ATT, ids[d]);
237 doc_node_list.appendChild(doc_node);
238 }
239 ///ystem.err.println("IViaProxy result:");
240 ///ystem.err.println(this.converter.getString(result));
241 return result;
242
243 }
244
245 protected Element processDocumentMetadataRetrieve(Element request) {
246 Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
247 result.setAttribute(GSXML.FROM_ATT, DOC_META_SERVICE);
248 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
249
250 // Get the parameters of the request
251 Element param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
252 if (param_list == null) {
253 System.err.println("IViaProxy.DocumentMetadataRetrieve Error: missing paramList.\n");
254 return result; // Return the empty result
255 }
256
257 // The metadata information required
258 StringBuffer field_list = new StringBuffer();
259 Element param = (Element) param_list.getFirstChild();
260 while (param != null) {
261 // Identify the metadata information desired
262 if (param.getAttribute(GSXML.NAME_ATT).equals("metadata")) {
263 String metadata = GSXML.getValue(param);
264 if (isAcceptableMetadata(metadata)) {
265 field_list.append(metadata);
266 field_list.append(",");
267 }
268 }
269 param = (Element) param.getNextSibling();
270 }
271
272 if (field_list.length()==0) {
273 System.err.println("IViaProxy.DocumentMetadataRetrieve Error: no metadata specified.\n");
274 return result;
275 }
276
277 // Get the documents
278 Element request_node_list = (Element) GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
279 if (request_node_list == null) {
280 System.err.println("IViaProxy Error: DocumentMetadataRetrieve request had no "+GSXML.DOC_NODE_ELEM+"List.\n");
281 return result;
282 }
283
284 StringBuffer record_id_list = new StringBuffer();
285
286 NodeList request_nodes = request_node_list.getChildNodes();
287 for (int i = 0; i < request_nodes.getLength(); i++) {
288 Element request_node = (Element) request_nodes.item(i);
289 String node_id = request_node.getAttribute(GSXML.NODE_ID_ATT);
290 record_id_list.append(node_id);
291 record_id_list.append(",");
292 }
293
294 // do the query to the iVia server
295 String url_string = ivia_server_url+"/cgi-bin/view_record_set?theme=gsdl3&record_id_list="+record_id_list.toString()+"&field_list="+field_list.toString();
296
297 Element node_list = this.doc.createElement(GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
298 result.appendChild(node_list);
299 try {
300 BufferedReader reader = makeConnection(url_string);
301 String line;
302 while ((line = reader.readLine()) != null) {
303 if (!line.startsWith("Record:")) {
304 continue;
305 }
306 // the first line is the record
307 line=line.substring(8);
308 Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
309 doc_node.setAttribute(GSXML.NODE_ID_ATT, line);
310 Element meta_list = this.doc.createElement(GSXML.METADATA_ELEM+GSXML.LIST_MODIFIER);
311 doc_node.appendChild(meta_list);
312 while ((line = reader.readLine()) != null) {
313 //metadata entry
314 int col_pos = line.indexOf(':');
315 if (col_pos == -1) {
316 // end of the metadata for this doc
317 break;
318 }
319 String name = line.substring(0,col_pos);
320 String value = line.substring(col_pos+2); // includes a space
321 GSXML.addMetadata(this.doc, meta_list, name, value);
322 }
323 node_list.appendChild(doc_node);
324
325 }
326 } catch (Exception e) {
327 System.err.println("IViaProxy Error:exception happened");
328 e.printStackTrace();
329 }
330 ///ystem.out.println("IViaProxy: returning result: ");
331 ///ystem.out.println(this.converter.getPrettyString(result));
332 return result;
333
334 }
335
336 protected Element processDocumentContentRetrieve(Element request) {
337 Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
338 result.setAttribute(GSXML.FROM_ATT, DOC_CONTENT_SERVICE);
339 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
340
341 // Get the request doc_list
342 Element query_doc_list = (Element) GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
343 if (query_doc_list == null) {
344 System.err.println("IViaProxy Error: DocumentContentRetrieve request specified no doc nodes.\n");
345 return result;
346 }
347
348 Element doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
349 result.appendChild(doc_list);
350
351 // Get the documents
352 String[] doc_ids = GSXML.getAttributeValuesFromList(query_doc_list,
353 GSXML.NODE_ID_ATT);
354 for (int i = 0; i < doc_ids.length; i++) {
355 String doc_id = doc_ids[i];
356 Element doc_node = getDocument(doc_id);
357 doc_list.appendChild(doc_node);
358 }
359 return result;
360
361 }
362
363
364 /** gets a document by sending a request to iVia, then processes it and creates a documentNode around the text */
365 protected Element getDocument(String doc_id) {
366
367 String url_string = ivia_server_url+"/cgi-bin/view_record?theme=gsdl3&record_id="+doc_id;
368 StringBuffer buffer = new StringBuffer();
369 try {
370 BufferedReader reader = makeConnection(url_string);
371
372 String line;
373 while((line = reader.readLine())!= null) {
374 buffer.append(line);
375 }
376
377
378 } catch (Exception e) {
379 System.err.println("IViaProxy Error:exception happened");
380 e.printStackTrace();
381 }
382
383 String node_content = buffer.toString();
384 String escaped_content = GSXML.xmlSafe(node_content);
385
386 StringBuffer processed_content = new StringBuffer(escaped_content.length());
387 processed_content.append("<nodeContent>");
388 int pos = 0;
389 int lastpos = 0;
390 while ((pos = escaped_content.indexOf("&lt;a ", lastpos))!= -1) {
391 processed_content.append(escaped_content.substring(lastpos, pos));
392 int endpos = escaped_content.indexOf("&lt;/a&gt;", pos);
393 if (endpos == -1) {
394 break;
395 }
396 String link = escaped_content.substring(pos, endpos+10);
397 link = convertLink(link);
398 processed_content.append(link);
399 lastpos = endpos+10;
400 }
401 processed_content.append(escaped_content.substring(lastpos)); // get the last bit
402 processed_content.append("</nodeContent>");
403
404 Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
405 doc_node.setAttribute(GSXML.NODE_ID_ATT, doc_id);
406
407 Document content_doc = this.converter.getDOM(processed_content.toString());
408 if (content_doc != null) {
409 Element content_element = content_doc.getDocumentElement();
410 doc_node.appendChild(this.doc.importNode(content_element, true));
411 } else {
412 System.err.println("IViaProxy.getDocument Error: Couldn't parse the node content");
413 }
414 return doc_node;
415
416 }
417
418 /** converts a url from an <a> element into a greenstone suitable one */
419 protected String convertLink(String aref) {
420
421 if (aref.indexOf("href=&quot;http") != -1) {
422 return aref; // an external link
423 }
424 String type = "other";
425 if (aref.indexOf("/cgi-bin/canned_search")!=-1) {
426 type="query";
427 } else if (aref.indexOf("/cgi-bin/click_through") != -1) {
428 type = "external";
429 } else if (aref.indexOf("/cgi-bin/view_record") != -1) {
430 type="document";
431 }
432
433 int href_start = aref.indexOf("href=&quot;")+11;
434 int href_end = aref.indexOf("&gt;", href_start);
435 String href = aref.substring(href_start, href_end);
436 String link_content = aref.substring(href_end+4, aref.length()-10);
437
438 if (type.equals("external")) {
439 // the external link is everything after the http at the end.
440 String address = href.substring(href.lastIndexOf("http"));
441 address = address.replaceAll("%3[aA]", ":");
442 address = address.replaceAll("%2[fF]", "/");
443
444 return "&lt;a href=\""+address+"\"&gt;"+link_content+"&lt;/a&gt;";
445 }
446 if (type.equals("other")) {
447 return "other type of link ("+link_content+")";
448 }
449 StringBuffer result = new StringBuffer();
450 result.append("<link type='");
451 result.append(type);
452 result.append("'");
453 if (type.equals("query")) {
454 result.append(" service='TextQuery'");
455 }
456 result.append(">");
457 // add in the parameters
458 href = href.substring(href.indexOf("?")+1);
459 String [] params = href.split("&amp;");
460 for (int i=0; i<params.length; i++) {
461 String param = params[i];
462 int eq_pos = param.indexOf("=");
463 if (eq_pos != -1) {
464
465 result.append("<param name='"+param.substring(0, eq_pos)+"' value='"+param.substring(eq_pos+1)+"'/>");
466 }
467 }
468 result.append(link_content);
469 result.append("</link>");
470
471 return result.toString();
472
473 }
474
475 // iVia craps out if we ask for a metadata which is not valid. So need
476 // to make sure we only ask for acceptable fields.
477 protected boolean isAcceptableMetadata(String meta) {
478 String valid_metadata = ",title,url,ivia_description,keywords,subjects,";
479 if (valid_metadata.indexOf(","+meta+",")!=-1) {
480 return true;
481 }
482 return false;
483 }
484 protected BufferedReader makeConnection(String url_string) {
485 BufferedReader reader = null;
486 try {
487 URL url = new URL(url_string);
488 HttpURLConnection connection = (HttpURLConnection)url.openConnection();
489 InputStream input = connection.getInputStream();
490 reader = new BufferedReader(new InputStreamReader(input));
491 } catch (java.net.MalformedURLException e) {
492
493 System.err.println("IViaProxy Error: Malformed URL: "+url_string);
494 } catch (java.io.IOException e) {
495 System.err.println("IViaProxy Error: An error occurred during IO to url "+url_string);
496 }
497 return reader;
498 }
499
500}
Note: See TracBrowser for help on using the repository browser.