source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/service/IViaProxy.java@ 25635

Last change on this file since 25635 was 25635, checked in by sjm84, 12 years ago

Fixing Greenstone 3's use (or lack thereof) of generics, this was done automatically so we may want to change it over time. This change will also auto-format any files that have not already been formatted.

  • Property svn:keywords set to Author Date Id Revision
File size: 18.2 KB
Line 
1package org.greenstone.gsdl3.service;
2
3// Greenstone classes
4import org.greenstone.gsdl3.util.*;
5
6// XML classes
7import org.w3c.dom.Element;
8import org.w3c.dom.Document;
9import org.w3c.dom.NodeList;
10
11import java.util.HashMap;
12import java.io.File;
13import java.io.InputStream;
14import java.io.BufferedReader;
15import java.io.InputStreamReader;
16import java.io.IOException;
17import java.io.Serializable;
18import java.net.HttpURLConnection;
19import java.net.URLConnection;
20import java.net.URL;
21import java.net.Authenticator;
22import java.net.MalformedURLException;
23
24import org.apache.log4j.*;
25
26/**
27 *
28 * @author <a href="mailto:[email protected]">Katherine Don</a>
29 * @version $Revision: 25635 $
30 */
31
32public class IViaProxy
33 extends ServiceRack {
34
 // log4j logger shared by every instance of this class
35 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.service.IViaProxy.class.getName());
36
37 // the services on offer
38 // these strings must match what is found in the properties file
 // (they are also the names advertised in configure() and matched in getServiceDescription())
39 protected static final String TEXT_QUERY_SERVICE = "TextQuery";
40 protected static final String DOC_CONTENT_SERVICE = "DocumentContentRetrieve";
41 protected static final String DOC_META_SERVICE = "DocumentMetadataRetrieve";
 // names of the request parameters read by processTextQuery()
42 protected static final String QUERY_PARAM = "query";
43 protected static final String FIELD_PARAM = "fields";
44 // have standard gs param names for hits per page, and start page
45 // these need to be mapped to iVia params
 // GS_* is the name read from the incoming request, IM_* is the name
 // written into the URL that is sent to the iVia server
46 protected static final String GS_HITS_PARAM = "hitsPerPage";
47 protected static final String IM_HITS_PARAM = "no_of_records_per_page";
48 protected static final String GS_START_PAGE_PARAM = "startPage";
49 protected static final String IM_START_PAGE_PARAM = "start_page_no";
50
 // base URL of the iVia server; null until configure() reads it from the
 // url attribute of the iViaServer element
51 protected String ivia_server_url = null;
52
53 public boolean configure(Element info, Element extra_info) {
54
55 if (!super.configure(info, extra_info)){
56 return false;
57 }
58
59 Element server_elem = (Element)GSXML.getChildByTagName(info, "iViaServer");
60 if (server_elem == null) {
61 logger.error("no iViaServer element found");
62 return false;
63 }
64 ivia_server_url = server_elem.getAttribute("url");
65 if (ivia_server_url.equals("")) {
66 logger.error("no url for the iViaServer element");
67 return false;
68 }
69 Element tq_service = this.doc.createElement(GSXML.SERVICE_ELEM);
70 tq_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_QUERY);
71 tq_service.setAttribute(GSXML.NAME_ATT, TEXT_QUERY_SERVICE);
72 this.short_service_info.appendChild(tq_service);
73
74 Element dc_service = this.doc.createElement(GSXML.SERVICE_ELEM);
75 dc_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
76 dc_service.setAttribute(GSXML.NAME_ATT, DOC_CONTENT_SERVICE);
77 this.short_service_info.appendChild(dc_service);
78
79 Element dm_service = this.doc.createElement(GSXML.SERVICE_ELEM);
80 dm_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
81 dm_service.setAttribute(GSXML.NAME_ATT, DOC_META_SERVICE);
82 this.short_service_info.appendChild(dm_service);
83
84 //
85 // add some format info to service map if there is any
86 String path = GSPath.appendLink(GSXML.SEARCH_ELEM, GSXML.FORMAT_ELEM);
87 Element format = (Element) GSXML.getNodeByPath(extra_info, path);
88 if (format != null) {
89 this.format_info_map.put(TEXT_QUERY_SERVICE, this.doc.importNode(format, true));
90 }
91
92
93 // look for document display format
94 path = GSPath.appendLink(GSXML.DISPLAY_ELEM, GSXML.FORMAT_ELEM);
95 Element display_format = (Element)GSXML.getNodeByPath(extra_info, path);
96 if (display_format != null) {
97 this.format_info_map.put(DOC_CONTENT_SERVICE, this.doc.importNode(display_format, true));
98 // shoudl we make a copy?
99 }
100
101 return true;
102
103 }
104
105 protected Element getServiceDescription(String service, String lang, String subset) {
106
107 if (service.equals(TEXT_QUERY_SERVICE)) {
108 Element tq_service = this.doc.createElement(GSXML.SERVICE_ELEM);
109 tq_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_QUERY);
110 tq_service.setAttribute(GSXML.NAME_ATT, TEXT_QUERY_SERVICE);
111 if (subset == null || subset.equals(GSXML.DISPLAY_TEXT_ELEM + GSXML.LIST_MODIFIER)) {
112 tq_service.appendChild(GSXML.createDisplayTextElement(this.doc, GSXML.DISPLAY_TEXT_NAME, getTextString(TEXT_QUERY_SERVICE+".name", lang)));
113 tq_service.appendChild(GSXML.createDisplayTextElement(this.doc, GSXML.DISPLAY_TEXT_SUBMIT, getTextString(TEXT_QUERY_SERVICE+".submit", lang)));
114 tq_service.appendChild(GSXML.createDisplayTextElement(this.doc, GSXML.DISPLAY_TEXT_DESCRIPTION, getTextString(TEXT_QUERY_SERVICE+".description", lang)));
115 }
116 if (subset == null || subset.equals(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER)) {
117 Element param_list = this.doc.createElement(GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
118 tq_service.appendChild(param_list);
119 Element param = GSXML.createParameterDescription(this.doc, QUERY_PARAM, getTextString("param."+QUERY_PARAM, lang), GSXML.PARAM_TYPE_STRING, null, null, null);
120 param_list.appendChild(param);
121 String [] field_ids = {"kw", "au", "su", "ti", "de", "fu"};
122 String [] field_names = {
123 getTextString("param."+FIELD_PARAM+".kw", lang),
124 getTextString("param."+FIELD_PARAM+".au", lang),
125 getTextString("param."+FIELD_PARAM+".su", lang),
126 getTextString("param."+FIELD_PARAM+".ti", lang),
127 getTextString("param."+FIELD_PARAM+".de", lang),
128 getTextString("param."+FIELD_PARAM+".fu", lang) };
129
130 param = GSXML.createParameterDescription(this.doc, FIELD_PARAM, getTextString("param."+FIELD_PARAM, lang), GSXML.PARAM_TYPE_ENUM_MULTI, "kw,au,su,ti,de,fu", field_ids, field_names);
131 param_list.appendChild(param);
132
133
134 String [] hits_options = {"10", "30", "50"};
135 param = GSXML.createParameterDescription(this.doc, GS_HITS_PARAM, getTextString("param."+GS_HITS_PARAM, lang), GSXML.PARAM_TYPE_ENUM_SINGLE, "10", hits_options, hits_options);
136 param_list.appendChild(param);
137
138 param = GSXML.createParameterDescription(this.doc, GS_START_PAGE_PARAM, "", GSXML.PARAM_TYPE_INVISIBLE, "1", null, null);
139 param_list.appendChild(param);
140 }
141 return tq_service;
142 }
143 if (service.equals(DOC_META_SERVICE)) {
144 Element dm_service = this.doc.createElement(GSXML.SERVICE_ELEM);
145 dm_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
146 dm_service.setAttribute(GSXML.NAME_ATT, DOC_META_SERVICE);
147 return dm_service;
148
149 }
150 if (service.equals(DOC_CONTENT_SERVICE)) {
151 Element dc_service = this.doc.createElement(GSXML.SERVICE_ELEM);
152 dc_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
153 dc_service.setAttribute(GSXML.NAME_ATT, DOC_CONTENT_SERVICE);
154 return dc_service;
155
156
157 }
158 return null;
159 }
160
161 /** Process a text query - implemented by concrete subclasses */
162 protected Element processTextQuery(Element request) {
163
164 // Create a new (empty) result message
165 Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
166 result.setAttribute(GSXML.FROM_ATT, TEXT_QUERY_SERVICE);
167 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
168 Element doc_node_list = this.doc.createElement(GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
169 result.appendChild(doc_node_list);
170
171
172 Element param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
173 if (param_list == null) {
174 logger.error("TextQuery request had no paramList.");
175 return result; // Return the empty result
176 }
177
178 // Process the request parameters
179 HashMap<String, Serializable> params = GSXML.extractParams(param_list, false);
180
181 // Make sure a query has been specified
182 String query = (String) params.get(QUERY_PARAM);
183 if (query == null || query.equals("")) {
184 return result; // Return the empty result
185 }
186 // tidy whitespace
187 query = query.replaceAll("\\s+", "+");
188 String url_string = ivia_server_url+"/cgi-bin/canned_search?theme=gsdl3&query="+query;
189
190 // check for fields
191 String fields = (String) params.get(FIELD_PARAM);
192 if (fields != null && !fields.equals("")) {
193 url_string += "&fields="+fields;
194 }
195
196 //check for hits per page
197 String hits_per_page = (String) params.get(GS_HITS_PARAM);
198 if (hits_per_page != null && !hits_per_page.equals("")) {
199 url_string += "&"+IM_HITS_PARAM+"="+hits_per_page;
200 }
201
202 // check for start page
203 String start_page = (String) params.get(GS_START_PAGE_PARAM);
204 if (start_page != null && !start_page.equals("")) {
205 url_string += "&"+IM_START_PAGE_PARAM+"="+start_page;
206 }
207 String results_num = null;
208 String doc_ids = null;
209 try {
210 logger.debug("IViaProxy, sending "+url_string);
211 BufferedReader reader = makeConnection(url_string);
212 results_num = reader.readLine();
213 doc_ids = reader.readLine();
214
215 } catch (Exception e) {
216 logger.error("exception happened during query");
217 e.printStackTrace();
218 return result;
219 }
220
221 if (results_num.startsWith("Resources: ")) {
222 results_num = results_num.substring(11);
223 } else {
224 logger.error("badly formatted results line: "+results_num);
225 return result;
226 }
227 if (doc_ids.startsWith("Ids: ")) {
228 doc_ids = doc_ids.substring(5).trim();
229 } else {
230 logger.error("badly formatted docs line: "+doc_ids);
231 return result;
232 }
233
234 // get the num docs and add to a metadata list
235 Element metadata_list = this.doc.createElement(GSXML.METADATA_ELEM+GSXML.LIST_MODIFIER);
236 result.appendChild(metadata_list);
237
238 // Add a metadata element specifying the number of matching documents
239 long numdocs = Long.parseLong(results_num);
240 GSXML.addMetadata(this.doc, metadata_list, "numDocsMatched", ""+numdocs);
241 String [] ids = doc_ids.split(" ");
242
243 for (int d=0; d<ids.length; d++) {
244 Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
245 doc_node.setAttribute(GSXML.NODE_ID_ATT, ids[d]);
246 doc_node_list.appendChild(doc_node);
247 }
248 logger.debug("IViaProxy result:");
249 logger.debug(this.converter.getString(result));
250 return result;
251
252 }
253
254 protected Element processDocumentMetadataRetrieve(Element request) {
255 Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
256 result.setAttribute(GSXML.FROM_ATT, DOC_META_SERVICE);
257 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
258
259 // Get the parameters of the request
260 Element param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
261 if (param_list == null) {
262 logger.error("missing paramList.\n");
263 return result; // Return the empty result
264 }
265
266 // The metadata information required
267 StringBuffer field_list = new StringBuffer();
268 Element param = GSXML.getFirstElementChild(param_list);//(Element) param_list.getFirstChild();
269 while (param != null) {
270 // Identify the metadata information desired
271 if (param.getAttribute(GSXML.NAME_ATT).equals("metadata")) {
272 String metadata = GSXML.getValue(param);
273 if (isAcceptableMetadata(metadata)) {
274 field_list.append(metadata);
275 field_list.append(",");
276 }
277 }
278 param = (Element) param.getNextSibling();
279 }
280
281 if (field_list.length()==0) {
282 logger.error("no metadata specified.\n");
283 return result;
284 }
285
286 // Get the documents
287 Element request_node_list = (Element) GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
288 if (request_node_list == null) {
289 logger.error("DocumentMetadataRetrieve request had no "+GSXML.DOC_NODE_ELEM+"List.\n");
290 return result;
291 }
292
293 StringBuffer record_id_list = new StringBuffer();
294
295 NodeList request_nodes = request_node_list.getChildNodes();
296 for (int i = 0; i < request_nodes.getLength(); i++) {
297 Element request_node = (Element) request_nodes.item(i);
298 String node_id = request_node.getAttribute(GSXML.NODE_ID_ATT);
299 record_id_list.append(node_id);
300 record_id_list.append(",");
301 }
302
303 // do the query to the iVia server
304 String url_string = ivia_server_url+"/cgi-bin/view_record_set?theme=gsdl3&record_id_list="+record_id_list.toString()+"&field_list="+field_list.toString();
305
306 Element node_list = this.doc.createElement(GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
307 result.appendChild(node_list);
308 try {
309 BufferedReader reader = makeConnection(url_string);
310 String line;
311 while ((line = reader.readLine()) != null) {
312 if (!line.startsWith("Record:")) {
313 continue;
314 }
315 // the first line is the record
316 line=line.substring(8);
317 Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
318 doc_node.setAttribute(GSXML.NODE_ID_ATT, line);
319 Element meta_list = this.doc.createElement(GSXML.METADATA_ELEM+GSXML.LIST_MODIFIER);
320 doc_node.appendChild(meta_list);
321 while ((line = reader.readLine()) != null) {
322 //metadata entry
323 int col_pos = line.indexOf(':');
324 if (col_pos == -1) {
325 // end of the metadata for this doc
326 break;
327 }
328 String name = line.substring(0,col_pos);
329 String value = line.substring(col_pos+2); // includes a space
330 GSXML.addMetadata(this.doc, meta_list, name, value);
331 }
332 node_list.appendChild(doc_node);
333
334 }
335 } catch (Exception e) {
336 logger.error("exception happened");
337 e.printStackTrace();
338 }
339 logger.debug("IViaProxy: returning result: ");
340 logger.debug(this.converter.getPrettyString(result));
341 return result;
342
343 }
344
345 protected Element processDocumentContentRetrieve(Element request) {
346 Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
347 result.setAttribute(GSXML.FROM_ATT, DOC_CONTENT_SERVICE);
348 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
349
350 // Get the request doc_list
351 Element query_doc_list = (Element) GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
352 if (query_doc_list == null) {
353 logger.error("DocumentContentRetrieve request specified no doc nodes.\n");
354 return result;
355 }
356
357 Element doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
358 result.appendChild(doc_list);
359
360 // Get the documents
361 String[] doc_ids = GSXML.getAttributeValuesFromList(query_doc_list,
362 GSXML.NODE_ID_ATT);
363 for (int i = 0; i < doc_ids.length; i++) {
364 String doc_id = doc_ids[i];
365 Element doc_node = getDocument(doc_id);
366 doc_list.appendChild(doc_node);
367 }
368 return result;
369
370 }
371
372
373 /** gets a document by sending a request to iVia, then processes it and creates a documentNode around the text */
374 protected Element getDocument(String doc_id) {
375
376 String url_string = ivia_server_url+"/cgi-bin/view_record?theme=gsdl3&record_id="+doc_id;
377 StringBuffer buffer = new StringBuffer();
378 try {
379 BufferedReader reader = makeConnection(url_string);
380
381 String line;
382 while((line = reader.readLine())!= null) {
383 buffer.append(line);
384 }
385
386
387 } catch (Exception e) {
388 logger.error("exception happened");
389 e.printStackTrace();
390 }
391
392 String node_content = buffer.toString();
393 String escaped_content = GSXML.xmlSafe(node_content);
394
395 StringBuffer processed_content = new StringBuffer(escaped_content.length());
396 processed_content.append("<nodeContent>");
397 int pos = 0;
398 int lastpos = 0;
399 while ((pos = escaped_content.indexOf("&lt;a ", lastpos))!= -1) {
400 processed_content.append(escaped_content.substring(lastpos, pos));
401 int endpos = escaped_content.indexOf("&lt;/a&gt;", pos);
402 if (endpos == -1) {
403 break;
404 }
405 String link = escaped_content.substring(pos, endpos+10);
406 link = convertLink(link);
407 processed_content.append(link);
408 lastpos = endpos+10;
409 }
410 processed_content.append(escaped_content.substring(lastpos)); // get the last bit
411 processed_content.append("</nodeContent>");
412
413 Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
414 doc_node.setAttribute(GSXML.NODE_ID_ATT, doc_id);
415
416 Document content_doc = this.converter.getDOM(processed_content.toString());
417 if (content_doc != null) {
418 Element content_element = content_doc.getDocumentElement();
419 doc_node.appendChild(this.doc.importNode(content_element, true));
420 } else {
421 logger.error("Couldn't parse the node content");
422 }
423 return doc_node;
424
425 }
426
427 /** converts a url from an <a> element into a greenstone suitable one */
428 protected String convertLink(String aref) {
429
430 if (aref.indexOf("href=&quot;http") != -1) {
431 return aref; // an external link
432 }
433 String type = "other";
434 if (aref.indexOf("/cgi-bin/canned_search")!=-1) {
435 type="query";
436 } else if (aref.indexOf("/cgi-bin/click_through") != -1) {
437 type = "external";
438 } else if (aref.indexOf("/cgi-bin/view_record") != -1) {
439 type="document";
440 }
441
442 int href_start = aref.indexOf("href=&quot;")+11;
443 int href_end = aref.indexOf("&gt;", href_start);
444 String href = aref.substring(href_start, href_end);
445 String link_content = aref.substring(href_end+4, aref.length()-10);
446
447 if (type.equals("external")) {
448 // the external link is everything after the http at the end.
449 String address = href.substring(href.lastIndexOf("http"));
450 address = address.replaceAll("%3[aA]", ":");
451 address = address.replaceAll("%2[fF]", "/");
452
453 return "&lt;a href=\""+address+"\"&gt;"+link_content+"&lt;/a&gt;";
454 }
455 if (type.equals("other")) {
456 return "other type of link ("+link_content+")";
457 }
458 StringBuffer result = new StringBuffer();
459 result.append("<link type='");
460 result.append(type);
461 result.append("'");
462 if (type.equals("query")) {
463 result.append(" service='TextQuery'");
464 }
465 result.append(">");
466 // add in the parameters
467 href = href.substring(href.indexOf("?")+1);
468 String [] params = href.split("&amp;");
469 for (int i=0; i<params.length; i++) {
470 String param = params[i];
471 int eq_pos = param.indexOf("=");
472 if (eq_pos != -1) {
473
474 result.append("<param name='"+param.substring(0, eq_pos)+"' value='"+param.substring(eq_pos+1)+"'/>");
475 }
476 }
477 result.append(link_content);
478 result.append("</link>");
479
480 return result.toString();
481
482 }
483
484 // iVia craps out if we ask for a metadata which is not valid. So need
485 // to make sure we only ask for acceptable fields.
486 protected boolean isAcceptableMetadata(String meta) {
487 String valid_metadata = ",title,url,ivia_description,keywords,subjects,";
488 if (valid_metadata.indexOf(","+meta+",")!=-1) {
489 return true;
490 }
491 return false;
492 }
493 protected BufferedReader makeConnection(String url_string) {
494 BufferedReader reader = null;
495 try {
496 URL url = new URL(url_string);
497 HttpURLConnection connection = (HttpURLConnection)url.openConnection();
498 InputStream input = connection.getInputStream();
499 reader = new BufferedReader(new InputStreamReader(input));
500 } catch (java.net.MalformedURLException e) {
501
502 logger.error("Malformed URL: "+url_string);
503 } catch (java.io.IOException e) {
504 logger.error("An error occurred during IO to url "+url_string);
505 }
506 return reader;
507 }
508
509}
Note: See TracBrowser for help on using the repository browser.