source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/service/IViaProxy.java@ 25978

Last change on this file since 25978 was 25978, checked in by sjm84, 12 years ago

Reformatting this file

  • Property svn:keywords set to Author Date Id Revision
File size: 18.1 KB
Line 
1package org.greenstone.gsdl3.service;
2
3// Greenstone classes
4import java.io.BufferedReader;
5import java.io.InputStream;
6import java.io.InputStreamReader;
7import java.io.Serializable;
8import java.net.HttpURLConnection;
9import java.net.URL;
10import java.util.HashMap;
11
12import org.apache.log4j.Logger;
13import org.greenstone.gsdl3.util.GSPath;
14import org.greenstone.gsdl3.util.GSXML;
15import org.w3c.dom.Document;
16import org.w3c.dom.Element;
17import org.w3c.dom.NodeList;
18
19/**
20 *
21 * @author Katherine Don
22 * @version $Revision: 25978 $
23 */
24
25public class IViaProxy extends ServiceRack
26{
27
// Logger shared by all instances of this service rack.
28 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.service.IViaProxy.class.getName());
29
30 // The names of the services this rack offers.
31 // These strings must match what is found in the properties file.
32 protected static final String TEXT_QUERY_SERVICE = "TextQuery";
33 protected static final String DOC_CONTENT_SERVICE = "DocumentContentRetrieve";
34 protected static final String DOC_META_SERVICE = "DocumentMetadataRetrieve";
// Names of the request parameters carrying the query text and the fields to search.
35 protected static final String QUERY_PARAM = "query";
36 protected static final String FIELD_PARAM = "fields";
37 // Greenstone has standard param names for hits per page and start page;
38 // processTextQuery maps each GS_* name to the IM_* name sent to iVia.
39 protected static final String GS_HITS_PARAM = "hitsPerPage";
40 protected static final String IM_HITS_PARAM = "no_of_records_per_page";
41 protected static final String GS_START_PAGE_PARAM = "startPage";
42 protected static final String IM_START_PAGE_PARAM = "start_page_no";
43
// Base URL of the iVia server; set by configure() from the "url" attribute
// of the iViaServer element, and prefixed to every /cgi-bin/... request.
44 protected String ivia_server_url = null;
45
46 public boolean configure(Element info, Element extra_info)
47 {
48
49 if (!super.configure(info, extra_info))
50 {
51 return false;
52 }
53
54 Element server_elem = (Element) GSXML.getChildByTagName(info, "iViaServer");
55 if (server_elem == null)
56 {
57 logger.error("no iViaServer element found");
58 return false;
59 }
60 ivia_server_url = server_elem.getAttribute("url");
61 if (ivia_server_url.equals(""))
62 {
63 logger.error("no url for the iViaServer element");
64 return false;
65 }
66 Element tq_service = this.doc.createElement(GSXML.SERVICE_ELEM);
67 tq_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_QUERY);
68 tq_service.setAttribute(GSXML.NAME_ATT, TEXT_QUERY_SERVICE);
69 this.short_service_info.appendChild(tq_service);
70
71 Element dc_service = this.doc.createElement(GSXML.SERVICE_ELEM);
72 dc_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
73 dc_service.setAttribute(GSXML.NAME_ATT, DOC_CONTENT_SERVICE);
74 this.short_service_info.appendChild(dc_service);
75
76 Element dm_service = this.doc.createElement(GSXML.SERVICE_ELEM);
77 dm_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
78 dm_service.setAttribute(GSXML.NAME_ATT, DOC_META_SERVICE);
79 this.short_service_info.appendChild(dm_service);
80
81 //
82 // add some format info to service map if there is any
83 String path = GSPath.appendLink(GSXML.SEARCH_ELEM, GSXML.FORMAT_ELEM);
84 Element format = (Element) GSXML.getNodeByPath(extra_info, path);
85 if (format != null)
86 {
87 this.format_info_map.put(TEXT_QUERY_SERVICE, this.doc.importNode(format, true));
88 }
89
90 // look for document display format
91 path = GSPath.appendLink(GSXML.DISPLAY_ELEM, GSXML.FORMAT_ELEM);
92 Element display_format = (Element) GSXML.getNodeByPath(extra_info, path);
93 if (display_format != null)
94 {
95 this.format_info_map.put(DOC_CONTENT_SERVICE, this.doc.importNode(display_format, true));
96 // shoudl we make a copy?
97 }
98
99 return true;
100
101 }
102
103 protected Element getServiceDescription(String service, String lang, String subset)
104 {
105
106 if (service.equals(TEXT_QUERY_SERVICE))
107 {
108 Element tq_service = this.doc.createElement(GSXML.SERVICE_ELEM);
109 tq_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_QUERY);
110 tq_service.setAttribute(GSXML.NAME_ATT, TEXT_QUERY_SERVICE);
111 if (subset == null || subset.equals(GSXML.DISPLAY_TEXT_ELEM + GSXML.LIST_MODIFIER))
112 {
113 tq_service.appendChild(GSXML.createDisplayTextElement(this.doc, GSXML.DISPLAY_TEXT_NAME, getTextString(TEXT_QUERY_SERVICE + ".name", lang)));
114 tq_service.appendChild(GSXML.createDisplayTextElement(this.doc, GSXML.DISPLAY_TEXT_SUBMIT, getTextString(TEXT_QUERY_SERVICE + ".submit", lang)));
115 tq_service.appendChild(GSXML.createDisplayTextElement(this.doc, GSXML.DISPLAY_TEXT_DESCRIPTION, getTextString(TEXT_QUERY_SERVICE + ".description", lang)));
116 }
117 if (subset == null || subset.equals(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER))
118 {
119 Element param_list = this.doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
120 tq_service.appendChild(param_list);
121 Element param = GSXML.createParameterDescription(this.doc, QUERY_PARAM, getTextString("param." + QUERY_PARAM, lang), GSXML.PARAM_TYPE_STRING, null, null, null);
122 param_list.appendChild(param);
123 String[] field_ids = { "kw", "au", "su", "ti", "de", "fu" };
124 String[] field_names = { getTextString("param." + FIELD_PARAM + ".kw", lang), getTextString("param." + FIELD_PARAM + ".au", lang), getTextString("param." + FIELD_PARAM + ".su", lang), getTextString("param." + FIELD_PARAM + ".ti", lang), getTextString("param." + FIELD_PARAM + ".de", lang), getTextString("param." + FIELD_PARAM + ".fu", lang) };
125
126 param = GSXML.createParameterDescription(this.doc, FIELD_PARAM, getTextString("param." + FIELD_PARAM, lang), GSXML.PARAM_TYPE_ENUM_MULTI, "kw,au,su,ti,de,fu", field_ids, field_names);
127 param_list.appendChild(param);
128
129 String[] hits_options = { "10", "30", "50" };
130 param = GSXML.createParameterDescription(this.doc, GS_HITS_PARAM, getTextString("param." + GS_HITS_PARAM, lang), GSXML.PARAM_TYPE_ENUM_SINGLE, "10", hits_options, hits_options);
131 param_list.appendChild(param);
132
133 param = GSXML.createParameterDescription(this.doc, GS_START_PAGE_PARAM, "", GSXML.PARAM_TYPE_INVISIBLE, "1", null, null);
134 param_list.appendChild(param);
135 }
136 return tq_service;
137 }
138 if (service.equals(DOC_META_SERVICE))
139 {
140 Element dm_service = this.doc.createElement(GSXML.SERVICE_ELEM);
141 dm_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
142 dm_service.setAttribute(GSXML.NAME_ATT, DOC_META_SERVICE);
143 return dm_service;
144
145 }
146 if (service.equals(DOC_CONTENT_SERVICE))
147 {
148 Element dc_service = this.doc.createElement(GSXML.SERVICE_ELEM);
149 dc_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
150 dc_service.setAttribute(GSXML.NAME_ATT, DOC_CONTENT_SERVICE);
151 return dc_service;
152
153 }
154 return null;
155 }
156
157 /** Process a text query - implemented by concrete subclasses */
158 protected Element processTextQuery(Element request)
159 {
160
161 // Create a new (empty) result message
162 Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
163 result.setAttribute(GSXML.FROM_ATT, TEXT_QUERY_SERVICE);
164 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
165 Element doc_node_list = this.doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
166 result.appendChild(doc_node_list);
167
168 Element param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
169 if (param_list == null)
170 {
171 logger.error("TextQuery request had no paramList.");
172 return result; // Return the empty result
173 }
174
175 // Process the request parameters
176 HashMap<String, Serializable> params = GSXML.extractParams(param_list, false);
177
178 // Make sure a query has been specified
179 String query = (String) params.get(QUERY_PARAM);
180 if (query == null || query.equals(""))
181 {
182 return result; // Return the empty result
183 }
184 // tidy whitespace
185 query = query.replaceAll("\\s+", "+");
186 String url_string = ivia_server_url + "/cgi-bin/canned_search?theme=gsdl3&query=" + query;
187
188 // check for fields
189 String fields = (String) params.get(FIELD_PARAM);
190 if (fields != null && !fields.equals(""))
191 {
192 url_string += "&fields=" + fields;
193 }
194
195 //check for hits per page
196 String hits_per_page = (String) params.get(GS_HITS_PARAM);
197 if (hits_per_page != null && !hits_per_page.equals(""))
198 {
199 url_string += "&" + IM_HITS_PARAM + "=" + hits_per_page;
200 }
201
202 // check for start page
203 String start_page = (String) params.get(GS_START_PAGE_PARAM);
204 if (start_page != null && !start_page.equals(""))
205 {
206 url_string += "&" + IM_START_PAGE_PARAM + "=" + start_page;
207 }
208 String results_num = null;
209 String doc_ids = null;
210 try
211 {
212 logger.debug("IViaProxy, sending " + url_string);
213 BufferedReader reader = makeConnection(url_string);
214 results_num = reader.readLine();
215 doc_ids = reader.readLine();
216
217 }
218 catch (Exception e)
219 {
220 logger.error("exception happened during query");
221 e.printStackTrace();
222 return result;
223 }
224
225 if (results_num.startsWith("Resources: "))
226 {
227 results_num = results_num.substring(11);
228 }
229 else
230 {
231 logger.error("badly formatted results line: " + results_num);
232 return result;
233 }
234 if (doc_ids.startsWith("Ids: "))
235 {
236 doc_ids = doc_ids.substring(5).trim();
237 }
238 else
239 {
240 logger.error("badly formatted docs line: " + doc_ids);
241 return result;
242 }
243
244 // get the num docs and add to a metadata list
245 Element metadata_list = this.doc.createElement(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER);
246 result.appendChild(metadata_list);
247
248 // Add a metadata element specifying the number of matching documents
249 long numdocs = Long.parseLong(results_num);
250 GSXML.addMetadata(this.doc, metadata_list, "numDocsMatched", "" + numdocs);
251 String[] ids = doc_ids.split(" ");
252
253 for (int d = 0; d < ids.length; d++)
254 {
255 Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
256 doc_node.setAttribute(GSXML.NODE_ID_ATT, ids[d]);
257 doc_node_list.appendChild(doc_node);
258 }
259 logger.debug("IViaProxy result:");
260 logger.debug(this.converter.getString(result));
261 return result;
262
263 }
264
265 protected Element processDocumentMetadataRetrieve(Element request)
266 {
267 Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
268 result.setAttribute(GSXML.FROM_ATT, DOC_META_SERVICE);
269 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
270
271 // Get the parameters of the request
272 Element param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
273 if (param_list == null)
274 {
275 logger.error("missing paramList.\n");
276 return result; // Return the empty result
277 }
278
279 // The metadata information required
280 StringBuffer field_list = new StringBuffer();
281 Element param = GSXML.getFirstElementChild(param_list);//(Element) param_list.getFirstChild();
282 while (param != null)
283 {
284 // Identify the metadata information desired
285 if (param.getAttribute(GSXML.NAME_ATT).equals("metadata"))
286 {
287 String metadata = GSXML.getValue(param);
288 if (isAcceptableMetadata(metadata))
289 {
290 field_list.append(metadata);
291 field_list.append(",");
292 }
293 }
294 param = (Element) param.getNextSibling();
295 }
296
297 if (field_list.length() == 0)
298 {
299 logger.error("no metadata specified.\n");
300 return result;
301 }
302
303 // Get the documents
304 Element request_node_list = (Element) GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
305 if (request_node_list == null)
306 {
307 logger.error("DocumentMetadataRetrieve request had no " + GSXML.DOC_NODE_ELEM + "List.\n");
308 return result;
309 }
310
311 StringBuffer record_id_list = new StringBuffer();
312
313 NodeList request_nodes = request_node_list.getChildNodes();
314 for (int i = 0; i < request_nodes.getLength(); i++)
315 {
316 Element request_node = (Element) request_nodes.item(i);
317 String node_id = request_node.getAttribute(GSXML.NODE_ID_ATT);
318 record_id_list.append(node_id);
319 record_id_list.append(",");
320 }
321
322 // do the query to the iVia server
323 String url_string = ivia_server_url + "/cgi-bin/view_record_set?theme=gsdl3&record_id_list=" + record_id_list.toString() + "&field_list=" + field_list.toString();
324
325 Element node_list = this.doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
326 result.appendChild(node_list);
327 try
328 {
329 BufferedReader reader = makeConnection(url_string);
330 String line;
331 while ((line = reader.readLine()) != null)
332 {
333 if (!line.startsWith("Record:"))
334 {
335 continue;
336 }
337 // the first line is the record
338 line = line.substring(8);
339 Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
340 doc_node.setAttribute(GSXML.NODE_ID_ATT, line);
341 Element meta_list = this.doc.createElement(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER);
342 doc_node.appendChild(meta_list);
343 while ((line = reader.readLine()) != null)
344 {
345 //metadata entry
346 int col_pos = line.indexOf(':');
347 if (col_pos == -1)
348 {
349 // end of the metadata for this doc
350 break;
351 }
352 String name = line.substring(0, col_pos);
353 String value = line.substring(col_pos + 2); // includes a space
354 GSXML.addMetadata(this.doc, meta_list, name, value);
355 }
356 node_list.appendChild(doc_node);
357
358 }
359 }
360 catch (Exception e)
361 {
362 logger.error("exception happened");
363 e.printStackTrace();
364 }
365 logger.debug("IViaProxy: returning result: ");
366 logger.debug(this.converter.getPrettyString(result));
367 return result;
368
369 }
370
371 protected Element processDocumentContentRetrieve(Element request)
372 {
373 Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
374 result.setAttribute(GSXML.FROM_ATT, DOC_CONTENT_SERVICE);
375 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
376
377 // Get the request doc_list
378 Element query_doc_list = (Element) GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
379 if (query_doc_list == null)
380 {
381 logger.error("DocumentContentRetrieve request specified no doc nodes.\n");
382 return result;
383 }
384
385 Element doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
386 result.appendChild(doc_list);
387
388 // Get the documents
389 String[] doc_ids = GSXML.getAttributeValuesFromList(query_doc_list, GSXML.NODE_ID_ATT);
390 for (int i = 0; i < doc_ids.length; i++)
391 {
392 String doc_id = doc_ids[i];
393 Element doc_node = getDocument(doc_id);
394 doc_list.appendChild(doc_node);
395 }
396 return result;
397
398 }
399
400 /**
401 * gets a document by sending a request to iVia, then processes it and
402 * creates a documentNode around the text
403 */
404 protected Element getDocument(String doc_id)
405 {
406
407 String url_string = ivia_server_url + "/cgi-bin/view_record?theme=gsdl3&record_id=" + doc_id;
408 StringBuffer buffer = new StringBuffer();
409 try
410 {
411 BufferedReader reader = makeConnection(url_string);
412
413 String line;
414 while ((line = reader.readLine()) != null)
415 {
416 buffer.append(line);
417 }
418
419 }
420 catch (Exception e)
421 {
422 logger.error("exception happened");
423 e.printStackTrace();
424 }
425
426 String node_content = buffer.toString();
427 String escaped_content = GSXML.xmlSafe(node_content);
428
429 StringBuffer processed_content = new StringBuffer(escaped_content.length());
430 processed_content.append("<nodeContent>");
431 int pos = 0;
432 int lastpos = 0;
433 while ((pos = escaped_content.indexOf("&lt;a ", lastpos)) != -1)
434 {
435 processed_content.append(escaped_content.substring(lastpos, pos));
436 int endpos = escaped_content.indexOf("&lt;/a&gt;", pos);
437 if (endpos == -1)
438 {
439 break;
440 }
441 String link = escaped_content.substring(pos, endpos + 10);
442 link = convertLink(link);
443 processed_content.append(link);
444 lastpos = endpos + 10;
445 }
446 processed_content.append(escaped_content.substring(lastpos)); // get the last bit
447 processed_content.append("</nodeContent>");
448
449 Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
450 doc_node.setAttribute(GSXML.NODE_ID_ATT, doc_id);
451
452 Document content_doc = this.converter.getDOM(processed_content.toString());
453 if (content_doc != null)
454 {
455 Element content_element = content_doc.getDocumentElement();
456 doc_node.appendChild(this.doc.importNode(content_element, true));
457 }
458 else
459 {
460 logger.error("Couldn't parse the node content");
461 }
462 return doc_node;
463
464 }
465
466 /** converts a url from an <a> element into a greenstone suitable one */
467 protected String convertLink(String aref)
468 {
469
470 if (aref.indexOf("href=&quot;http") != -1)
471 {
472 return aref; // an external link
473 }
474 String type = "other";
475 if (aref.indexOf("/cgi-bin/canned_search") != -1)
476 {
477 type = "query";
478 }
479 else if (aref.indexOf("/cgi-bin/click_through") != -1)
480 {
481 type = "external";
482 }
483 else if (aref.indexOf("/cgi-bin/view_record") != -1)
484 {
485 type = "document";
486 }
487
488 int href_start = aref.indexOf("href=&quot;") + 11;
489 int href_end = aref.indexOf("&gt;", href_start);
490 String href = aref.substring(href_start, href_end);
491 String link_content = aref.substring(href_end + 4, aref.length() - 10);
492
493 if (type.equals("external"))
494 {
495 // the external link is everything after the http at the end.
496 String address = href.substring(href.lastIndexOf("http"));
497 address = address.replaceAll("%3[aA]", ":");
498 address = address.replaceAll("%2[fF]", "/");
499
500 return "&lt;a href=\"" + address + "\"&gt;" + link_content + "&lt;/a&gt;";
501 }
502 if (type.equals("other"))
503 {
504 return "other type of link (" + link_content + ")";
505 }
506 StringBuffer result = new StringBuffer();
507 result.append("<link type='");
508 result.append(type);
509 result.append("'");
510 if (type.equals("query"))
511 {
512 result.append(" service='TextQuery'");
513 }
514 result.append(">");
515 // add in the parameters
516 href = href.substring(href.indexOf("?") + 1);
517 String[] params = href.split("&amp;");
518 for (int i = 0; i < params.length; i++)
519 {
520 String param = params[i];
521 int eq_pos = param.indexOf("=");
522 if (eq_pos != -1)
523 {
524
525 result.append("<param name='" + param.substring(0, eq_pos) + "' value='" + param.substring(eq_pos + 1) + "'/>");
526 }
527 }
528 result.append(link_content);
529 result.append("</link>");
530
531 return result.toString();
532
533 }
534
535 // iVia craps out if we ask for a metadata which is not valid. So need
536 // to make sure we only ask for acceptable fields.
537 protected boolean isAcceptableMetadata(String meta)
538 {
539 String valid_metadata = ",title,url,ivia_description,keywords,subjects,";
540 if (valid_metadata.indexOf("," + meta + ",") != -1)
541 {
542 return true;
543 }
544 return false;
545 }
546
547 protected BufferedReader makeConnection(String url_string)
548 {
549 BufferedReader reader = null;
550 try
551 {
552 URL url = new URL(url_string);
553 HttpURLConnection connection = (HttpURLConnection) url.openConnection();
554 InputStream input = connection.getInputStream();
555 reader = new BufferedReader(new InputStreamReader(input));
556 }
557 catch (java.net.MalformedURLException e)
558 {
559
560 logger.error("Malformed URL: " + url_string);
561 }
562 catch (java.io.IOException e)
563 {
564 logger.error("An error occurred during IO to url " + url_string);
565 }
566 return reader;
567 }
568
569}
Note: See TracBrowser for help on using the repository browser.