source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/service/IViaProxy.java@ 28966

Last change on this file since 28966 was 28966, checked in by kjdon, 10 years ago

Lots of changes, mainly to do with removing this.doc from everywhere. Document is not thread safe, so now we tend to create a new Document every time we start a new page/message etc. In a service, this.desc_doc is available as the document for creating service info elements, but it should only be used for this and not for other messages. newDOM is now static in XMLConverter. Method parameter changes for some GSXML methods.

  • Property svn:keywords set to Author Date Id Revision
File size: 18.3 KB
Line 
1package org.greenstone.gsdl3.service;
2
3// Greenstone classes
4import java.io.BufferedReader;
5import java.io.InputStream;
6import java.io.InputStreamReader;
7import java.io.Serializable;
8import java.net.HttpURLConnection;
9import java.net.URL;
10import java.util.HashMap;
11
12import org.apache.log4j.Logger;
13import org.greenstone.gsdl3.util.GSPath;
14import org.greenstone.gsdl3.util.GSXML;
15import org.greenstone.gsdl3.util.XMLConverter;
16
17import org.w3c.dom.Document;
18import org.w3c.dom.Element;
19import org.w3c.dom.NodeList;
20
21/**
22 * Service class to proxy IVia
23 *
24 */
25
26public class IViaProxy extends ServiceRack
27{
28
29 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.service.IViaProxy.class.getName());
30
31 // the services on offer
32 // these strings must match what is found in the properties file
33 protected static final String TEXT_QUERY_SERVICE = "TextQuery";
34 protected static final String DOC_CONTENT_SERVICE = "DocumentContentRetrieve";
35 protected static final String DOC_META_SERVICE = "DocumentMetadataRetrieve";
36 protected static final String QUERY_PARAM = "query";
37 protected static final String FIELD_PARAM = "fields";
38 // have standard gs param names for hits per page, and start page
39 // these need to be mapped to iVia params
40 protected static final String GS_HITS_PARAM = "hitsPerPage";
41 protected static final String IM_HITS_PARAM = "no_of_records_per_page";
42 protected static final String GS_START_PAGE_PARAM = "startPage";
43 protected static final String IM_START_PAGE_PARAM = "start_page_no";
44
45 protected String ivia_server_url = null;
46
47 public boolean configure(Element info, Element extra_info)
48 {
49
50 if (!super.configure(info, extra_info))
51 {
52 return false;
53 }
54
55 Element server_elem = (Element) GSXML.getChildByTagName(info, "iViaServer");
56 if (server_elem == null)
57 {
58 logger.error("no iViaServer element found");
59 return false;
60 }
61 ivia_server_url = server_elem.getAttribute("url");
62 if (ivia_server_url.equals(""))
63 {
64 logger.error("no url for the iViaServer element");
65 return false;
66 }
67 Element tq_service = this.desc_doc.createElement(GSXML.SERVICE_ELEM);
68 tq_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_QUERY);
69 tq_service.setAttribute(GSXML.NAME_ATT, TEXT_QUERY_SERVICE);
70 this.short_service_info.appendChild(tq_service);
71
72 Element dc_service = this.desc_doc.createElement(GSXML.SERVICE_ELEM);
73 dc_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
74 dc_service.setAttribute(GSXML.NAME_ATT, DOC_CONTENT_SERVICE);
75 this.short_service_info.appendChild(dc_service);
76
77 Element dm_service = this.desc_doc.createElement(GSXML.SERVICE_ELEM);
78 dm_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
79 dm_service.setAttribute(GSXML.NAME_ATT, DOC_META_SERVICE);
80 this.short_service_info.appendChild(dm_service);
81
82 //
83 // add some format info to service map if there is any
84 String path = GSPath.appendLink(GSXML.SEARCH_ELEM, GSXML.FORMAT_ELEM);
85 Element format = (Element) GSXML.getNodeByPath(extra_info, path);
86 if (format != null)
87 {
88 this.format_info_map.put(TEXT_QUERY_SERVICE, this.desc_doc.importNode(format, true));
89 }
90
91 // look for document display format
92 path = GSPath.appendLink(GSXML.DISPLAY_ELEM, GSXML.FORMAT_ELEM);
93 Element display_format = (Element) GSXML.getNodeByPath(extra_info, path);
94 if (display_format != null)
95 {
96 this.format_info_map.put(DOC_CONTENT_SERVICE, this.desc_doc.importNode(display_format, true));
97 // shoudl we make a copy?
98 }
99
100 return true;
101
102 }
103
104 protected Element getServiceDescription(Document doc, String service, String lang, String subset)
105 {
106
107 if (service.equals(TEXT_QUERY_SERVICE))
108 {
109 Element tq_service = doc.createElement(GSXML.SERVICE_ELEM);
110 tq_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_QUERY);
111 tq_service.setAttribute(GSXML.NAME_ATT, TEXT_QUERY_SERVICE);
112 if (subset == null || subset.equals(GSXML.DISPLAY_TEXT_ELEM + GSXML.LIST_MODIFIER))
113 {
114 tq_service.appendChild(GSXML.createDisplayTextElement(doc, GSXML.DISPLAY_TEXT_NAME, getTextString(TEXT_QUERY_SERVICE + ".name", lang)));
115 tq_service.appendChild(GSXML.createDisplayTextElement(doc, GSXML.DISPLAY_TEXT_SUBMIT, getTextString(TEXT_QUERY_SERVICE + ".submit", lang)));
116 tq_service.appendChild(GSXML.createDisplayTextElement(doc, GSXML.DISPLAY_TEXT_DESCRIPTION, getTextString(TEXT_QUERY_SERVICE + ".description", lang)));
117 }
118 if (subset == null || subset.equals(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER))
119 {
120 Element param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
121 tq_service.appendChild(param_list);
122 Element param = GSXML.createParameterDescription(doc, QUERY_PARAM, getTextString("param." + QUERY_PARAM, lang), GSXML.PARAM_TYPE_STRING, null, null, null);
123 param_list.appendChild(param);
124 String[] field_ids = { "kw", "au", "su", "ti", "de", "fu" };
125 String[] field_names = { getTextString("param." + FIELD_PARAM + ".kw", lang), getTextString("param." + FIELD_PARAM + ".au", lang), getTextString("param." + FIELD_PARAM + ".su", lang), getTextString("param." + FIELD_PARAM + ".ti", lang), getTextString("param." + FIELD_PARAM + ".de", lang), getTextString("param." + FIELD_PARAM + ".fu", lang) };
126
127 param = GSXML.createParameterDescription(doc, FIELD_PARAM, getTextString("param." + FIELD_PARAM, lang), GSXML.PARAM_TYPE_ENUM_MULTI, "kw,au,su,ti,de,fu", field_ids, field_names);
128 param_list.appendChild(param);
129
130 String[] hits_options = { "10", "30", "50" };
131 param = GSXML.createParameterDescription(doc, GS_HITS_PARAM, getTextString("param." + GS_HITS_PARAM, lang), GSXML.PARAM_TYPE_ENUM_SINGLE, "10", hits_options, hits_options);
132 param_list.appendChild(param);
133
134 param = GSXML.createParameterDescription(doc, GS_START_PAGE_PARAM, "", GSXML.PARAM_TYPE_INVISIBLE, "1", null, null);
135 param_list.appendChild(param);
136 }
137 return tq_service;
138 }
139 if (service.equals(DOC_META_SERVICE))
140 {
141 Element dm_service = doc.createElement(GSXML.SERVICE_ELEM);
142 dm_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
143 dm_service.setAttribute(GSXML.NAME_ATT, DOC_META_SERVICE);
144 return dm_service;
145
146 }
147 if (service.equals(DOC_CONTENT_SERVICE))
148 {
149 Element dc_service = doc.createElement(GSXML.SERVICE_ELEM);
150 dc_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
151 dc_service.setAttribute(GSXML.NAME_ATT, DOC_CONTENT_SERVICE);
152 return dc_service;
153
154 }
155 return null;
156 }
157
158 /** Process a text query - implemented by concrete subclasses */
159 protected Element processTextQuery(Element request)
160 {
161 Document result_doc = XMLConverter.newDOM();
162 // Create a new (empty) result message
163 Element result = result_doc.createElement(GSXML.RESPONSE_ELEM);
164 result.setAttribute(GSXML.FROM_ATT, TEXT_QUERY_SERVICE);
165 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
166 Element doc_node_list = result_doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
167 result.appendChild(doc_node_list);
168
169 Element param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
170 if (param_list == null)
171 {
172 logger.error("TextQuery request had no paramList.");
173 return result; // Return the empty result
174 }
175
176 // Process the request parameters
177 HashMap<String, Serializable> params = GSXML.extractParams(param_list, false);
178
179 // Make sure a query has been specified
180 String query = (String) params.get(QUERY_PARAM);
181 if (query == null || query.equals(""))
182 {
183 return result; // Return the empty result
184 }
185 // tidy whitespace
186 query = query.replaceAll("\\s+", "+");
187 String url_string = ivia_server_url + "/cgi-bin/canned_search?theme=gsdl3&query=" + query;
188
189 // check for fields
190 String fields = (String) params.get(FIELD_PARAM);
191 if (fields != null && !fields.equals(""))
192 {
193 url_string += "&fields=" + fields;
194 }
195
196 //check for hits per page
197 String hits_per_page = (String) params.get(GS_HITS_PARAM);
198 if (hits_per_page != null && !hits_per_page.equals(""))
199 {
200 url_string += "&" + IM_HITS_PARAM + "=" + hits_per_page;
201 }
202
203 // check for start page
204 String start_page = (String) params.get(GS_START_PAGE_PARAM);
205 if (start_page != null && !start_page.equals(""))
206 {
207 url_string += "&" + IM_START_PAGE_PARAM + "=" + start_page;
208 }
209 String results_num = null;
210 String doc_ids = null;
211 try
212 {
213 logger.debug("IViaProxy, sending " + url_string);
214 BufferedReader reader = makeConnection(url_string);
215 results_num = reader.readLine();
216 doc_ids = reader.readLine();
217
218 }
219 catch (Exception e)
220 {
221 logger.error("exception happened during query");
222 e.printStackTrace();
223 return result;
224 }
225
226 if (results_num.startsWith("Resources: "))
227 {
228 results_num = results_num.substring(11);
229 }
230 else
231 {
232 logger.error("badly formatted results line: " + results_num);
233 return result;
234 }
235 if (doc_ids.startsWith("Ids: "))
236 {
237 doc_ids = doc_ids.substring(5).trim();
238 }
239 else
240 {
241 logger.error("badly formatted docs line: " + doc_ids);
242 return result;
243 }
244
245 // get the num docs and add to a metadata list
246 Element metadata_list = result_doc.createElement(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER);
247 result.appendChild(metadata_list);
248
249 // Add a metadata element specifying the number of matching documents
250 long numdocs = Long.parseLong(results_num);
251 GSXML.addMetadata(metadata_list, "numDocsMatched", "" + numdocs);
252 String[] ids = doc_ids.split(" ");
253
254 for (int d = 0; d < ids.length; d++)
255 {
256 Element doc_node = result_doc.createElement(GSXML.DOC_NODE_ELEM);
257 doc_node.setAttribute(GSXML.NODE_ID_ATT, ids[d]);
258 doc_node_list.appendChild(doc_node);
259 }
260 logger.debug("IViaProxy result:");
261 logger.debug(this.converter.getString(result));
262 return result;
263
264 }
265
266 protected Element processDocumentMetadataRetrieve(Element request)
267 {
268 Document result_doc = XMLConverter.newDOM();
269 Element result = result_doc.createElement(GSXML.RESPONSE_ELEM);
270 result.setAttribute(GSXML.FROM_ATT, DOC_META_SERVICE);
271 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
272
273 // Get the parameters of the request
274 Element param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
275 if (param_list == null)
276 {
277 logger.error("missing paramList.\n");
278 return result; // Return the empty result
279 }
280
281 // The metadata information required
282 StringBuffer field_list = new StringBuffer();
283 Element param = GSXML.getFirstElementChild(param_list);//(Element) param_list.getFirstChild();
284 while (param != null)
285 {
286 // Identify the metadata information desired
287 if (param.getAttribute(GSXML.NAME_ATT).equals("metadata"))
288 {
289 String metadata = GSXML.getValue(param);
290 if (isAcceptableMetadata(metadata))
291 {
292 field_list.append(metadata);
293 field_list.append(",");
294 }
295 }
296 param = (Element) param.getNextSibling();
297 }
298
299 if (field_list.length() == 0)
300 {
301 logger.error("no metadata specified.\n");
302 return result;
303 }
304
305 // Get the documents
306 Element request_node_list = (Element) GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
307 if (request_node_list == null)
308 {
309 logger.error("DocumentMetadataRetrieve request had no " + GSXML.DOC_NODE_ELEM + "List.\n");
310 return result;
311 }
312
313 StringBuffer record_id_list = new StringBuffer();
314
315 NodeList request_nodes = request_node_list.getChildNodes();
316 for (int i = 0; i < request_nodes.getLength(); i++)
317 {
318 Element request_node = (Element) request_nodes.item(i);
319 String node_id = request_node.getAttribute(GSXML.NODE_ID_ATT);
320 record_id_list.append(node_id);
321 record_id_list.append(",");
322 }
323
324 // do the query to the iVia server
325 String url_string = ivia_server_url + "/cgi-bin/view_record_set?theme=gsdl3&record_id_list=" + record_id_list.toString() + "&field_list=" + field_list.toString();
326
327 Element node_list = result_doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
328 result.appendChild(node_list);
329 try
330 {
331 BufferedReader reader = makeConnection(url_string);
332 String line;
333 while ((line = reader.readLine()) != null)
334 {
335 if (!line.startsWith("Record:"))
336 {
337 continue;
338 }
339 // the first line is the record
340 line = line.substring(8);
341 Element doc_node = result_doc.createElement(GSXML.DOC_NODE_ELEM);
342 doc_node.setAttribute(GSXML.NODE_ID_ATT, line);
343 Element meta_list = result_doc.createElement(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER);
344 doc_node.appendChild(meta_list);
345 while ((line = reader.readLine()) != null)
346 {
347 //metadata entry
348 int col_pos = line.indexOf(':');
349 if (col_pos == -1)
350 {
351 // end of the metadata for this doc
352 break;
353 }
354 String name = line.substring(0, col_pos);
355 String value = line.substring(col_pos + 2); // includes a space
356 GSXML.addMetadata(meta_list, name, value);
357 }
358 node_list.appendChild(doc_node);
359
360 }
361 }
362 catch (Exception e)
363 {
364 logger.error("exception happened");
365 e.printStackTrace();
366 }
367 logger.debug("IViaProxy: returning result: ");
368 logger.debug(this.converter.getPrettyString(result));
369 return result;
370
371 }
372
373 protected Element processDocumentContentRetrieve(Element request)
374 {
375 Document result_doc = XMLConverter.newDOM();
376 Element result = result_doc.createElement(GSXML.RESPONSE_ELEM);
377 result.setAttribute(GSXML.FROM_ATT, DOC_CONTENT_SERVICE);
378 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
379
380 // Get the request doc_list
381 Element query_doc_list = (Element) GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
382 if (query_doc_list == null)
383 {
384 logger.error("DocumentContentRetrieve request specified no doc nodes.\n");
385 return result;
386 }
387
388 Element doc_list = result_doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
389 result.appendChild(doc_list);
390
391 // Get the documents
392 String[] doc_ids = GSXML.getAttributeValuesFromList(query_doc_list, GSXML.NODE_ID_ATT);
393 for (int i = 0; i < doc_ids.length; i++)
394 {
395 String doc_id = doc_ids[i];
396 Element doc_node = getDocument(result_doc, doc_id);
397 doc_list.appendChild(doc_node);
398 }
399 return result;
400
401 }
402
403 /**
404 * gets a document by sending a request to iVia, then processes it and
405 * creates a documentNode around the text
406 */
407 protected Element getDocument(Document result_doc, String doc_id)
408 {
409
410 String url_string = ivia_server_url + "/cgi-bin/view_record?theme=gsdl3&record_id=" + doc_id;
411 StringBuffer buffer = new StringBuffer();
412 try
413 {
414 BufferedReader reader = makeConnection(url_string);
415
416 String line;
417 while ((line = reader.readLine()) != null)
418 {
419 buffer.append(line);
420 }
421
422 }
423 catch (Exception e)
424 {
425 logger.error("exception happened");
426 e.printStackTrace();
427 }
428
429 String node_content = buffer.toString();
430 String escaped_content = GSXML.xmlSafe(node_content);
431
432 StringBuffer processed_content = new StringBuffer(escaped_content.length());
433 processed_content.append("<nodeContent>");
434 int pos = 0;
435 int lastpos = 0;
436 while ((pos = escaped_content.indexOf("&lt;a ", lastpos)) != -1)
437 {
438 processed_content.append(escaped_content.substring(lastpos, pos));
439 int endpos = escaped_content.indexOf("&lt;/a&gt;", pos);
440 if (endpos == -1)
441 {
442 break;
443 }
444 String link = escaped_content.substring(pos, endpos + 10);
445 link = convertLink(link);
446 processed_content.append(link);
447 lastpos = endpos + 10;
448 }
449 processed_content.append(escaped_content.substring(lastpos)); // get the last bit
450 processed_content.append("</nodeContent>");
451
452 Element doc_node = result_doc.createElement(GSXML.DOC_NODE_ELEM);
453 doc_node.setAttribute(GSXML.NODE_ID_ATT, doc_id);
454
455 Document content_doc = this.converter.getDOM(processed_content.toString());
456 if (content_doc != null)
457 {
458 Element content_element = content_doc.getDocumentElement();
459 doc_node.appendChild(result_doc.importNode(content_element, true));
460 }
461 else
462 {
463 logger.error("Couldn't parse the node content");
464 }
465 return doc_node;
466
467 }
468
469 /** converts a url from an <a> element into a greenstone suitable one */
470 protected String convertLink(String aref)
471 {
472
473 if (aref.indexOf("href=&quot;http") != -1)
474 {
475 return aref; // an external link
476 }
477 String type = "other";
478 if (aref.indexOf("/cgi-bin/canned_search") != -1)
479 {
480 type = "query";
481 }
482 else if (aref.indexOf("/cgi-bin/click_through") != -1)
483 {
484 type = "external";
485 }
486 else if (aref.indexOf("/cgi-bin/view_record") != -1)
487 {
488 type = "document";
489 }
490
491 int href_start = aref.indexOf("href=&quot;") + 11;
492 int href_end = aref.indexOf("&gt;", href_start);
493 String href = aref.substring(href_start, href_end);
494 String link_content = aref.substring(href_end + 4, aref.length() - 10);
495
496 if (type.equals("external"))
497 {
498 // the external link is everything after the http at the end.
499 String address = href.substring(href.lastIndexOf("http"));
500 address = address.replaceAll("%3[aA]", ":");
501 address = address.replaceAll("%2[fF]", "/");
502
503 return "&lt;a href=\"" + address + "\"&gt;" + link_content + "&lt;/a&gt;";
504 }
505 if (type.equals("other"))
506 {
507 return "other type of link (" + link_content + ")";
508 }
509 StringBuffer result = new StringBuffer();
510 result.append("<link type='");
511 result.append(type);
512 result.append("'");
513 if (type.equals("query"))
514 {
515 result.append(" service='TextQuery'");
516 }
517 result.append(">");
518 // add in the parameters
519 href = href.substring(href.indexOf("?") + 1);
520 String[] params = href.split("&amp;");
521 for (int i = 0; i < params.length; i++)
522 {
523 String param = params[i];
524 int eq_pos = param.indexOf("=");
525 if (eq_pos != -1)
526 {
527
528 result.append("<param name='" + param.substring(0, eq_pos) + "' value='" + param.substring(eq_pos + 1) + "'/>");
529 }
530 }
531 result.append(link_content);
532 result.append("</link>");
533
534 return result.toString();
535
536 }
537
538 // iVia craps out if we ask for a metadata which is not valid. So need
539 // to make sure we only ask for acceptable fields.
540 protected boolean isAcceptableMetadata(String meta)
541 {
542 String valid_metadata = ",title,url,ivia_description,keywords,subjects,";
543 if (valid_metadata.indexOf("," + meta + ",") != -1)
544 {
545 return true;
546 }
547 return false;
548 }
549
550 protected BufferedReader makeConnection(String url_string)
551 {
552 BufferedReader reader = null;
553 try
554 {
555 URL url = new URL(url_string);
556 HttpURLConnection connection = (HttpURLConnection) url.openConnection();
557 InputStream input = connection.getInputStream();
558 reader = new BufferedReader(new InputStreamReader(input));
559 }
560 catch (java.net.MalformedURLException e)
561 {
562
563 logger.error("Malformed URL: " + url_string);
564 }
565 catch (java.io.IOException e)
566 {
567 logger.error("An error occurred during IO to url " + url_string);
568 }
569 return reader;
570 }
571
572}
Note: See TracBrowser for help on using the repository browser.