source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/service/IViaProxy.java@ 25727

Last change on this file since 25727 was 25727, checked in by kjdon, 12 years ago

getting rid of my email address

  • Property svn:keywords set to Author Date Id Revision
File size: 18.1 KB
Line 
1package org.greenstone.gsdl3.service;
2
3// Greenstone classes
4import org.greenstone.gsdl3.util.*;
5
6// XML classes
7import org.w3c.dom.Element;
8import org.w3c.dom.Document;
9import org.w3c.dom.NodeList;
10
11import java.util.HashMap;
12import java.io.File;
13import java.io.InputStream;
14import java.io.BufferedReader;
15import java.io.InputStreamReader;
16import java.io.IOException;
17import java.io.Serializable;
18import java.net.HttpURLConnection;
19import java.net.URLConnection;
20import java.net.URL;
21import java.net.Authenticator;
22import java.net.MalformedURLException;
23
24import org.apache.log4j.*;
25
26/**
27 *
28 * @author Katherine Don
29 * @version $Revision: 25727 $
30 */
31
32public class IViaProxy
33 extends ServiceRack {
34
35 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.service.IViaProxy.class.getName());
36
37 // the services on offer
38 // these strings must match what is found in the properties file
39 protected static final String TEXT_QUERY_SERVICE = "TextQuery";
40 protected static final String DOC_CONTENT_SERVICE = "DocumentContentRetrieve";
41 protected static final String DOC_META_SERVICE = "DocumentMetadataRetrieve";
42 protected static final String QUERY_PARAM = "query";
43 protected static final String FIELD_PARAM = "fields";
44 // have standard gs param names for hits per page, and start page
45 // these need to be mapped to iVia params
46 protected static final String GS_HITS_PARAM = "hitsPerPage";
47 protected static final String IM_HITS_PARAM = "no_of_records_per_page";
48 protected static final String GS_START_PAGE_PARAM = "startPage";
49 protected static final String IM_START_PAGE_PARAM = "start_page_no";
50
51 protected String ivia_server_url = null;
52
53 public boolean configure(Element info, Element extra_info) {
54
55 if (!super.configure(info, extra_info)){
56 return false;
57 }
58
59 Element server_elem = (Element)GSXML.getChildByTagName(info, "iViaServer");
60 if (server_elem == null) {
61 logger.error("no iViaServer element found");
62 return false;
63 }
64 ivia_server_url = server_elem.getAttribute("url");
65 if (ivia_server_url.equals("")) {
66 logger.error("no url for the iViaServer element");
67 return false;
68 }
69 Element tq_service = this.doc.createElement(GSXML.SERVICE_ELEM);
70 tq_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_QUERY);
71 tq_service.setAttribute(GSXML.NAME_ATT, TEXT_QUERY_SERVICE);
72 this.short_service_info.appendChild(tq_service);
73
74 Element dc_service = this.doc.createElement(GSXML.SERVICE_ELEM);
75 dc_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
76 dc_service.setAttribute(GSXML.NAME_ATT, DOC_CONTENT_SERVICE);
77 this.short_service_info.appendChild(dc_service);
78
79 Element dm_service = this.doc.createElement(GSXML.SERVICE_ELEM);
80 dm_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
81 dm_service.setAttribute(GSXML.NAME_ATT, DOC_META_SERVICE);
82 this.short_service_info.appendChild(dm_service);
83
84 //
85 // add some format info to service map if there is any
86 String path = GSPath.appendLink(GSXML.SEARCH_ELEM, GSXML.FORMAT_ELEM);
87 Element format = (Element) GSXML.getNodeByPath(extra_info, path);
88 if (format != null) {
89 this.format_info_map.put(TEXT_QUERY_SERVICE, this.doc.importNode(format, true));
90 }
91
92
93 // look for document display format
94 path = GSPath.appendLink(GSXML.DISPLAY_ELEM, GSXML.FORMAT_ELEM);
95 Element display_format = (Element)GSXML.getNodeByPath(extra_info, path);
96 if (display_format != null) {
97 this.format_info_map.put(DOC_CONTENT_SERVICE, this.doc.importNode(display_format, true));
98 // shoudl we make a copy?
99 }
100
101 return true;
102
103 }
104
105 protected Element getServiceDescription(String service, String lang, String subset) {
106
107 if (service.equals(TEXT_QUERY_SERVICE)) {
108 Element tq_service = this.doc.createElement(GSXML.SERVICE_ELEM);
109 tq_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_QUERY);
110 tq_service.setAttribute(GSXML.NAME_ATT, TEXT_QUERY_SERVICE);
111 if (subset == null || subset.equals(GSXML.DISPLAY_TEXT_ELEM + GSXML.LIST_MODIFIER)) {
112 tq_service.appendChild(GSXML.createDisplayTextElement(this.doc, GSXML.DISPLAY_TEXT_NAME, getTextString(TEXT_QUERY_SERVICE+".name", lang)));
113 tq_service.appendChild(GSXML.createDisplayTextElement(this.doc, GSXML.DISPLAY_TEXT_SUBMIT, getTextString(TEXT_QUERY_SERVICE+".submit", lang)));
114 tq_service.appendChild(GSXML.createDisplayTextElement(this.doc, GSXML.DISPLAY_TEXT_DESCRIPTION, getTextString(TEXT_QUERY_SERVICE+".description", lang)));
115 }
116 if (subset == null || subset.equals(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER)) {
117 Element param_list = this.doc.createElement(GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
118 tq_service.appendChild(param_list);
119 Element param = GSXML.createParameterDescription(this.doc, QUERY_PARAM, getTextString("param."+QUERY_PARAM, lang), GSXML.PARAM_TYPE_STRING, null, null, null);
120 param_list.appendChild(param);
121 String [] field_ids = {"kw", "au", "su", "ti", "de", "fu"};
122 String [] field_names = {
123 getTextString("param."+FIELD_PARAM+".kw", lang),
124 getTextString("param."+FIELD_PARAM+".au", lang),
125 getTextString("param."+FIELD_PARAM+".su", lang),
126 getTextString("param."+FIELD_PARAM+".ti", lang),
127 getTextString("param."+FIELD_PARAM+".de", lang),
128 getTextString("param."+FIELD_PARAM+".fu", lang) };
129
130 param = GSXML.createParameterDescription(this.doc, FIELD_PARAM, getTextString("param."+FIELD_PARAM, lang), GSXML.PARAM_TYPE_ENUM_MULTI, "kw,au,su,ti,de,fu", field_ids, field_names);
131 param_list.appendChild(param);
132
133
134 String [] hits_options = {"10", "30", "50"};
135 param = GSXML.createParameterDescription(this.doc, GS_HITS_PARAM, getTextString("param."+GS_HITS_PARAM, lang), GSXML.PARAM_TYPE_ENUM_SINGLE, "10", hits_options, hits_options);
136 param_list.appendChild(param);
137
138 param = GSXML.createParameterDescription(this.doc, GS_START_PAGE_PARAM, "", GSXML.PARAM_TYPE_INVISIBLE, "1", null, null);
139 param_list.appendChild(param);
140 }
141 return tq_service;
142 }
143 if (service.equals(DOC_META_SERVICE)) {
144 Element dm_service = this.doc.createElement(GSXML.SERVICE_ELEM);
145 dm_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
146 dm_service.setAttribute(GSXML.NAME_ATT, DOC_META_SERVICE);
147 return dm_service;
148
149 }
150 if (service.equals(DOC_CONTENT_SERVICE)) {
151 Element dc_service = this.doc.createElement(GSXML.SERVICE_ELEM);
152 dc_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
153 dc_service.setAttribute(GSXML.NAME_ATT, DOC_CONTENT_SERVICE);
154 return dc_service;
155
156
157 }
158 return null;
159 }
160
161 /** Process a text query - implemented by concrete subclasses */
162 protected Element processTextQuery(Element request) {
163
164 // Create a new (empty) result message
165 Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
166 result.setAttribute(GSXML.FROM_ATT, TEXT_QUERY_SERVICE);
167 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
168 Element doc_node_list = this.doc.createElement(GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
169 result.appendChild(doc_node_list);
170
171
172 Element param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
173 if (param_list == null) {
174 logger.error("TextQuery request had no paramList.");
175 return result; // Return the empty result
176 }
177
178 // Process the request parameters
179 HashMap<String, Serializable> params = GSXML.extractParams(param_list, false);
180
181 // Make sure a query has been specified
182 String query = (String) params.get(QUERY_PARAM);
183 if (query == null || query.equals("")) {
184 return result; // Return the empty result
185 }
186 // tidy whitespace
187 query = query.replaceAll("\\s+", "+");
188 String url_string = ivia_server_url+"/cgi-bin/canned_search?theme=gsdl3&query="+query;
189
190 // check for fields
191 String fields = (String) params.get(FIELD_PARAM);
192 if (fields != null && !fields.equals("")) {
193 url_string += "&fields="+fields;
194 }
195
196 //check for hits per page
197 String hits_per_page = (String) params.get(GS_HITS_PARAM);
198 if (hits_per_page != null && !hits_per_page.equals("")) {
199 url_string += "&"+IM_HITS_PARAM+"="+hits_per_page;
200 }
201
202 // check for start page
203 String start_page = (String) params.get(GS_START_PAGE_PARAM);
204 if (start_page != null && !start_page.equals("")) {
205 url_string += "&"+IM_START_PAGE_PARAM+"="+start_page;
206 }
207 String results_num = null;
208 String doc_ids = null;
209 try {
210 logger.debug("IViaProxy, sending "+url_string);
211 BufferedReader reader = makeConnection(url_string);
212 results_num = reader.readLine();
213 doc_ids = reader.readLine();
214
215 } catch (Exception e) {
216 logger.error("exception happened during query");
217 e.printStackTrace();
218 return result;
219 }
220
221 if (results_num.startsWith("Resources: ")) {
222 results_num = results_num.substring(11);
223 } else {
224 logger.error("badly formatted results line: "+results_num);
225 return result;
226 }
227 if (doc_ids.startsWith("Ids: ")) {
228 doc_ids = doc_ids.substring(5).trim();
229 } else {
230 logger.error("badly formatted docs line: "+doc_ids);
231 return result;
232 }
233
234 // get the num docs and add to a metadata list
235 Element metadata_list = this.doc.createElement(GSXML.METADATA_ELEM+GSXML.LIST_MODIFIER);
236 result.appendChild(metadata_list);
237
238 // Add a metadata element specifying the number of matching documents
239 long numdocs = Long.parseLong(results_num);
240 GSXML.addMetadata(this.doc, metadata_list, "numDocsMatched", ""+numdocs);
241 String [] ids = doc_ids.split(" ");
242
243 for (int d=0; d<ids.length; d++) {
244 Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
245 doc_node.setAttribute(GSXML.NODE_ID_ATT, ids[d]);
246 doc_node_list.appendChild(doc_node);
247 }
248 logger.debug("IViaProxy result:");
249 logger.debug(this.converter.getString(result));
250 return result;
251
252 }
253
254 protected Element processDocumentMetadataRetrieve(Element request) {
255 Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
256 result.setAttribute(GSXML.FROM_ATT, DOC_META_SERVICE);
257 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
258
259 // Get the parameters of the request
260 Element param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
261 if (param_list == null) {
262 logger.error("missing paramList.\n");
263 return result; // Return the empty result
264 }
265
266 // The metadata information required
267 StringBuffer field_list = new StringBuffer();
268 Element param = GSXML.getFirstElementChild(param_list);//(Element) param_list.getFirstChild();
269 while (param != null) {
270 // Identify the metadata information desired
271 if (param.getAttribute(GSXML.NAME_ATT).equals("metadata")) {
272 String metadata = GSXML.getValue(param);
273 if (isAcceptableMetadata(metadata)) {
274 field_list.append(metadata);
275 field_list.append(",");
276 }
277 }
278 param = (Element) param.getNextSibling();
279 }
280
281 if (field_list.length()==0) {
282 logger.error("no metadata specified.\n");
283 return result;
284 }
285
286 // Get the documents
287 Element request_node_list = (Element) GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
288 if (request_node_list == null) {
289 logger.error("DocumentMetadataRetrieve request had no "+GSXML.DOC_NODE_ELEM+"List.\n");
290 return result;
291 }
292
293 StringBuffer record_id_list = new StringBuffer();
294
295 NodeList request_nodes = request_node_list.getChildNodes();
296 for (int i = 0; i < request_nodes.getLength(); i++) {
297 Element request_node = (Element) request_nodes.item(i);
298 String node_id = request_node.getAttribute(GSXML.NODE_ID_ATT);
299 record_id_list.append(node_id);
300 record_id_list.append(",");
301 }
302
303 // do the query to the iVia server
304 String url_string = ivia_server_url+"/cgi-bin/view_record_set?theme=gsdl3&record_id_list="+record_id_list.toString()+"&field_list="+field_list.toString();
305
306 Element node_list = this.doc.createElement(GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
307 result.appendChild(node_list);
308 try {
309 BufferedReader reader = makeConnection(url_string);
310 String line;
311 while ((line = reader.readLine()) != null) {
312 if (!line.startsWith("Record:")) {
313 continue;
314 }
315 // the first line is the record
316 line=line.substring(8);
317 Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
318 doc_node.setAttribute(GSXML.NODE_ID_ATT, line);
319 Element meta_list = this.doc.createElement(GSXML.METADATA_ELEM+GSXML.LIST_MODIFIER);
320 doc_node.appendChild(meta_list);
321 while ((line = reader.readLine()) != null) {
322 //metadata entry
323 int col_pos = line.indexOf(':');
324 if (col_pos == -1) {
325 // end of the metadata for this doc
326 break;
327 }
328 String name = line.substring(0,col_pos);
329 String value = line.substring(col_pos+2); // includes a space
330 GSXML.addMetadata(this.doc, meta_list, name, value);
331 }
332 node_list.appendChild(doc_node);
333
334 }
335 } catch (Exception e) {
336 logger.error("exception happened");
337 e.printStackTrace();
338 }
339 logger.debug("IViaProxy: returning result: ");
340 logger.debug(this.converter.getPrettyString(result));
341 return result;
342
343 }
344
345 protected Element processDocumentContentRetrieve(Element request) {
346 Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
347 result.setAttribute(GSXML.FROM_ATT, DOC_CONTENT_SERVICE);
348 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
349
350 // Get the request doc_list
351 Element query_doc_list = (Element) GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
352 if (query_doc_list == null) {
353 logger.error("DocumentContentRetrieve request specified no doc nodes.\n");
354 return result;
355 }
356
357 Element doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
358 result.appendChild(doc_list);
359
360 // Get the documents
361 String[] doc_ids = GSXML.getAttributeValuesFromList(query_doc_list,
362 GSXML.NODE_ID_ATT);
363 for (int i = 0; i < doc_ids.length; i++) {
364 String doc_id = doc_ids[i];
365 Element doc_node = getDocument(doc_id);
366 doc_list.appendChild(doc_node);
367 }
368 return result;
369
370 }
371
372
373 /** gets a document by sending a request to iVia, then processes it and creates a documentNode around the text */
374 protected Element getDocument(String doc_id) {
375
376 String url_string = ivia_server_url+"/cgi-bin/view_record?theme=gsdl3&record_id="+doc_id;
377 StringBuffer buffer = new StringBuffer();
378 try {
379 BufferedReader reader = makeConnection(url_string);
380
381 String line;
382 while((line = reader.readLine())!= null) {
383 buffer.append(line);
384 }
385
386
387 } catch (Exception e) {
388 logger.error("exception happened");
389 e.printStackTrace();
390 }
391
392 String node_content = buffer.toString();
393 String escaped_content = GSXML.xmlSafe(node_content);
394
395 StringBuffer processed_content = new StringBuffer(escaped_content.length());
396 processed_content.append("<nodeContent>");
397 int pos = 0;
398 int lastpos = 0;
399 while ((pos = escaped_content.indexOf("&lt;a ", lastpos))!= -1) {
400 processed_content.append(escaped_content.substring(lastpos, pos));
401 int endpos = escaped_content.indexOf("&lt;/a&gt;", pos);
402 if (endpos == -1) {
403 break;
404 }
405 String link = escaped_content.substring(pos, endpos+10);
406 link = convertLink(link);
407 processed_content.append(link);
408 lastpos = endpos+10;
409 }
410 processed_content.append(escaped_content.substring(lastpos)); // get the last bit
411 processed_content.append("</nodeContent>");
412
413 Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
414 doc_node.setAttribute(GSXML.NODE_ID_ATT, doc_id);
415
416 Document content_doc = this.converter.getDOM(processed_content.toString());
417 if (content_doc != null) {
418 Element content_element = content_doc.getDocumentElement();
419 doc_node.appendChild(this.doc.importNode(content_element, true));
420 } else {
421 logger.error("Couldn't parse the node content");
422 }
423 return doc_node;
424
425 }
426
427 /** converts a url from an <a> element into a greenstone suitable one */
428 protected String convertLink(String aref) {
429
430 if (aref.indexOf("href=&quot;http") != -1) {
431 return aref; // an external link
432 }
433 String type = "other";
434 if (aref.indexOf("/cgi-bin/canned_search")!=-1) {
435 type="query";
436 } else if (aref.indexOf("/cgi-bin/click_through") != -1) {
437 type = "external";
438 } else if (aref.indexOf("/cgi-bin/view_record") != -1) {
439 type="document";
440 }
441
442 int href_start = aref.indexOf("href=&quot;")+11;
443 int href_end = aref.indexOf("&gt;", href_start);
444 String href = aref.substring(href_start, href_end);
445 String link_content = aref.substring(href_end+4, aref.length()-10);
446
447 if (type.equals("external")) {
448 // the external link is everything after the http at the end.
449 String address = href.substring(href.lastIndexOf("http"));
450 address = address.replaceAll("%3[aA]", ":");
451 address = address.replaceAll("%2[fF]", "/");
452
453 return "&lt;a href=\""+address+"\"&gt;"+link_content+"&lt;/a&gt;";
454 }
455 if (type.equals("other")) {
456 return "other type of link ("+link_content+")";
457 }
458 StringBuffer result = new StringBuffer();
459 result.append("<link type='");
460 result.append(type);
461 result.append("'");
462 if (type.equals("query")) {
463 result.append(" service='TextQuery'");
464 }
465 result.append(">");
466 // add in the parameters
467 href = href.substring(href.indexOf("?")+1);
468 String [] params = href.split("&amp;");
469 for (int i=0; i<params.length; i++) {
470 String param = params[i];
471 int eq_pos = param.indexOf("=");
472 if (eq_pos != -1) {
473
474 result.append("<param name='"+param.substring(0, eq_pos)+"' value='"+param.substring(eq_pos+1)+"'/>");
475 }
476 }
477 result.append(link_content);
478 result.append("</link>");
479
480 return result.toString();
481
482 }
483
484 // iVia craps out if we ask for a metadata which is not valid. So need
485 // to make sure we only ask for acceptable fields.
486 protected boolean isAcceptableMetadata(String meta) {
487 String valid_metadata = ",title,url,ivia_description,keywords,subjects,";
488 if (valid_metadata.indexOf(","+meta+",")!=-1) {
489 return true;
490 }
491 return false;
492 }
493 protected BufferedReader makeConnection(String url_string) {
494 BufferedReader reader = null;
495 try {
496 URL url = new URL(url_string);
497 HttpURLConnection connection = (HttpURLConnection)url.openConnection();
498 InputStream input = connection.getInputStream();
499 reader = new BufferedReader(new InputStreamReader(input));
500 } catch (java.net.MalformedURLException e) {
501
502 logger.error("Malformed URL: "+url_string);
503 } catch (java.io.IOException e) {
504 logger.error("An error occurred during IO to url "+url_string);
505 }
506 return reader;
507 }
508
509}
Note: See TracBrowser for help on using the repository browser.