source: greenstone3/trunk/src/java/org/greenstone/gsdl3/service/IViaProxy.java@ 14224

Last change on this file since 14224 was 14224, checked in by xiao, 17 years ago

Changed getFirstChild() to getFirstElementChild(): if an extra line break or other whitespace precedes the first element child, getFirstChild() returns a non-element node, and casting it to Element causes a ClassCastException.

  • Property svn:keywords set to Author Date Id Revision
File size: 18.1 KB
Line 
1package org.greenstone.gsdl3.service;
2
3// Greenstone classes
4import org.greenstone.gsdl3.util.*;
5
6// XML classes
7import org.w3c.dom.Element;
8import org.w3c.dom.Document;
9import org.w3c.dom.NodeList;
10
11import java.util.HashMap;
12import java.io.File;
13import java.io.InputStream;
14import java.io.BufferedReader;
15import java.io.InputStreamReader;
16import java.io.IOException;
17import java.net.HttpURLConnection;
18import java.net.URLConnection;
19import java.net.URL;
20import java.net.Authenticator;
21import java.net.MalformedURLException;
22
23import org.apache.log4j.*;
24
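/**
 * A service rack that acts as a proxy to a remote iVia server. It offers the
 * TextQuery, DocumentContentRetrieve and DocumentMetadataRetrieve services by
 * sending HTTP requests to the iVia server's CGI scripts and converting the
 * plain-text/HTML responses into Greenstone XML.
 *
 * @author <a href="mailto:[email protected]">Katherine Don</a>
 * @version $Revision: 14224 $
 */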
30
31public class IViaProxy
32 extends ServiceRack {
33
34 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.service.IViaProxy.class.getName());
35
36 // the services on offer
37 // these strings must match what is found in the properties file
38 protected static final String TEXT_QUERY_SERVICE = "TextQuery";
39 protected static final String DOC_CONTENT_SERVICE = "DocumentContentRetrieve";
40 protected static final String DOC_META_SERVICE = "DocumentMetadataRetrieve";
41 protected static final String QUERY_PARAM = "query";
42 protected static final String FIELD_PARAM = "fields";
43 // have standard gs param names for hits per page, and start page
44 // these need to be mapped to iVia params
45 protected static final String GS_HITS_PARAM = "hitsPerPage";
46 protected static final String IM_HITS_PARAM = "no_of_records_per_page";
47 protected static final String GS_START_PAGE_PARAM = "startPage";
48 protected static final String IM_START_PAGE_PARAM = "start_page_no";
49
50 protected String ivia_server_url = null;
51
52 public boolean configure(Element info, Element extra_info) {
53
54 if (!super.configure(info, extra_info)){
55 return false;
56 }
57
58 Element server_elem = (Element)GSXML.getChildByTagName(info, "iViaServer");
59 if (server_elem == null) {
60 logger.error("no iViaServer element found");
61 return false;
62 }
63 ivia_server_url = server_elem.getAttribute("url");
64 if (ivia_server_url.equals("")) {
65 logger.error("no url for the iViaServer element");
66 return false;
67 }
68 Element tq_service = this.doc.createElement(GSXML.SERVICE_ELEM);
69 tq_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_QUERY);
70 tq_service.setAttribute(GSXML.NAME_ATT, TEXT_QUERY_SERVICE);
71 this.short_service_info.appendChild(tq_service);
72
73 Element dc_service = this.doc.createElement(GSXML.SERVICE_ELEM);
74 dc_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
75 dc_service.setAttribute(GSXML.NAME_ATT, DOC_CONTENT_SERVICE);
76 this.short_service_info.appendChild(dc_service);
77
78 Element dm_service = this.doc.createElement(GSXML.SERVICE_ELEM);
79 dm_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
80 dm_service.setAttribute(GSXML.NAME_ATT, DOC_META_SERVICE);
81 this.short_service_info.appendChild(dm_service);
82
83 //
84 // add some format info to service map if there is any
85 String path = GSPath.appendLink(GSXML.SEARCH_ELEM, GSXML.FORMAT_ELEM);
86 Element format = (Element) GSXML.getNodeByPath(extra_info, path);
87 if (format != null) {
88 this.format_info_map.put(TEXT_QUERY_SERVICE, this.doc.importNode(format, true));
89 }
90
91
92 // look for document display format
93 path = GSPath.appendLink(GSXML.DISPLAY_ELEM, GSXML.FORMAT_ELEM);
94 Element display_format = (Element)GSXML.getNodeByPath(extra_info, path);
95 if (display_format != null) {
96 this.format_info_map.put(DOC_CONTENT_SERVICE, this.doc.importNode(display_format, true));
97 // shoudl we make a copy?
98 }
99
100 return true;
101
102 }
103
104 protected Element getServiceDescription(String service, String lang, String subset) {
105
106 if (service.equals(TEXT_QUERY_SERVICE)) {
107 Element tq_service = this.doc.createElement(GSXML.SERVICE_ELEM);
108 tq_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_QUERY);
109 tq_service.setAttribute(GSXML.NAME_ATT, TEXT_QUERY_SERVICE);
110 if (subset == null || subset.equals(GSXML.DISPLAY_TEXT_ELEM + GSXML.LIST_MODIFIER)) {
111 tq_service.appendChild(GSXML.createDisplayTextElement(this.doc, GSXML.DISPLAY_TEXT_NAME, getTextString(TEXT_QUERY_SERVICE+".name", lang)));
112 tq_service.appendChild(GSXML.createDisplayTextElement(this.doc, GSXML.DISPLAY_TEXT_SUBMIT, getTextString(TEXT_QUERY_SERVICE+".submit", lang)));
113 tq_service.appendChild(GSXML.createDisplayTextElement(this.doc, GSXML.DISPLAY_TEXT_DESCRIPTION, getTextString(TEXT_QUERY_SERVICE+".description", lang)));
114 }
115 if (subset == null || subset.equals(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER)) {
116 Element param_list = this.doc.createElement(GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
117 tq_service.appendChild(param_list);
118 Element param = GSXML.createParameterDescription(this.doc, QUERY_PARAM, getTextString("param."+QUERY_PARAM, lang), GSXML.PARAM_TYPE_STRING, null, null, null);
119 param_list.appendChild(param);
120 String [] field_ids = {"kw", "au", "su", "ti", "de", "fu"};
121 String [] field_names = {
122 getTextString("param."+FIELD_PARAM+".kw", lang),
123 getTextString("param."+FIELD_PARAM+".au", lang),
124 getTextString("param."+FIELD_PARAM+".su", lang),
125 getTextString("param."+FIELD_PARAM+".ti", lang),
126 getTextString("param."+FIELD_PARAM+".de", lang),
127 getTextString("param."+FIELD_PARAM+".fu", lang) };
128
129 param = GSXML.createParameterDescription(this.doc, FIELD_PARAM, getTextString("param."+FIELD_PARAM, lang), GSXML.PARAM_TYPE_ENUM_MULTI, "kw,au,su,ti,de,fu", field_ids, field_names);
130 param_list.appendChild(param);
131
132
133 String [] hits_options = {"10", "30", "50"};
134 param = GSXML.createParameterDescription(this.doc, GS_HITS_PARAM, getTextString("param."+GS_HITS_PARAM, lang), GSXML.PARAM_TYPE_ENUM_SINGLE, "10", hits_options, hits_options);
135 param_list.appendChild(param);
136
137 param = GSXML.createParameterDescription(this.doc, GS_START_PAGE_PARAM, "", GSXML.PARAM_TYPE_INVISIBLE, "1", null, null);
138 param_list.appendChild(param);
139 }
140 return tq_service;
141 }
142 if (service.equals(DOC_META_SERVICE)) {
143 Element dm_service = this.doc.createElement(GSXML.SERVICE_ELEM);
144 dm_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
145 dm_service.setAttribute(GSXML.NAME_ATT, DOC_META_SERVICE);
146 return dm_service;
147
148 }
149 if (service.equals(DOC_CONTENT_SERVICE)) {
150 Element dc_service = this.doc.createElement(GSXML.SERVICE_ELEM);
151 dc_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
152 dc_service.setAttribute(GSXML.NAME_ATT, DOC_CONTENT_SERVICE);
153 return dc_service;
154
155
156 }
157 return null;
158 }
159
160 /** Process a text query - implemented by concrete subclasses */
161 protected Element processTextQuery(Element request) {
162
163 // Create a new (empty) result message
164 Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
165 result.setAttribute(GSXML.FROM_ATT, TEXT_QUERY_SERVICE);
166 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
167 Element doc_node_list = this.doc.createElement(GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
168 result.appendChild(doc_node_list);
169
170
171 Element param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
172 if (param_list == null) {
173 logger.error("TextQuery request had no paramList.");
174 return result; // Return the empty result
175 }
176
177 // Process the request parameters
178 HashMap params = GSXML.extractParams(param_list, false);
179
180 // Make sure a query has been specified
181 String query = (String) params.get(QUERY_PARAM);
182 if (query == null || query.equals("")) {
183 return result; // Return the empty result
184 }
185 // tidy whitespace
186 query = query.replaceAll("\\s+", "+");
187 String url_string = ivia_server_url+"/cgi-bin/canned_search?theme=gsdl3&query="+query;
188
189 // check for fields
190 String fields = (String) params.get(FIELD_PARAM);
191 if (fields != null && !fields.equals("")) {
192 url_string += "&fields="+fields;
193 }
194
195 //check for hits per page
196 String hits_per_page = (String) params.get(GS_HITS_PARAM);
197 if (hits_per_page != null && !hits_per_page.equals("")) {
198 url_string += "&"+IM_HITS_PARAM+"="+hits_per_page;
199 }
200
201 // check for start page
202 String start_page = (String) params.get(GS_START_PAGE_PARAM);
203 if (start_page != null && !start_page.equals("")) {
204 url_string += "&"+IM_START_PAGE_PARAM+"="+start_page;
205 }
206 String results_num = null;
207 String doc_ids = null;
208 try {
209 logger.debug("IViaProxy, sending "+url_string);
210 BufferedReader reader = makeConnection(url_string);
211 results_num = reader.readLine();
212 doc_ids = reader.readLine();
213
214 } catch (Exception e) {
215 logger.error("exception happened during query");
216 e.printStackTrace();
217 return result;
218 }
219
220 if (results_num.startsWith("Resources: ")) {
221 results_num = results_num.substring(11);
222 } else {
223 logger.error("badly formatted results line: "+results_num);
224 return result;
225 }
226 if (doc_ids.startsWith("Ids: ")) {
227 doc_ids = doc_ids.substring(5).trim();
228 } else {
229 logger.error("badly formatted docs line: "+doc_ids);
230 return result;
231 }
232
233 // get the num docs and add to a metadata list
234 Element metadata_list = this.doc.createElement(GSXML.METADATA_ELEM+GSXML.LIST_MODIFIER);
235 result.appendChild(metadata_list);
236
237 // Add a metadata element specifying the number of matching documents
238 long numdocs = Long.parseLong(results_num);
239 GSXML.addMetadata(this.doc, metadata_list, "numDocsMatched", ""+numdocs);
240 String [] ids = doc_ids.split(" ");
241
242 for (int d=0; d<ids.length; d++) {
243 Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
244 doc_node.setAttribute(GSXML.NODE_ID_ATT, ids[d]);
245 doc_node_list.appendChild(doc_node);
246 }
247 logger.debug("IViaProxy result:");
248 logger.debug(this.converter.getString(result));
249 return result;
250
251 }
252
253 protected Element processDocumentMetadataRetrieve(Element request) {
254 Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
255 result.setAttribute(GSXML.FROM_ATT, DOC_META_SERVICE);
256 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
257
258 // Get the parameters of the request
259 Element param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
260 if (param_list == null) {
261 logger.error("missing paramList.\n");
262 return result; // Return the empty result
263 }
264
265 // The metadata information required
266 StringBuffer field_list = new StringBuffer();
267 Element param = GSXML.getFirstElementChild(param_list);//(Element) param_list.getFirstChild();
268 while (param != null) {
269 // Identify the metadata information desired
270 if (param.getAttribute(GSXML.NAME_ATT).equals("metadata")) {
271 String metadata = GSXML.getValue(param);
272 if (isAcceptableMetadata(metadata)) {
273 field_list.append(metadata);
274 field_list.append(",");
275 }
276 }
277 param = (Element) param.getNextSibling();
278 }
279
280 if (field_list.length()==0) {
281 logger.error("no metadata specified.\n");
282 return result;
283 }
284
285 // Get the documents
286 Element request_node_list = (Element) GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
287 if (request_node_list == null) {
288 logger.error("DocumentMetadataRetrieve request had no "+GSXML.DOC_NODE_ELEM+"List.\n");
289 return result;
290 }
291
292 StringBuffer record_id_list = new StringBuffer();
293
294 NodeList request_nodes = request_node_list.getChildNodes();
295 for (int i = 0; i < request_nodes.getLength(); i++) {
296 Element request_node = (Element) request_nodes.item(i);
297 String node_id = request_node.getAttribute(GSXML.NODE_ID_ATT);
298 record_id_list.append(node_id);
299 record_id_list.append(",");
300 }
301
302 // do the query to the iVia server
303 String url_string = ivia_server_url+"/cgi-bin/view_record_set?theme=gsdl3&record_id_list="+record_id_list.toString()+"&field_list="+field_list.toString();
304
305 Element node_list = this.doc.createElement(GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
306 result.appendChild(node_list);
307 try {
308 BufferedReader reader = makeConnection(url_string);
309 String line;
310 while ((line = reader.readLine()) != null) {
311 if (!line.startsWith("Record:")) {
312 continue;
313 }
314 // the first line is the record
315 line=line.substring(8);
316 Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
317 doc_node.setAttribute(GSXML.NODE_ID_ATT, line);
318 Element meta_list = this.doc.createElement(GSXML.METADATA_ELEM+GSXML.LIST_MODIFIER);
319 doc_node.appendChild(meta_list);
320 while ((line = reader.readLine()) != null) {
321 //metadata entry
322 int col_pos = line.indexOf(':');
323 if (col_pos == -1) {
324 // end of the metadata for this doc
325 break;
326 }
327 String name = line.substring(0,col_pos);
328 String value = line.substring(col_pos+2); // includes a space
329 GSXML.addMetadata(this.doc, meta_list, name, value);
330 }
331 node_list.appendChild(doc_node);
332
333 }
334 } catch (Exception e) {
335 logger.error("exception happened");
336 e.printStackTrace();
337 }
338 logger.debug("IViaProxy: returning result: ");
339 logger.debug(this.converter.getPrettyString(result));
340 return result;
341
342 }
343
344 protected Element processDocumentContentRetrieve(Element request) {
345 Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
346 result.setAttribute(GSXML.FROM_ATT, DOC_CONTENT_SERVICE);
347 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
348
349 // Get the request doc_list
350 Element query_doc_list = (Element) GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
351 if (query_doc_list == null) {
352 logger.error("DocumentContentRetrieve request specified no doc nodes.\n");
353 return result;
354 }
355
356 Element doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
357 result.appendChild(doc_list);
358
359 // Get the documents
360 String[] doc_ids = GSXML.getAttributeValuesFromList(query_doc_list,
361 GSXML.NODE_ID_ATT);
362 for (int i = 0; i < doc_ids.length; i++) {
363 String doc_id = doc_ids[i];
364 Element doc_node = getDocument(doc_id);
365 doc_list.appendChild(doc_node);
366 }
367 return result;
368
369 }
370
371
372 /** gets a document by sending a request to iVia, then processes it and creates a documentNode around the text */
373 protected Element getDocument(String doc_id) {
374
375 String url_string = ivia_server_url+"/cgi-bin/view_record?theme=gsdl3&record_id="+doc_id;
376 StringBuffer buffer = new StringBuffer();
377 try {
378 BufferedReader reader = makeConnection(url_string);
379
380 String line;
381 while((line = reader.readLine())!= null) {
382 buffer.append(line);
383 }
384
385
386 } catch (Exception e) {
387 logger.error("exception happened");
388 e.printStackTrace();
389 }
390
391 String node_content = buffer.toString();
392 String escaped_content = GSXML.xmlSafe(node_content);
393
394 StringBuffer processed_content = new StringBuffer(escaped_content.length());
395 processed_content.append("<nodeContent>");
396 int pos = 0;
397 int lastpos = 0;
398 while ((pos = escaped_content.indexOf("&lt;a ", lastpos))!= -1) {
399 processed_content.append(escaped_content.substring(lastpos, pos));
400 int endpos = escaped_content.indexOf("&lt;/a&gt;", pos);
401 if (endpos == -1) {
402 break;
403 }
404 String link = escaped_content.substring(pos, endpos+10);
405 link = convertLink(link);
406 processed_content.append(link);
407 lastpos = endpos+10;
408 }
409 processed_content.append(escaped_content.substring(lastpos)); // get the last bit
410 processed_content.append("</nodeContent>");
411
412 Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
413 doc_node.setAttribute(GSXML.NODE_ID_ATT, doc_id);
414
415 Document content_doc = this.converter.getDOM(processed_content.toString());
416 if (content_doc != null) {
417 Element content_element = content_doc.getDocumentElement();
418 doc_node.appendChild(this.doc.importNode(content_element, true));
419 } else {
420 logger.error("Couldn't parse the node content");
421 }
422 return doc_node;
423
424 }
425
426 /** converts a url from an <a> element into a greenstone suitable one */
427 protected String convertLink(String aref) {
428
429 if (aref.indexOf("href=&quot;http") != -1) {
430 return aref; // an external link
431 }
432 String type = "other";
433 if (aref.indexOf("/cgi-bin/canned_search")!=-1) {
434 type="query";
435 } else if (aref.indexOf("/cgi-bin/click_through") != -1) {
436 type = "external";
437 } else if (aref.indexOf("/cgi-bin/view_record") != -1) {
438 type="document";
439 }
440
441 int href_start = aref.indexOf("href=&quot;")+11;
442 int href_end = aref.indexOf("&gt;", href_start);
443 String href = aref.substring(href_start, href_end);
444 String link_content = aref.substring(href_end+4, aref.length()-10);
445
446 if (type.equals("external")) {
447 // the external link is everything after the http at the end.
448 String address = href.substring(href.lastIndexOf("http"));
449 address = address.replaceAll("%3[aA]", ":");
450 address = address.replaceAll("%2[fF]", "/");
451
452 return "&lt;a href=\""+address+"\"&gt;"+link_content+"&lt;/a&gt;";
453 }
454 if (type.equals("other")) {
455 return "other type of link ("+link_content+")";
456 }
457 StringBuffer result = new StringBuffer();
458 result.append("<link type='");
459 result.append(type);
460 result.append("'");
461 if (type.equals("query")) {
462 result.append(" service='TextQuery'");
463 }
464 result.append(">");
465 // add in the parameters
466 href = href.substring(href.indexOf("?")+1);
467 String [] params = href.split("&amp;");
468 for (int i=0; i<params.length; i++) {
469 String param = params[i];
470 int eq_pos = param.indexOf("=");
471 if (eq_pos != -1) {
472
473 result.append("<param name='"+param.substring(0, eq_pos)+"' value='"+param.substring(eq_pos+1)+"'/>");
474 }
475 }
476 result.append(link_content);
477 result.append("</link>");
478
479 return result.toString();
480
481 }
482
483 // iVia craps out if we ask for a metadata which is not valid. So need
484 // to make sure we only ask for acceptable fields.
485 protected boolean isAcceptableMetadata(String meta) {
486 String valid_metadata = ",title,url,ivia_description,keywords,subjects,";
487 if (valid_metadata.indexOf(","+meta+",")!=-1) {
488 return true;
489 }
490 return false;
491 }
492 protected BufferedReader makeConnection(String url_string) {
493 BufferedReader reader = null;
494 try {
495 URL url = new URL(url_string);
496 HttpURLConnection connection = (HttpURLConnection)url.openConnection();
497 InputStream input = connection.getInputStream();
498 reader = new BufferedReader(new InputStreamReader(input));
499 } catch (java.net.MalformedURLException e) {
500
501 logger.error("Malformed URL: "+url_string);
502 } catch (java.io.IOException e) {
503 logger.error("An error occurred during IO to url "+url_string);
504 }
505 return reader;
506 }
507
508}
Note: See TracBrowser for help on using the repository browser.