source: greenstone3/trunk/src/java/org/greenstone/gsdl3/service/IViaProxy.java@ 14125

Last change on this file since 14125 was 13270, checked in by shaoqun, 18 years ago

replace Category class which is deprecated with Logger class

  • Property svn:keywords set to Author Date Id Revision
File size: 18.1 KB
Line 
1package org.greenstone.gsdl3.service;
2
3// Greenstone classes
4import org.greenstone.gsdl3.util.*;
5
6// XML classes
7import org.w3c.dom.Element;
8import org.w3c.dom.Document;
9import org.w3c.dom.NodeList;
10
11import java.util.HashMap;
12import java.io.File;
13import java.io.InputStream;
14import java.io.BufferedReader;
15import java.io.InputStreamReader;
16import java.io.IOException;
17import java.net.HttpURLConnection;
18import java.net.URLConnection;
19import java.net.URL;
20import java.net.Authenticator;
21import java.net.MalformedURLException;
22
23import org.apache.log4j.*;
24
25/**
26 *
27 * @author <a href="mailto:[email protected]">Katherine Don</a>
28 * @version $Revision: 13270 $
29 */
30
31public class IViaProxy
32 extends ServiceRack {
33
34 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.service.IViaProxy.class.getName());
35
36 // the services on offer
37 // these strings must match what is found in the properties file
38 protected static final String TEXT_QUERY_SERVICE = "TextQuery";
39 protected static final String DOC_CONTENT_SERVICE = "DocumentContentRetrieve";
40 protected static final String DOC_META_SERVICE = "DocumentMetadataRetrieve";
41 protected static final String QUERY_PARAM = "query";
42 protected static final String FIELD_PARAM = "fields";
43 // have standard gs param names for hits per page, and start page
44 // these need to be mapped to iVia params
45 protected static final String GS_HITS_PARAM = "hitsPerPage";
46 protected static final String IM_HITS_PARAM = "no_of_records_per_page";
47 protected static final String GS_START_PAGE_PARAM = "startPage";
48 protected static final String IM_START_PAGE_PARAM = "start_page_no";
49
50 protected String ivia_server_url = null;
51
52 public boolean configure(Element info, Element extra_info) {
53
54 if (!super.configure(info, extra_info)){
55 return false;
56 }
57
58 Element server_elem = (Element)GSXML.getChildByTagName(info, "iViaServer");
59 if (server_elem == null) {
60 logger.error("no iViaServer element found");
61 return false;
62 }
63 ivia_server_url = server_elem.getAttribute("url");
64 if (ivia_server_url.equals("")) {
65 logger.error("no url for the iViaServer element");
66 return false;
67 }
68 Element tq_service = this.doc.createElement(GSXML.SERVICE_ELEM);
69 tq_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_QUERY);
70 tq_service.setAttribute(GSXML.NAME_ATT, TEXT_QUERY_SERVICE);
71 this.short_service_info.appendChild(tq_service);
72
73 Element dc_service = this.doc.createElement(GSXML.SERVICE_ELEM);
74 dc_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
75 dc_service.setAttribute(GSXML.NAME_ATT, DOC_CONTENT_SERVICE);
76 this.short_service_info.appendChild(dc_service);
77
78 Element dm_service = this.doc.createElement(GSXML.SERVICE_ELEM);
79 dm_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
80 dm_service.setAttribute(GSXML.NAME_ATT, DOC_META_SERVICE);
81 this.short_service_info.appendChild(dm_service);
82
83 //
84 // add some format info to service map if there is any
85 String path = GSPath.appendLink(GSXML.SEARCH_ELEM, GSXML.FORMAT_ELEM);
86 Element format = (Element) GSXML.getNodeByPath(extra_info, path);
87 if (format != null) {
88 this.format_info_map.put(TEXT_QUERY_SERVICE, this.doc.importNode(format, true));
89 }
90
91
92 // look for document display format
93 path = GSPath.appendLink(GSXML.DISPLAY_ELEM, GSXML.FORMAT_ELEM);
94 Element display_format = (Element)GSXML.getNodeByPath(extra_info, path);
95 if (display_format != null) {
96 this.format_info_map.put(DOC_CONTENT_SERVICE, this.doc.importNode(display_format, true));
97 // shoudl we make a copy?
98 }
99
100 return true;
101
102 }
103
104 protected Element getServiceDescription(String service, String lang, String subset) {
105
106 if (service.equals(TEXT_QUERY_SERVICE)) {
107 Element tq_service = this.doc.createElement(GSXML.SERVICE_ELEM);
108 tq_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_QUERY);
109 tq_service.setAttribute(GSXML.NAME_ATT, TEXT_QUERY_SERVICE);
110 if (subset == null || subset.equals(GSXML.DISPLAY_TEXT_ELEM + GSXML.LIST_MODIFIER)) {
111 tq_service.appendChild(GSXML.createDisplayTextElement(this.doc, GSXML.DISPLAY_TEXT_NAME, getTextString(TEXT_QUERY_SERVICE+".name", lang)));
112 tq_service.appendChild(GSXML.createDisplayTextElement(this.doc, GSXML.DISPLAY_TEXT_SUBMIT, getTextString(TEXT_QUERY_SERVICE+".submit", lang)));
113 tq_service.appendChild(GSXML.createDisplayTextElement(this.doc, GSXML.DISPLAY_TEXT_DESCRIPTION, getTextString(TEXT_QUERY_SERVICE+".description", lang)));
114 }
115 if (subset == null || subset.equals(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER)) {
116 Element param_list = this.doc.createElement(GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
117 tq_service.appendChild(param_list);
118 Element param = GSXML.createParameterDescription(this.doc, QUERY_PARAM, getTextString("param."+QUERY_PARAM, lang), GSXML.PARAM_TYPE_STRING, null, null, null);
119 param_list.appendChild(param);
120 String [] field_ids = {"kw", "au", "su", "ti", "de", "fu"};
121 String [] field_names = {
122 getTextString("param."+FIELD_PARAM+".kw", lang),
123 getTextString("param."+FIELD_PARAM+".au", lang),
124 getTextString("param."+FIELD_PARAM+".su", lang),
125 getTextString("param."+FIELD_PARAM+".ti", lang),
126 getTextString("param."+FIELD_PARAM+".de", lang),
127 getTextString("param."+FIELD_PARAM+".fu", lang) };
128
129 param = GSXML.createParameterDescription(this.doc, FIELD_PARAM, getTextString("param."+FIELD_PARAM, lang), GSXML.PARAM_TYPE_ENUM_MULTI, "kw,au,su,ti,de,fu", field_ids, field_names);
130 param_list.appendChild(param);
131
132
133 String [] hits_options = {"10", "30", "50"};
134 param = GSXML.createParameterDescription(this.doc, GS_HITS_PARAM, getTextString("param."+GS_HITS_PARAM, lang), GSXML.PARAM_TYPE_ENUM_SINGLE, "10", hits_options, hits_options);
135 param_list.appendChild(param);
136
137 param = GSXML.createParameterDescription(this.doc, GS_START_PAGE_PARAM, "", GSXML.PARAM_TYPE_INVISIBLE, "1", null, null);
138 param_list.appendChild(param);
139 }
140 return tq_service;
141 }
142 if (service.equals(DOC_META_SERVICE)) {
143 Element dm_service = this.doc.createElement(GSXML.SERVICE_ELEM);
144 dm_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
145 dm_service.setAttribute(GSXML.NAME_ATT, DOC_META_SERVICE);
146 return dm_service;
147
148 }
149 if (service.equals(DOC_CONTENT_SERVICE)) {
150 Element dc_service = this.doc.createElement(GSXML.SERVICE_ELEM);
151 dc_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
152 dc_service.setAttribute(GSXML.NAME_ATT, DOC_CONTENT_SERVICE);
153 return dc_service;
154
155
156 }
157 return null;
158 }
159
160 /** Process a text query - implemented by concrete subclasses */
161 protected Element processTextQuery(Element request) {
162
163 // Create a new (empty) result message
164 Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
165 result.setAttribute(GSXML.FROM_ATT, TEXT_QUERY_SERVICE);
166 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
167 Element doc_node_list = this.doc.createElement(GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
168 result.appendChild(doc_node_list);
169
170
171 Element param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
172 if (param_list == null) {
173 logger.error("TextQuery request had no paramList.");
174 return result; // Return the empty result
175 }
176
177 // Process the request parameters
178 HashMap params = GSXML.extractParams(param_list, false);
179
180 // Make sure a query has been specified
181 String query = (String) params.get(QUERY_PARAM);
182 if (query == null || query.equals("")) {
183 return result; // Return the empty result
184 }
185 // tidy whitespace
186 query = query.replaceAll("\\s+", "+");
187 String url_string = ivia_server_url+"/cgi-bin/canned_search?theme=gsdl3&query="+query;
188
189 // check for fields
190 String fields = (String) params.get(FIELD_PARAM);
191 if (fields != null && !fields.equals("")) {
192 url_string += "&fields="+fields;
193 }
194
195 //check for hits per page
196 String hits_per_page = (String) params.get(GS_HITS_PARAM);
197 if (hits_per_page != null && !hits_per_page.equals("")) {
198 url_string += "&"+IM_HITS_PARAM+"="+hits_per_page;
199 }
200
201 // check for start page
202 String start_page = (String) params.get(GS_START_PAGE_PARAM);
203 if (start_page != null && !start_page.equals("")) {
204 url_string += "&"+IM_START_PAGE_PARAM+"="+start_page;
205 }
206 String results_num = null;
207 String doc_ids = null;
208 try {
209 logger.debug("IViaProxy, sending "+url_string);
210 BufferedReader reader = makeConnection(url_string);
211 results_num = reader.readLine();
212 doc_ids = reader.readLine();
213
214 } catch (Exception e) {
215 logger.error("exception happened during query");
216 e.printStackTrace();
217 return result;
218 }
219
220 if (results_num.startsWith("Resources: ")) {
221 results_num = results_num.substring(11);
222 } else {
223 logger.error("badly formatted results line: "+results_num);
224 return result;
225 }
226 if (doc_ids.startsWith("Ids: ")) {
227 doc_ids = doc_ids.substring(5).trim();
228 } else {
229 logger.error("badly formatted docs line: "+doc_ids);
230 return result;
231 }
232
233 // get the num docs and add to a metadata list
234 Element metadata_list = this.doc.createElement(GSXML.METADATA_ELEM+GSXML.LIST_MODIFIER);
235 result.appendChild(metadata_list);
236
237 // Add a metadata element specifying the number of matching documents
238 long numdocs = Long.parseLong(results_num);
239 GSXML.addMetadata(this.doc, metadata_list, "numDocsMatched", ""+numdocs);
240 String [] ids = doc_ids.split(" ");
241
242 for (int d=0; d<ids.length; d++) {
243 Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
244 doc_node.setAttribute(GSXML.NODE_ID_ATT, ids[d]);
245 doc_node_list.appendChild(doc_node);
246 }
247 logger.debug("IViaProxy result:");
248 logger.debug(this.converter.getString(result));
249 return result;
250
251 }
252
253 protected Element processDocumentMetadataRetrieve(Element request) {
254 Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
255 result.setAttribute(GSXML.FROM_ATT, DOC_META_SERVICE);
256 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
257
258 // Get the parameters of the request
259 Element param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
260 if (param_list == null) {
261 logger.error("missing paramList.\n");
262 return result; // Return the empty result
263 }
264
265 // The metadata information required
266 StringBuffer field_list = new StringBuffer();
267 Element param = (Element) param_list.getFirstChild();
268 while (param != null) {
269 // Identify the metadata information desired
270 if (param.getAttribute(GSXML.NAME_ATT).equals("metadata")) {
271 String metadata = GSXML.getValue(param);
272 if (isAcceptableMetadata(metadata)) {
273 field_list.append(metadata);
274 field_list.append(",");
275 }
276 }
277 param = (Element) param.getNextSibling();
278 }
279
280 if (field_list.length()==0) {
281 logger.error("no metadata specified.\n");
282 return result;
283 }
284
285 // Get the documents
286 Element request_node_list = (Element) GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
287 if (request_node_list == null) {
288 logger.error("DocumentMetadataRetrieve request had no "+GSXML.DOC_NODE_ELEM+"List.\n");
289 return result;
290 }
291
292 StringBuffer record_id_list = new StringBuffer();
293
294 NodeList request_nodes = request_node_list.getChildNodes();
295 for (int i = 0; i < request_nodes.getLength(); i++) {
296 Element request_node = (Element) request_nodes.item(i);
297 String node_id = request_node.getAttribute(GSXML.NODE_ID_ATT);
298 record_id_list.append(node_id);
299 record_id_list.append(",");
300 }
301
302 // do the query to the iVia server
303 String url_string = ivia_server_url+"/cgi-bin/view_record_set?theme=gsdl3&record_id_list="+record_id_list.toString()+"&field_list="+field_list.toString();
304
305 Element node_list = this.doc.createElement(GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
306 result.appendChild(node_list);
307 try {
308 BufferedReader reader = makeConnection(url_string);
309 String line;
310 while ((line = reader.readLine()) != null) {
311 if (!line.startsWith("Record:")) {
312 continue;
313 }
314 // the first line is the record
315 line=line.substring(8);
316 Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
317 doc_node.setAttribute(GSXML.NODE_ID_ATT, line);
318 Element meta_list = this.doc.createElement(GSXML.METADATA_ELEM+GSXML.LIST_MODIFIER);
319 doc_node.appendChild(meta_list);
320 while ((line = reader.readLine()) != null) {
321 //metadata entry
322 int col_pos = line.indexOf(':');
323 if (col_pos == -1) {
324 // end of the metadata for this doc
325 break;
326 }
327 String name = line.substring(0,col_pos);
328 String value = line.substring(col_pos+2); // includes a space
329 GSXML.addMetadata(this.doc, meta_list, name, value);
330 }
331 node_list.appendChild(doc_node);
332
333 }
334 } catch (Exception e) {
335 logger.error("exception happened");
336 e.printStackTrace();
337 }
338 logger.debug("IViaProxy: returning result: ");
339 logger.debug(this.converter.getPrettyString(result));
340 return result;
341
342 }
343
344 protected Element processDocumentContentRetrieve(Element request) {
345 Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
346 result.setAttribute(GSXML.FROM_ATT, DOC_CONTENT_SERVICE);
347 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
348
349 // Get the request doc_list
350 Element query_doc_list = (Element) GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
351 if (query_doc_list == null) {
352 logger.error("DocumentContentRetrieve request specified no doc nodes.\n");
353 return result;
354 }
355
356 Element doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
357 result.appendChild(doc_list);
358
359 // Get the documents
360 String[] doc_ids = GSXML.getAttributeValuesFromList(query_doc_list,
361 GSXML.NODE_ID_ATT);
362 for (int i = 0; i < doc_ids.length; i++) {
363 String doc_id = doc_ids[i];
364 Element doc_node = getDocument(doc_id);
365 doc_list.appendChild(doc_node);
366 }
367 return result;
368
369 }
370
371
372 /** gets a document by sending a request to iVia, then processes it and creates a documentNode around the text */
373 protected Element getDocument(String doc_id) {
374
375 String url_string = ivia_server_url+"/cgi-bin/view_record?theme=gsdl3&record_id="+doc_id;
376 StringBuffer buffer = new StringBuffer();
377 try {
378 BufferedReader reader = makeConnection(url_string);
379
380 String line;
381 while((line = reader.readLine())!= null) {
382 buffer.append(line);
383 }
384
385
386 } catch (Exception e) {
387 logger.error("exception happened");
388 e.printStackTrace();
389 }
390
391 String node_content = buffer.toString();
392 String escaped_content = GSXML.xmlSafe(node_content);
393
394 StringBuffer processed_content = new StringBuffer(escaped_content.length());
395 processed_content.append("<nodeContent>");
396 int pos = 0;
397 int lastpos = 0;
398 while ((pos = escaped_content.indexOf("&lt;a ", lastpos))!= -1) {
399 processed_content.append(escaped_content.substring(lastpos, pos));
400 int endpos = escaped_content.indexOf("&lt;/a&gt;", pos);
401 if (endpos == -1) {
402 break;
403 }
404 String link = escaped_content.substring(pos, endpos+10);
405 link = convertLink(link);
406 processed_content.append(link);
407 lastpos = endpos+10;
408 }
409 processed_content.append(escaped_content.substring(lastpos)); // get the last bit
410 processed_content.append("</nodeContent>");
411
412 Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
413 doc_node.setAttribute(GSXML.NODE_ID_ATT, doc_id);
414
415 Document content_doc = this.converter.getDOM(processed_content.toString());
416 if (content_doc != null) {
417 Element content_element = content_doc.getDocumentElement();
418 doc_node.appendChild(this.doc.importNode(content_element, true));
419 } else {
420 logger.error("Couldn't parse the node content");
421 }
422 return doc_node;
423
424 }
425
426 /** converts a url from an <a> element into a greenstone suitable one */
427 protected String convertLink(String aref) {
428
429 if (aref.indexOf("href=&quot;http") != -1) {
430 return aref; // an external link
431 }
432 String type = "other";
433 if (aref.indexOf("/cgi-bin/canned_search")!=-1) {
434 type="query";
435 } else if (aref.indexOf("/cgi-bin/click_through") != -1) {
436 type = "external";
437 } else if (aref.indexOf("/cgi-bin/view_record") != -1) {
438 type="document";
439 }
440
441 int href_start = aref.indexOf("href=&quot;")+11;
442 int href_end = aref.indexOf("&gt;", href_start);
443 String href = aref.substring(href_start, href_end);
444 String link_content = aref.substring(href_end+4, aref.length()-10);
445
446 if (type.equals("external")) {
447 // the external link is everything after the http at the end.
448 String address = href.substring(href.lastIndexOf("http"));
449 address = address.replaceAll("%3[aA]", ":");
450 address = address.replaceAll("%2[fF]", "/");
451
452 return "&lt;a href=\""+address+"\"&gt;"+link_content+"&lt;/a&gt;";
453 }
454 if (type.equals("other")) {
455 return "other type of link ("+link_content+")";
456 }
457 StringBuffer result = new StringBuffer();
458 result.append("<link type='");
459 result.append(type);
460 result.append("'");
461 if (type.equals("query")) {
462 result.append(" service='TextQuery'");
463 }
464 result.append(">");
465 // add in the parameters
466 href = href.substring(href.indexOf("?")+1);
467 String [] params = href.split("&amp;");
468 for (int i=0; i<params.length; i++) {
469 String param = params[i];
470 int eq_pos = param.indexOf("=");
471 if (eq_pos != -1) {
472
473 result.append("<param name='"+param.substring(0, eq_pos)+"' value='"+param.substring(eq_pos+1)+"'/>");
474 }
475 }
476 result.append(link_content);
477 result.append("</link>");
478
479 return result.toString();
480
481 }
482
483 // iVia craps out if we ask for a metadata which is not valid. So need
484 // to make sure we only ask for acceptable fields.
485 protected boolean isAcceptableMetadata(String meta) {
486 String valid_metadata = ",title,url,ivia_description,keywords,subjects,";
487 if (valid_metadata.indexOf(","+meta+",")!=-1) {
488 return true;
489 }
490 return false;
491 }
492 protected BufferedReader makeConnection(String url_string) {
493 BufferedReader reader = null;
494 try {
495 URL url = new URL(url_string);
496 HttpURLConnection connection = (HttpURLConnection)url.openConnection();
497 InputStream input = connection.getInputStream();
498 reader = new BufferedReader(new InputStreamReader(input));
499 } catch (java.net.MalformedURLException e) {
500
501 logger.error("Malformed URL: "+url_string);
502 } catch (java.io.IOException e) {
503 logger.error("An error occurred during IO to url "+url_string);
504 }
505 return reader;
506 }
507
508}
Note: See TracBrowser for help on using the repository browser.