source: trunk/gsdl3/src/java/org/greenstone/gsdl3/service/IViaRetrieve.java@ 13576

Last change on this file since 13576 was 13572, checked in by kjdon, 17 years ago

getNodeContent() now takes a lang param

  • Property svn:keywords set to Author Date Id Revision
File size: 7.8 KB
Line 
1package org.greenstone.gsdl3.service;
2
3// Greenstone classes
4import org.greenstone.gsdl3.core.GSException;
5import org.greenstone.gsdl3.util.*;
6
7// XML classes
8import org.w3c.dom.Element;
9import org.w3c.dom.Document;
10import org.w3c.dom.NodeList;
11
12import java.util.HashMap;
13import java.util.ArrayList;
14import java.io.File;
15import java.io.InputStream;
16import java.io.BufferedReader;
17import java.io.InputStreamReader;
18import java.io.IOException;
19import java.net.HttpURLConnection;
20import java.net.URLConnection;
21import java.net.URL;
22import java.net.Authenticator;
23import java.net.MalformedURLException;
24
25import org.apache.log4j.*;
26
27/**
28 *
29 * @author <a href="mailto:[email protected]">Katherine Don</a>
30 * @version $Revision: 13572 $
31 * Modified by <a href="mailto:[email protected]">Chi-Yu Huang</a>
32 */
33
34public class IViaRetrieve
35 extends AbstractDocumentRetrieve {
36
37 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.service.IViaRetrieve.class.getName());
38
39
40 protected String ivia_server_url = null;
41
42 public IViaRetrieve() {
43 does_structure = false;
44 }
45
46 //Configure IViaRetrieve Service
47 public boolean configure(Element info, Element extra_info)
48 {
49 if (!super.configure(info, extra_info)) {
50 return false;
51 }
52
53 Element server_elem = (Element)GSXML.getChildByTagName(info, "iViaServer");
54 if (server_elem == null) {
55 logger.error("no iViaServer element found");
56 return false;
57 }
58 ivia_server_url = server_elem.getAttribute("url");
59 if (ivia_server_url.equals("")) {
60 logger.error("no url for the iViaServer element");
61 return false;
62 }
63 return true;
64
65 }
66
67 /** gets a document by sending a request to iVia, then processes it and creates a documentNode around the text */
68 protected Element getNodeContent(String doc_id, String lang)
69 throws GSException {
70
71 String url_string = ivia_server_url+"/cgi-bin/view_record?theme=gsdl3&record_id="+doc_id;
72
73 StringBuffer buffer = new StringBuffer();
74 try {
75 BufferedReader reader = Misc.makeHttpConnection(url_string);
76 String line;
77 while((line = reader.readLine())!= null) {
78 buffer.append(line);
79 }
80 } catch (java.net.MalformedURLException e) {
81 throw new GSException("Malformed URL: "+url_string, GSXML.ERROR_TYPE_SYSTEM);
82 } catch (java.io.IOException e) {
83 throw new GSException("IOException during connection to "+url_string+": "+e.toString(),GSXML.ERROR_TYPE_SYSTEM);
84 }
85
86 String node_content = buffer.toString();
87
88 String escaped_content = GSXML.xmlSafe(node_content);
89
90 StringBuffer processed_content = new StringBuffer(escaped_content.length());
91 processed_content.append("<nodeContent>");
92 int pos = 0;
93 int lastpos = 0;
94 while ((pos = escaped_content.indexOf("&lt;a ", lastpos))!= -1) {
95 processed_content.append(escaped_content.substring(lastpos, pos));
96 int endpos = escaped_content.indexOf("&lt;/a&gt;", pos);
97 if (endpos == -1) {
98 break;
99 }
100 String link = escaped_content.substring(pos, endpos+10);
101 link = convertLink(link);
102 processed_content.append(link);
103 lastpos = endpos+10;
104 }
105 processed_content.append(escaped_content.substring(lastpos)); // get the last bit
106 processed_content.append("</nodeContent>");
107
108 Document content_doc = this.converter.getDOM(processed_content.toString());
109 if (content_doc == null) {
110 logger.error("Couldn't parse node content");
111 logger.error(processed_content.toString());
112 return null;
113 }
114
115 Element content_element = content_doc.getDocumentElement();
116
117 return (Element)this.doc.importNode(content_element,true);
118 }
119
120 /** converts a url from an <a> element into a greenstone suitable one */
121 protected String convertLink(String aref) {
122 if (aref.indexOf("href=&quot;http") != -1) {
123 return aref; // an external link
124 }
125 String type = "other";
126 if (aref.indexOf("/cgi-bin/canned_search")!=-1) {
127 type="query";
128 } else if (aref.indexOf("/cgi-bin/click_through") != -1) {
129 type = "external";
130 } else if (aref.indexOf("/cgi-bin/view_record") != -1) {
131 type="document";
132 }
133
134 int href_start = aref.indexOf("href=&quot;")+11;
135 int href_end = aref.indexOf("&gt;", href_start);
136 String href = aref.substring(href_start, href_end);
137 String link_content = aref.substring(href_end+4, aref.length()-10);
138
139 if (type.equals("external")) {
140 // the external link is everything after the http at the end.
141 String address = href.substring(href.lastIndexOf("http"));
142 address = address.replaceAll("%3[aA]", ":");
143 address = address.replaceAll("%2[fF]", "/");
144
145 return "&lt;a href=\""+address+"\"&gt;"+link_content+"&lt;/a&gt;";
146 }
147 if (type.equals("other")) {
148 return "other type of link ("+link_content+")";
149 }
150 StringBuffer result = new StringBuffer();
151 result.append("<link type='");
152 result.append(type);
153 result.append("'");
154 if (type.equals("query")) {
155 result.append(" service='TextQuery'");
156 }
157 result.append(">");
158 // add in the parameters
159 href = href.substring(href.indexOf("?")+1);
160 String [] params = href.split("&amp;");
161 for (int i=0; i<params.length; i++) {
162 String param = params[i];
163 int eq_pos = param.indexOf("=");
164 if (eq_pos != -1) {
165
166 result.append("<param name='"+param.substring(0, eq_pos)+"' value='"+param.substring(eq_pos+1)+"'/>");
167 }
168 }
169 result.append(link_content);
170 result.append("</link>");
171
172 return result.toString();
173 }
174
175 // iVia craps out if we ask for a metadata which is not valid. So need
176 // to make sure we only ask for acceptable fields.
177 protected boolean isAcceptableMetadata(String meta) {
178 String valid_metadata = ",title,url,ivia_description,keywords,subjects,";
179 if (valid_metadata.indexOf(","+meta+",")!=-1) {
180 return true;
181 }
182 return false;
183 }
184
185 protected String translateId(String oid){
186 int p = oid.lastIndexOf('.');
187 if (p != oid.length()-3) {
188 logger.info("translateoid error: '.' is not the third to last char!!");
189 return oid;
190 }
191 String top = oid.substring(0, p);
192 return top;
193 }
194
195 protected String translateExternalId(String id){
196 return id;
197 }
198
199 protected String getDocType(String node_id){
200 return GSXML.DOC_TYPE_SIMPLE;
201 }
202 protected String getRootId(String node_id){
203 return node_id;
204 }
205
206 protected ArrayList getChildrenIds(String node_id){
207 return null;
208 }
209
210 protected String getParentId(String node_id){
211 return null;
212 }
213
214 protected Element getMetadataList (String doc_id,
215 boolean all_metadata,
216 ArrayList metadata_names)
217 throws GSException {
218
219 Element meta_list = this.doc.createElement(GSXML.METADATA_ELEM+GSXML.LIST_MODIFIER);
220
221 // do the query to the iVia server
222 StringBuffer field_list= new StringBuffer();
223 boolean metadata_found = false;
224
225 for (int i=0; i<metadata_names.size();i++){
226 if (isAcceptableMetadata((String)metadata_names.get(i))){
227 metadata_found = true;
228 field_list.append((String)metadata_names.get(i));
229 field_list.append(",");
230 }
231 }
232 if (!metadata_found){
233 return meta_list;
234 }
235
236 String url_string = ivia_server_url+"/cgi-bin/view_record_set?theme=gsdl3&record_id_list="+doc_id+"&field_list="+field_list.toString();
237 try {
238 BufferedReader reader = Misc.makeHttpConnection(url_string);
239 String line;
240 while ((line = reader.readLine()) != null) {
241 //metadata entry
242 int col_pos = line.indexOf(':');
243 if (col_pos == -1) {
244 // end of the metadata for this doc
245 break;
246 }
247 String name = line.substring(0,col_pos);
248 String value = line.substring(col_pos+2); // includes a space
249 GSXML.addMetadata(this.doc, meta_list, name, value);
250 }
251 } catch (java.net.MalformedURLException e) {
252 throw new GSException("Malformed URL: "+url_string, GSXML.ERROR_TYPE_SYSTEM);
253 } catch (java.io.IOException e) {
254 throw new GSException("IOException: "+e.toString(), GSXML.ERROR_TYPE_SYSTEM);
255 }
256 return meta_list;
257 }
258
259 protected String getStructureInfo(String doc_id, String info_type){
260 return "";
261 }
262}
Note: See TracBrowser for help on using the repository browser.