source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/service/IViaRetrieve.java@ 32453

Last change on this file since 32453 was 32452, checked in by kjdon, 6 years ago

removed author email

  • Property svn:keywords set to Author Date Id Revision
File size: 7.7 KB
Line 
1package org.greenstone.gsdl3.service;
2
3// Greenstone classes
4import org.greenstone.gsdl3.core.GSException;
5import org.greenstone.util.Misc;
6import org.greenstone.gsdl3.util.*;
7
8// XML classes
9import org.w3c.dom.Element;
10import org.w3c.dom.Document;
11import org.w3c.dom.NodeList;
12
13import java.util.HashMap;
14import java.util.ArrayList;
15import java.io.File;
16import java.io.InputStream;
17import java.io.BufferedReader;
18import java.io.InputStreamReader;
19import java.io.IOException;
20import java.net.HttpURLConnection;
21import java.net.URLConnection;
22import java.net.URL;
23import java.net.Authenticator;
24import java.net.MalformedURLException;
25
26import org.apache.log4j.*;
27
28public class IViaRetrieve
29 extends AbstractDocumentRetrieve {
30
31 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.service.IViaRetrieve.class.getName());
32
33
34 protected String ivia_server_url = null;
35
36 public IViaRetrieve() {
37 does_structure = false;
38 }
39
40 //Configure IViaRetrieve Service
41 public boolean configure(Element info, Element extra_info)
42 {
43 if (!super.configure(info, extra_info)) {
44 return false;
45 }
46
47 Element server_elem = (Element)GSXML.getChildByTagName(info, "iViaServer");
48 if (server_elem == null) {
49 logger.error("no iViaServer element found");
50 return false;
51 }
52 ivia_server_url = server_elem.getAttribute("url");
53 if (ivia_server_url.equals("")) {
54 logger.error("no url for the iViaServer element");
55 return false;
56 }
57 return true;
58
59 }
60
61 /** gets a document by sending a request to iVia, then processes it and creates a documentNode around the text */
62 protected Element getNodeContent(Document doc, String doc_id, String lang)
63 throws GSException {
64
65 String url_string = ivia_server_url+"/cgi-bin/view_record?theme=gsdl3&record_id="+doc_id;
66
67 StringBuffer buffer = new StringBuffer();
68 try {
69 BufferedReader reader = Misc.makeHttpConnection(url_string);
70 String line;
71 while((line = reader.readLine())!= null) {
72 buffer.append(line);
73 }
74 } catch (java.net.MalformedURLException e) {
75 throw new GSException("Malformed URL: "+url_string, GSXML.ERROR_TYPE_SYSTEM);
76 } catch (java.io.IOException e) {
77 throw new GSException("IOException during connection to "+url_string+": "+e.toString(),GSXML.ERROR_TYPE_SYSTEM);
78 }
79
80 String node_content = buffer.toString();
81
82 String escaped_content = GSXML.xmlSafe(node_content);
83
84 StringBuffer processed_content = new StringBuffer(escaped_content.length());
85 processed_content.append("<nodeContent>");
86 int pos = 0;
87 int lastpos = 0;
88 while ((pos = escaped_content.indexOf("&lt;a ", lastpos))!= -1) {
89 processed_content.append(escaped_content.substring(lastpos, pos));
90 int endpos = escaped_content.indexOf("&lt;/a&gt;", pos);
91 if (endpos == -1) {
92 break;
93 }
94 String link = escaped_content.substring(pos, endpos+10);
95 link = convertLink(link);
96 processed_content.append(link);
97 lastpos = endpos+10;
98 }
99 processed_content.append(escaped_content.substring(lastpos)); // get the last bit
100 processed_content.append("</nodeContent>");
101
102 Document content_doc = this.converter.getDOM(processed_content.toString());
103 if (content_doc == null) {
104 logger.error("Couldn't parse node content");
105 logger.error(processed_content.toString());
106 return null;
107 }
108
109 Element content_element = content_doc.getDocumentElement();
110
111 return (Element)doc.importNode(content_element,true);
112 }
113
114 /** converts a url from an <a> element into a greenstone suitable one */
115 protected String convertLink(String aref) {
116 if (aref.indexOf("href=&quot;http") != -1) {
117 return aref; // an external link
118 }
119 String type = "other";
120 if (aref.indexOf("/cgi-bin/canned_search")!=-1) {
121 type="query";
122 } else if (aref.indexOf("/cgi-bin/click_through") != -1) {
123 type = "external";
124 } else if (aref.indexOf("/cgi-bin/view_record") != -1) {
125 type="document";
126 }
127
128 int href_start = aref.indexOf("href=&quot;")+11;
129 int href_end = aref.indexOf("&gt;", href_start);
130 String href = aref.substring(href_start, href_end);
131 String link_content = aref.substring(href_end+4, aref.length()-10);
132
133 if (type.equals("external")) {
134 // the external link is everything after the http at the end.
135 String address = href.substring(href.lastIndexOf("http"));
136 address = address.replaceAll("%3[aA]", ":");
137 address = address.replaceAll("%2[fF]", "/");
138
139 return "&lt;a href=\""+address+"\"&gt;"+link_content+"&lt;/a&gt;";
140 }
141 if (type.equals("other")) {
142 return "other type of link ("+link_content+")";
143 }
144 StringBuffer result = new StringBuffer();
145 result.append("<link type='");
146 result.append(type);
147 result.append("'");
148 if (type.equals("query")) {
149 result.append(" service='TextQuery'");
150 }
151 result.append(">");
152 // add in the parameters
153 href = href.substring(href.indexOf("?")+1);
154 String [] params = href.split("&amp;");
155 for (int i=0; i<params.length; i++) {
156 String param = params[i];
157 int eq_pos = param.indexOf("=");
158 if (eq_pos != -1) {
159
160 result.append("<param name='"+param.substring(0, eq_pos)+"' value='"+param.substring(eq_pos+1)+"'/>");
161 }
162 }
163 result.append(link_content);
164 result.append("</link>");
165
166 return result.toString();
167 }
168
169 // iVia craps out if we ask for a metadata which is not valid. So need
170 // to make sure we only ask for acceptable fields.
171 protected boolean isAcceptableMetadata(String meta) {
172 String valid_metadata = ",title,url,ivia_description,keywords,subjects,";
173 if (valid_metadata.indexOf(","+meta+",")!=-1) {
174 return true;
175 }
176 return false;
177 }
178
179 protected String translateId(String oid){
180 int p = oid.lastIndexOf('.');
181 if (p != oid.length()-3) {
182 logger.info("translateoid error: '.' is not the third to last char!!");
183 return oid;
184 }
185 String top = oid.substring(0, p);
186 return top;
187 }
188
189 protected String translateExternalId(String id){
190 return id;
191 }
192
193 protected String getDocType(String node_id){
194 return GSXML.DOC_TYPE_SIMPLE;
195 }
196 protected String getRootId(String node_id){
197 return node_id;
198 }
199
200 protected ArrayList<String> getChildrenIds(String node_id){
201 return null;
202 }
203
204 protected String getParentId(String node_id){
205 return null;
206 }
207
208 protected Element getMetadataList (Document doc, String doc_id,
209 boolean all_metadata,
210 ArrayList<String> metadata_names,
211 String lang)
212 throws GSException {
213
214 Element meta_list = doc.createElement(GSXML.METADATA_ELEM+GSXML.LIST_MODIFIER);
215
216 // do the query to the iVia server
217 StringBuffer field_list= new StringBuffer();
218 boolean metadata_found = false;
219
220 for (int i=0; i<metadata_names.size();i++){
221 if (isAcceptableMetadata(metadata_names.get(i))){
222 metadata_found = true;
223 field_list.append(metadata_names.get(i));
224 field_list.append(",");
225 }
226 }
227 if (!metadata_found){
228 return meta_list;
229 }
230
231 String url_string = ivia_server_url+"/cgi-bin/view_record_set?theme=gsdl3&record_id_list="+doc_id+"&field_list="+field_list.toString();
232 try {
233 BufferedReader reader = Misc.makeHttpConnection(url_string);
234 String line;
235 while ((line = reader.readLine()) != null) {
236 //metadata entry
237 int col_pos = line.indexOf(':');
238 if (col_pos == -1) {
239 // end of the metadata for this doc
240 break;
241 }
242 String name = line.substring(0,col_pos);
243 String value = line.substring(col_pos+2); // includes a space
244 GSXML.addMetadata(meta_list, name, value);
245 }
246 } catch (java.net.MalformedURLException e) {
247 throw new GSException("Malformed URL: "+url_string, GSXML.ERROR_TYPE_SYSTEM);
248 } catch (java.io.IOException e) {
249 throw new GSException("IOException: "+e.toString(), GSXML.ERROR_TYPE_SYSTEM);
250 }
251 return meta_list;
252 }
253
254 protected String getStructureInfo(String doc_id, String info_type){
255 return "";
256 }
257}
Note: See TracBrowser for help on using the repository browser.