source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/service/IViaRetrieve.java@ 26198

Last change on this file since 26198 was 25727, checked in by kjdon, 12 years ago

getting rid of my email address

  • Property svn:keywords set to Author Date Id Revision
File size: 7.8 KB
Line 
1package org.greenstone.gsdl3.service;
2
3// Greenstone classes
4import org.greenstone.gsdl3.core.GSException;
5import org.greenstone.util.Misc;
6import org.greenstone.gsdl3.util.*;
7
8// XML classes
9import org.w3c.dom.Element;
10import org.w3c.dom.Document;
11import org.w3c.dom.NodeList;
12
13import java.util.HashMap;
14import java.util.ArrayList;
15import java.io.File;
16import java.io.InputStream;
17import java.io.BufferedReader;
18import java.io.InputStreamReader;
19import java.io.IOException;
20import java.net.HttpURLConnection;
21import java.net.URLConnection;
22import java.net.URL;
23import java.net.Authenticator;
24import java.net.MalformedURLException;
25
26import org.apache.log4j.*;
27
28/**
29 *
30 * @author Katherine Don
31 * @version $Revision: 25727 $
32 * Modified by <a href="mailto:[email protected]">Chi-Yu Huang</a>
33 */
34
35public class IViaRetrieve
36 extends AbstractDocumentRetrieve {
37
38 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.service.IViaRetrieve.class.getName());
39
40
41 protected String ivia_server_url = null;
42
43 public IViaRetrieve() {
44 does_structure = false;
45 }
46
47 //Configure IViaRetrieve Service
48 public boolean configure(Element info, Element extra_info)
49 {
50 if (!super.configure(info, extra_info)) {
51 return false;
52 }
53
54 Element server_elem = (Element)GSXML.getChildByTagName(info, "iViaServer");
55 if (server_elem == null) {
56 logger.error("no iViaServer element found");
57 return false;
58 }
59 ivia_server_url = server_elem.getAttribute("url");
60 if (ivia_server_url.equals("")) {
61 logger.error("no url for the iViaServer element");
62 return false;
63 }
64 return true;
65
66 }
67
68 /** gets a document by sending a request to iVia, then processes it and creates a documentNode around the text */
69 protected Element getNodeContent(String doc_id, String lang)
70 throws GSException {
71
72 String url_string = ivia_server_url+"/cgi-bin/view_record?theme=gsdl3&record_id="+doc_id;
73
74 StringBuffer buffer = new StringBuffer();
75 try {
76 BufferedReader reader = Misc.makeHttpConnection(url_string);
77 String line;
78 while((line = reader.readLine())!= null) {
79 buffer.append(line);
80 }
81 } catch (java.net.MalformedURLException e) {
82 throw new GSException("Malformed URL: "+url_string, GSXML.ERROR_TYPE_SYSTEM);
83 } catch (java.io.IOException e) {
84 throw new GSException("IOException during connection to "+url_string+": "+e.toString(),GSXML.ERROR_TYPE_SYSTEM);
85 }
86
87 String node_content = buffer.toString();
88
89 String escaped_content = GSXML.xmlSafe(node_content);
90
91 StringBuffer processed_content = new StringBuffer(escaped_content.length());
92 processed_content.append("<nodeContent>");
93 int pos = 0;
94 int lastpos = 0;
95 while ((pos = escaped_content.indexOf("&lt;a ", lastpos))!= -1) {
96 processed_content.append(escaped_content.substring(lastpos, pos));
97 int endpos = escaped_content.indexOf("&lt;/a&gt;", pos);
98 if (endpos == -1) {
99 break;
100 }
101 String link = escaped_content.substring(pos, endpos+10);
102 link = convertLink(link);
103 processed_content.append(link);
104 lastpos = endpos+10;
105 }
106 processed_content.append(escaped_content.substring(lastpos)); // get the last bit
107 processed_content.append("</nodeContent>");
108
109 Document content_doc = this.converter.getDOM(processed_content.toString());
110 if (content_doc == null) {
111 logger.error("Couldn't parse node content");
112 logger.error(processed_content.toString());
113 return null;
114 }
115
116 Element content_element = content_doc.getDocumentElement();
117
118 return (Element)this.doc.importNode(content_element,true);
119 }
120
121 /** converts a url from an <a> element into a greenstone suitable one */
122 protected String convertLink(String aref) {
123 if (aref.indexOf("href=&quot;http") != -1) {
124 return aref; // an external link
125 }
126 String type = "other";
127 if (aref.indexOf("/cgi-bin/canned_search")!=-1) {
128 type="query";
129 } else if (aref.indexOf("/cgi-bin/click_through") != -1) {
130 type = "external";
131 } else if (aref.indexOf("/cgi-bin/view_record") != -1) {
132 type="document";
133 }
134
135 int href_start = aref.indexOf("href=&quot;")+11;
136 int href_end = aref.indexOf("&gt;", href_start);
137 String href = aref.substring(href_start, href_end);
138 String link_content = aref.substring(href_end+4, aref.length()-10);
139
140 if (type.equals("external")) {
141 // the external link is everything after the http at the end.
142 String address = href.substring(href.lastIndexOf("http"));
143 address = address.replaceAll("%3[aA]", ":");
144 address = address.replaceAll("%2[fF]", "/");
145
146 return "&lt;a href=\""+address+"\"&gt;"+link_content+"&lt;/a&gt;";
147 }
148 if (type.equals("other")) {
149 return "other type of link ("+link_content+")";
150 }
151 StringBuffer result = new StringBuffer();
152 result.append("<link type='");
153 result.append(type);
154 result.append("'");
155 if (type.equals("query")) {
156 result.append(" service='TextQuery'");
157 }
158 result.append(">");
159 // add in the parameters
160 href = href.substring(href.indexOf("?")+1);
161 String [] params = href.split("&amp;");
162 for (int i=0; i<params.length; i++) {
163 String param = params[i];
164 int eq_pos = param.indexOf("=");
165 if (eq_pos != -1) {
166
167 result.append("<param name='"+param.substring(0, eq_pos)+"' value='"+param.substring(eq_pos+1)+"'/>");
168 }
169 }
170 result.append(link_content);
171 result.append("</link>");
172
173 return result.toString();
174 }
175
176 // iVia craps out if we ask for a metadata which is not valid. So need
177 // to make sure we only ask for acceptable fields.
178 protected boolean isAcceptableMetadata(String meta) {
179 String valid_metadata = ",title,url,ivia_description,keywords,subjects,";
180 if (valid_metadata.indexOf(","+meta+",")!=-1) {
181 return true;
182 }
183 return false;
184 }
185
186 protected String translateId(String oid){
187 int p = oid.lastIndexOf('.');
188 if (p != oid.length()-3) {
189 logger.info("translateoid error: '.' is not the third to last char!!");
190 return oid;
191 }
192 String top = oid.substring(0, p);
193 return top;
194 }
195
196 protected String translateExternalId(String id){
197 return id;
198 }
199
200 protected String getDocType(String node_id){
201 return GSXML.DOC_TYPE_SIMPLE;
202 }
203 protected String getRootId(String node_id){
204 return node_id;
205 }
206
207 protected ArrayList<String> getChildrenIds(String node_id){
208 return null;
209 }
210
211 protected String getParentId(String node_id){
212 return null;
213 }
214
215 protected Element getMetadataList (String doc_id,
216 boolean all_metadata,
217 ArrayList<String> metadata_names)
218 throws GSException {
219
220 Element meta_list = this.doc.createElement(GSXML.METADATA_ELEM+GSXML.LIST_MODIFIER);
221
222 // do the query to the iVia server
223 StringBuffer field_list= new StringBuffer();
224 boolean metadata_found = false;
225
226 for (int i=0; i<metadata_names.size();i++){
227 if (isAcceptableMetadata(metadata_names.get(i))){
228 metadata_found = true;
229 field_list.append(metadata_names.get(i));
230 field_list.append(",");
231 }
232 }
233 if (!metadata_found){
234 return meta_list;
235 }
236
237 String url_string = ivia_server_url+"/cgi-bin/view_record_set?theme=gsdl3&record_id_list="+doc_id+"&field_list="+field_list.toString();
238 try {
239 BufferedReader reader = Misc.makeHttpConnection(url_string);
240 String line;
241 while ((line = reader.readLine()) != null) {
242 //metadata entry
243 int col_pos = line.indexOf(':');
244 if (col_pos == -1) {
245 // end of the metadata for this doc
246 break;
247 }
248 String name = line.substring(0,col_pos);
249 String value = line.substring(col_pos+2); // includes a space
250 GSXML.addMetadata(this.doc, meta_list, name, value);
251 }
252 } catch (java.net.MalformedURLException e) {
253 throw new GSException("Malformed URL: "+url_string, GSXML.ERROR_TYPE_SYSTEM);
254 } catch (java.io.IOException e) {
255 throw new GSException("IOException: "+e.toString(), GSXML.ERROR_TYPE_SYSTEM);
256 }
257 return meta_list;
258 }
259
260 protected String getStructureInfo(String doc_id, String info_type){
261 return "";
262 }
263}
Note: See TracBrowser for help on using the repository browser.