package org.greenstone.gsdl3.service;

// Greenstone classes
import org.greenstone.gsdl3.core.GSException;
import org.greenstone.gsdl3.util.*;

// XML classes
import org.w3c.dom.Element;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;

import java.util.HashMap;
import java.util.ArrayList;
import java.io.File;
import java.io.InputStream;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URLConnection;
import java.net.URL;
import java.net.Authenticator;
import java.net.MalformedURLException;

import org.apache.log4j.*;

/**
 * Document retrieval service that fetches record content from a remote
 * iVia server over HTTP and rewrites the links found in it.
 *
 * The server location is read from the <code>url</code> attribute of the
 * <code>iViaServer</code> element of the service configuration (see
 * {@link #configure}).
 *
 * @author Katherine Don
 * @version $Revision: 13572 $
 * Modified by Chi-Yu Huang
 */
public class IViaRetrieve
    extends AbstractDocumentRetrieve {

    /** log4j logger named after this class. */
    static Logger logger = Logger.getLogger(org.greenstone.gsdl3.service.IViaRetrieve.class.getName());

    /** Base URL of the iVia server; stays null until configure() succeeds. */
    protected String ivia_server_url = null;

    /**
     * Creates the service and switches the does_structure flag off.
     * NOTE(review): does_structure is presumably inherited from
     * AbstractDocumentRetrieve and presumably means "documents have no
     * internal structure" - confirm against the base class.
     */
    public IViaRetrieve() {
        does_structure = false;
    }

    //Configure IViaRetrieve Service
    /**
     * Configures the service from its XML description.
     *
     * Runs the superclass configuration first, then looks for an
     * <code>iViaServer</code> child of <code>info</code> and stores its
     * <code>url</code> attribute in {@link #ivia_server_url}.
     *
     * @param info       service configuration element; must contain the
     *                   iViaServer element
     * @param extra_info passed through to the superclass unchanged
     * @return false (after logging an error) if the superclass fails, the
     *         iViaServer element is missing, or its url attribute is empty;
     *         true otherwise
     */
    public boolean configure(Element info, Element extra_info) {

        if (!super.configure(info, extra_info)) {
            return false;
        }

        Element server_elem = (Element)GSXML.getChildByTagName(info, "iViaServer");
        if (server_elem == null) {
            logger.error("no iViaServer element found");
            return false;
        }
        ivia_server_url = server_elem.getAttribute("url");
        // a missing attribute shows up as the empty string, hence this test
        if (ivia_server_url.equals("")) {
            logger.error("no url for the iViaServer element");
            return false;
        }
        return true;
    }

    /**
     * Gets a document by sending a request to iVia, then processes it and
     * creates a documentNode around the text.
     *
     * The record is requested from
     * <code>/cgi-bin/view_record?theme=gsdl3&amp;record_id=ID</code> on the
     * configured server, the response lines are concatenated (without line
     * separators), the text is passed through GSXML.xmlSafe, every anchor
     * found in it is handed to {@link #convertLink}, and the result is
     * parsed into a DOM whose root element is imported into this.doc.
     *
     * @param doc_id iVia record id, appended to the request URL as is
     * @param lang   not used by this method
     * @return the imported root element of the parsed content, or null
     *         (after logging) when the processed text cannot be parsed
     * @throws GSException (ERROR_TYPE_SYSTEM) if the URL is malformed or an
     *         I/O error occurs while reading from the server
     */
    protected Element getNodeContent(String doc_id, String lang)
        throws GSException {

        String url_string = ivia_server_url+"/cgi-bin/view_record?theme=gsdl3&record_id="+doc_id;
        StringBuffer buffer = new StringBuffer();
        try {
            // NOTE(review): the reader is never closed in this method;
            // check whether Misc.makeHttpConnection expects the caller to
            // close it.
            BufferedReader reader = Misc.makeHttpConnection(url_string);
            String line;
            while((line = reader.readLine())!= null) {
                buffer.append(line);
            }
        } catch (java.net.MalformedURLException e) {
            throw new GSException("Malformed URL: "+url_string, GSXML.ERROR_TYPE_SYSTEM);
        } catch (java.io.IOException e) {
            throw new GSException("IOException during connection to "+url_string+": "+e.toString(),GSXML.ERROR_TYPE_SYSTEM);
        }

        String node_content = buffer.toString();

        // From here on the text is in escaped form, so markup inside it is
        // searched for as plain text.
        String escaped_content = GSXML.xmlSafe(node_content);

        StringBuffer processed_content = new StringBuffer(escaped_content.length());
        // NOTE(review): appending an empty string is a no-op. This and the
        // matching append near the end presumably once emitted the opening
        // and closing tag of a wrapper element (getDOM is later asked for a
        // document element) - restore from version control.
        processed_content.append("");
        int pos = 0;
        int lastpos = 0;
        // NOTE(review): the two search strings below are 3 and 4 characters
        // long, yet the code skips 10 characters past endpos. Presumably the
        // literals were the entity-escaped forms of the anchor tags (the
        // escaped closing tag is 10 characters) - confirm and restore.
        while ((pos = escaped_content.indexOf("<a ", lastpos))!= -1) {
            // copy the text in front of the link untouched
            processed_content.append(escaped_content.substring(lastpos, pos));
            int endpos = escaped_content.indexOf("</a>", pos);
            if (endpos == -1) {
                // unterminated link: stop rewriting, the tail is copied below
                break;
            }
            String link = escaped_content.substring(pos, endpos+10);
            link = convertLink(link);
            processed_content.append(link);
            lastpos = endpos+10;
        }
        processed_content.append(escaped_content.substring(lastpos)); // get the last bit
        processed_content.append("");

        Document content_doc = this.converter.getDOM(processed_content.toString());
        if (content_doc == null) {
            logger.error("Couldn't parse node content");
            logger.error(processed_content.toString());
            return null;
        }

        Element content_element = content_doc.getDocumentElement();

        return (Element)this.doc.importNode(content_element,true);
    }

    /**
     * Converts a link taken from an anchor element of an iVia page into a
     * form suitable for Greenstone.
     *
     * The link is classified by the text it contains:
     * an absolute http href is returned unchanged;
     * /cgi-bin/canned_search is type "query";
     * /cgi-bin/click_through is type "external";
     * /cgi-bin/view_record is type "document";
     * anything else is type "other" and is replaced by the plain text
     * "other type of link (CONTENT)".
     *
     * @param aref the complete anchor, from its opening tag up to and
     *             including its closing tag, as cut out by getNodeContent
     * @return the replacement text for the anchor
     */
    protected String convertLink(String aref) {

        // NOTE(review): the next literal is not valid Java as written - the
        // quote after "href=" ends the string. Together with the +11 offset
        // used further down, it was presumably the entity-escaped form of
        // href-equals-quote (11 characters) followed by http - restore.
        if (aref.indexOf("href="http") != -1) {
            return aref; // an external link
        }
        String type = "other";
        if (aref.indexOf("/cgi-bin/canned_search")!=-1) {
            type="query";
        } else if (aref.indexOf("/cgi-bin/click_through") != -1) {
            type = "external";
        } else if (aref.indexOf("/cgi-bin/view_record") != -1) {
            type="document";
        }

        // NOTE(review): same damage as above; the offsets +11, +4 and -10
        // only line up with entity-escaped search strings (the escaped
        // greater-than sign is 4 characters, the escaped closing anchor tag
        // 10) - confirm and restore.
        int href_start = aref.indexOf("href="")+11;
        int href_end = aref.indexOf(">", href_start);
        String href = aref.substring(href_start, href_end);
        // the text between the opening tag and the closing tag
        String link_content = aref.substring(href_end+4, aref.length()-10);

        if (type.equals("external")) {
            // the external link is everything after the http at the end.
            String address = href.substring(href.lastIndexOf("http"));
            // undo the percent-encoding of ':' and '/' in the target address
            address = address.replaceAll("%3[aA]", ":");
            address = address.replaceAll("%2[fF]", "/");
            return "<a href=\""+address+"\">"+link_content+"</a>";
        }
        if (type.equals("other")) {
            return "other type of link ("+link_content+")";
        }

        // Only "query" and "document" links get this far.
        StringBuffer result = new StringBuffer();
        // NOTE(review): this empty append and the one before the return
        // presumably once emitted the opening and closing markup of the
        // generated link; the text is missing here - restore from version
        // control.
        result.append("");
        // add in the parameters
        href = href.substring(href.indexOf("?")+1);
        // NOTE(review): aref comes from escaped text, so the separator may
        // originally have been the escaped ampersand - confirm.
        String [] params = href.split("&");
        // NOTE(review): the loop header and body on the next line are
        // incomplete; everything between the loop condition and the end of
        // an appended string literal appears to have been lost. Left exactly
        // as found - restore from version control.
        for (int i=0; i"); } }
        result.append(link_content);
        result.append("");

        return result.toString();
    }

    // iVia craps out if we ask for a metadata which is not valid. So need
    // to make sure we only ask for acceptable fields.
    /**
     * Tells whether a metadata name may be requested from iVia.
     *
     * @param meta metadata name to check
     * @return true only for title, url, ivia_description, keywords and
     *         subjects (exact, case-sensitive match)
     */
    protected boolean isAcceptableMetadata(String meta) {
        // the surrounding commas make indexOf match whole names only
        String valid_metadata = ",title,url,ivia_description,keywords,subjects,";
        if (valid_metadata.indexOf(","+meta+",")!=-1) {
            return true;
        }
        return false;
    }

    /**
     * Strips a trailing "." plus two characters from an id.
     *
     * @param oid id to translate
     * @return the part of oid in front of its last '.', provided that '.'
     *         is the third character from the end; otherwise oid unchanged
     *         (an info message is logged)
     */
    protected String translateId(String oid){
        int p = oid.lastIndexOf('.');
        if (p != oid.length()-3) {
            logger.info("translateoid error: '.' is not the third to last char!!");
            return oid;
        }
        String top = oid.substring(0, p);
        return top;
    }

    /** Returns the given external id unchanged. */
    protected String translateExternalId(String id){
        return id;
    }

    /** Returns GSXML.DOC_TYPE_SIMPLE for every node id. */
    protected String getDocType(String node_id){
        return GSXML.DOC_TYPE_SIMPLE;
    }

    /** Returns node_id itself as the root id. */
    protected String getRootId(String node_id){
        return node_id;
    }

    /** Always returns null; this service reports no child ids. */
    protected ArrayList getChildrenIds(String node_id){
        return null;
    }

    /** Always returns null; this service reports no parent id. */
    protected String getParentId(String node_id){
        return null;
    }

    protected Element getMetadataList (String doc_id,
                                       boolean all_metadata,
                                       ArrayList metadata_names)
        throws GSException {

        Element meta_list = this.doc.createElement(GSXML.METADATA_ELEM+GSXML.LIST_MODIFIER);

        // do the query to the iVia server
        StringBuffer field_list= new StringBuffer();
        boolean metadata_found = false;

        for (int i=0; i