package org.greenstone.gsdl3_extension.mat.servlet; import java.io.*; import java.net.*; import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.servlet.*; import javax.servlet.http.*; import javax.xml.parsers.*; import org.xml.sax.*; import org.w3c.dom.*; import org.greenstone.gsdl3.util.GlobalProperties; import org.greenstone.gsdl3.util.GSPath; public class MatServlet extends HttpServlet { private int port_number = 0; private String oaiPrefix =""; private String titleString; private String h1String; private String maxRecord; private GlobalProperties globalProperty; private String fileSeparator; private String cssString; private String headerString; private String javaScript; private String headerString2; private String gsdl3Home; private String logoURL; private String description; private String hostName; private String proxyHost="http.proxyHost"; private String proxyPort="http.proxyPort"; private String proxyHostContent = ""; private String proxyPortContent = ""; public void doGet(HttpServletRequest request,HttpServletResponse response)throws ServletException, IOException { port_number = request.getLocalPort(); loadRuntimeSettings(); PrintWriter out = response.getWriter(); response.setContentType("text/html"); response.setHeader("pragma", "no-cache"); out.println(""); out.println(headerString); out.println("
"); out.println(""); out.println(h1String); out.println(""+description+"
"); out.println("Please wait ....
It's downloading OAI records
Please wait ...
It's building collection now.
"); out.println(""); buildCollection(out,req.getParameter("matShell"),req.getParameter("collectionName"),req.getParameter("collectionURL"),req.getParameter("oaiPrefix")); } else if (req.getParameter("collName") != null){ out.println(headerString); out.write("\r\n"); out.write("\r\n"); out.write("\r\n"); out.write("\r\n"); out.println(""); out.println(h1String); analyzeCollection(out, req.getParameter("collName"),req.getParameter("collURL"),req.getParameter("collHost"),req.getParameter("oaiPrefix")); } else { out.println(headerString); out.println(""); out.println(h1String); java.net.URL oaiURL; String oaiURLString = req.getParameter("oaiurl"); if (! (oaiURLString.startsWith("http://"))) { oaiURLString = "http://" + oaiURLString; // add on protocol if missing } // URL checks // check if Java can make a URL from the string try { oaiURL = new URL(oaiURLString); } catch (MalformedURLException e) { out.println("Malformed URL Exception caught: " + e.getMessage() + "
"); out.println("The system cannot recognise the URL you have entered.
"); return; // go no further } //out.println("host: " + oaiURL.getHost() + "
" ); //out.println("protocol: " + oaiURL.getProtocol() + "
" ); // disallow anything with waikato as part of the host // note: this blocks things like waikato.uiuc.edu - // but guess this is highly unlikely to occur // (also turns off the researchcommons) // could use either of these approaches, waikato or waikato.ac.nz if (oaiURL.getHost().indexOf("researchcommons.waikato.ac.nz") == -1 ) { // not the RC /* if((oaiURL.getHost().indexOf("waikato") != -1 ) || (oaiURL.getHost().indexOf("waikato.ac.nz") != -1 ) ) { out.println("This service cannot be used to access Waikato URLs
"); return; }*/ } // need to prevent machine names on their own, e.g. smith // being allowed through // approach 1: explicit blacklisting if (oaiURL.getHost().equals("smith") || oaiURL.getHost().equals("wesson") ) { out.println("This service cannot be used to access these URLs
"); return; } // approach 2: require at least one . in the host URL if (oaiURL.getHost().indexOf(".") == -1 ) { // i.e. no . in URL host out.println("This service cannot be used to access URLs of this form.
"); return; } // jones.cs would get through to this point, might this mean something // from the perspective of the host machine?? Does it matter? // will automatic domain completion be applied? by java? by 'the network'? // require 2 dots in the host? does that help at all ? // domain suffix whitelisting is impractical here, as we'd have to // list all countries in the world // do an OAI verb=identify check to make sure it is an OAI server // trim the URL back and then add on verb=Identify //System.setProperty("http.proxyHost", "wwwcache.cs.waikato.ac.nz"); //System.setProperty("http.proxyPort","80"); //System.setProperty("http.nonProxyHosts", "localhost|*.waikato.ac.nz"); Properties systemSettings = System.getProperties(); if( proxyHostContent.equals("") || proxyPortContent.equals("")){} else{ systemSettings.put(proxyHost, proxyHostContent); systemSettings.put(proxyPort, proxyPortContent); } URL url = oaiURL; String identifyVerb = ""; if (! url.getPath().endsWith("?")) { identifyVerb += "?"; } identifyVerb += "verb=Identify"; URL identifyURL = new URL (url.toString() + identifyVerb); HttpURLConnection connection = (HttpURLConnection)identifyURL.openConnection(); //URLConnection connection = identifyURL.openConnection(); connection.connect(); //BufferedReader inIdentify = new BufferedReader(new InputStreamReader( //connection.getInputStream())); Document identifyDocument; try { DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); DocumentBuilder builder = factory.newDocumentBuilder(); identifyDocument = builder.parse( connection.getInputStream() ); Element oaiElement = identifyDocument.getDocumentElement(); if (oaiElement.getTagName() == "OAI-PMH" ) { //out.println("OAI-PMH element found...
"); } else { out.println("Error: OAI-PMH element not found..exiting
"); return; } NodeList identifyNodeList = identifyDocument.getElementsByTagName("Identify"); Node identifyNode = null; if (identifyNodeList.getLength() == 1 ) { identifyNode = identifyNodeList.item(0); System.out.println("Error: Identify node not found... exiting
"); return; } NodeList identifyChildList = identifyNode.getChildNodes(); out.println("Repository Name: | " + identifyChildList.item(i).getTextContent() + " |
Base URL: | " + baseURL + " |
Base URL: | " + oaiURLString + " |
Choose one metadata prefix to use:
"); out.println(""); out.println("Warning: Generating the statistics and visualization will take some time:
"); /* out.println("No.of Records | Estimated Time"); out.println(" |
100 | 5 minutes"); out.println(" |
500 | 10 minutes"); out.println(" |
1000 | 18 minutes"); out.println(" |
2000 | 30 minutes"); out.println(" |
This tool is designed to work with Dublin Core metadata: note that the mapping of qualified Dublin Core to simple Dublin Core (as in oai_dc
) may affect the results.");
}
else {
out.println("
Error: no metadata prefixes found... exiting
"); return; } } catch (SAXParseException spe) { out.println(spe.getMessage()); } catch (SAXException sxe) { out.println(sxe.toString()); } catch (ParserConfigurationException pce) { out.println(pce.toString()); } catch (IOException ioe) { out.println(ioe.toString()); } // check we get a valid XML document back // check OAI-PMH element // check we have aBuilding collection...
"); String host = "http://"+hostName+":"+port_number+"/greenstone3/mat/"; out.println(""); out.println("" + matShell2 + ""); out.flush(); Process p2 = processShell(matShell2, out); if (p2.exitValue() == 0){ out.println("
Collection built.
"); } else{ out.println("Collection not built.
"); } out.print("Downloading OAI documents...
"); out.flush(); // move to correct directory String os = "linux"; if(fileSeparator == "\\"){ gsdl3Home.replace("\\", "/"); os = "windows"; } String gsdl3Root = GSPath.removeLastLink(gsdl3Home); if(os.equals("windows")){ gsdl3Root = gsdl3Root.replace("/","\\"); gsdl3Home = gsdl3Home.replace("/", "\\"); } String maxRecords = Records; String cacheDir = gsdl3Root +fileSeparator+"ext"+fileSeparator+"mat"+fileSeparator+"tmp"+fileSeparator + collName; // downloading //String gs3Root = "/research/cc108/greenstone3Project"; String gs3Root = gsdl3Root; String collectDir = gsdl3Home + fileSeparator +"sites"+fileSeparator+"localsite"+ fileSeparator +"collect"; String logFile = gsdl3Root +fileSeparator+"ext"+fileSeparator+"mat"+fileSeparator+"tmp"+fileSeparator + "log.txt"; //source /research/cc108/greenstone3Project/gs2build/bin/script/mat-colbuild-download.bash String matShell = "source "+ gsdl3Root + fileSeparator + "gs2build" + fileSeparator + "bin" + fileSeparator + "script" + fileSeparator + "mat-colbuild-download.bash " + collName + " " + oaiURLString + " " + cacheDir + " " + maxRecords + " " + metadataprefix + " " + gs3Root + " " + collectDir + " " + logFile; out.println("" + matShell + ""); out.flush(); Process p = processShell(matShell, out); if (p.exitValue() == 0){ out.println("
Collection downloaded.
"); } else { out.println("Collection not downloaded properly.
"); } out.flush(); //p = null; p.destroy(); // return the web page //out.println(req.getRequestURL()); // building //source "+ gsdl3Root + fileSeparator+"ext"+fileSeparator+"mat"+fileSeparator + "bin" + fileSeparator + "script"+ fileSeparator + "mat-colbuild-download.bash " //String matShell2 = "source /research/cc108/greenstone3Project/gs2build/bin/script/mat-colbuild.bash " String matShell2 = "source "+ gsdl3Root + fileSeparator + "gs2build" + fileSeparator + "bin" + fileSeparator + "script" + fileSeparator + "mat-colbuild.bash " + collName + " " + oaiURLString + " " + cacheDir + " " + maxRecords + " " + metadataprefix + " " + gs3Root + " " + collectDir + " " + logFile; out.println(""); out.println("