package org.greenstone.gsdl3_extension.mat.servlet; import java.io.*; import java.net.*; import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.servlet.*; import javax.servlet.http.*; import javax.xml.parsers.*; import org.xml.sax.*; import org.w3c.dom.*; import org.greenstone.gsdl3.util.GlobalProperties; import org.greenstone.gsdl3.util.GSPath; public class MatServlet extends HttpServlet { private int port_number = 0; private String oaiPrefix =""; private String titleString; private String h1String; private String maxRecord; private GlobalProperties globalProperty; private String fileSeparator; private String cssString; private String headerString; private String javaScript; private String headerString2; private String gsdl3Home; private String logoURL; private String description; private String hostName; private String proxyHost="http.proxyHost"; private String proxyPort="http.proxyPort"; private String proxyHostContent = ""; private String proxyPortContent = ""; public void doGet(HttpServletRequest request,HttpServletResponse response)throws ServletException, IOException { port_number = request.getLocalPort(); loadRuntimeSettings(); PrintWriter out = response.getWriter(); response.setContentType("text/html"); response.setHeader("pragma", "no-cache"); out.println(""); out.println(headerString); out.println(""); out.println(""); out.println(h1String); out.println("

"+description+"

"); out.println("
"); out.println("

OAI URL:

"); out.println("

"); out.println("
"); out.println (""); out.println (""); out.close(); } protected void doPost(HttpServletRequest req, HttpServletResponse res)throws ServletException, IOException { port_number = req.getLocalPort(); loadRuntimeSettings(); res.setContentType("text/html"); res.setHeader("pragma", "no-cache"); PrintWriter out = res.getWriter(); out.println(""); if (req.getParameter("metadataPrefix") != null) { oaiPrefix = req.getParameter("metadataPrefix"); String maxRecords = req.getParameter("maxrecords"); Pattern pa = Pattern.compile("[0-9]{1,5}"); Matcher ma = pa.matcher(maxRecords); out.println(headerString2); out.println(""); out.println(h1String); out.println("

Please wait ....
It's downloading OAI records

"); out.println(""); if(ma.matches()){ int num = Integer.parseInt(maxRecords); if(num>0){ downloadCollection(out, req, res, oaiPrefix, maxRecords); } else{ downloadCollection(out, req, res, oaiPrefix, maxRecord); } } else{ downloadCollection(out, req, res, oaiPrefix, maxRecord); } } else if(req.getParameter("matShell") != null){ out.println(headerString2); out.println(""); out.println(h1String); out.println("

Please wait ...

It's building collection now.

"); out.println(""); buildCollection(out,req.getParameter("matShell"),req.getParameter("collectionName"),req.getParameter("collectionURL"),req.getParameter("oaiPrefix")); } else if (req.getParameter("collName") != null){ out.println(headerString); out.write("\r\n"); out.write("\r\n"); out.write("\r\n"); out.write("\r\n"); out.println(""); out.println(h1String); analyzeCollection(out, req.getParameter("collName"),req.getParameter("collURL"),req.getParameter("collHost"),req.getParameter("oaiPrefix")); } else { out.println(headerString); out.println(""); out.println(h1String); java.net.URL oaiURL; String oaiURLString = req.getParameter("oaiurl"); if (! (oaiURLString.startsWith("http://"))) { oaiURLString = "http://" + oaiURLString; // add on protocol if missing } // URL checks // check if Java can make a URL from the string try { oaiURL = new URL(oaiURLString); } catch (MalformedURLException e) { out.println("

Malformed URL Exception caught: " + e.getMessage() + "

"); out.println("

The system cannot recognise the URL you have entered.

"); return; // go no further } //out.println("

host: " + oaiURL.getHost() + "

" ); //out.println("

protocol: " + oaiURL.getProtocol() + "

" ); // disallow anything with waikato as part of the host // note: this blocks things like waikato.uiuc.edu - // but guess this is highly unlikely to occur // (also turns off the researchcommons) // could use either of these approaches, waikato or waikato.ac.nz if (oaiURL.getHost().indexOf("researchcommons.waikato.ac.nz") == -1 ) { // not the RC /* if((oaiURL.getHost().indexOf("waikato") != -1 ) || (oaiURL.getHost().indexOf("waikato.ac.nz") != -1 ) ) { out.println("

This service cannot be used to access Waikato URLs

"); return; }*/ } // need to prevent machine names on their own, e.g. smith // being allowed through // approach 1: explicit blacklisting if (oaiURL.getHost().equals("smith") || oaiURL.getHost().equals("wesson") ) { out.println("

This service cannot be used to access these URLs

"); return; } // approach 2: require at least one . in the host URL if (oaiURL.getHost().indexOf(".") == -1 ) { // i.e. no . in URL host out.println("

This service cannot be used to access URLs of this form.

"); return; } // jones.cs would get through to this point, might this mean something // from the perspective of the host machine?? Does it matter? // will automatic domain completion be applied? by java? by 'the network'? // require 2 dots in the host? does that help at all ? // domain suffix whitelisting is impractical here, as we'd have to // list all countries in the world // do an OAI verb=identify check to make sure it is an OAI server // trim the URL back and then add on verb=Identify //System.setProperty("http.proxyHost", "wwwcache.cs.waikato.ac.nz"); //System.setProperty("http.proxyPort","80"); //System.setProperty("http.nonProxyHosts", "localhost|*.waikato.ac.nz"); Properties systemSettings = System.getProperties(); if( proxyHostContent.equals("") || proxyPortContent.equals("")){} else{ systemSettings.put(proxyHost, proxyHostContent); systemSettings.put(proxyPort, proxyPortContent); } URL url = oaiURL; String identifyVerb = ""; if (! url.getPath().endsWith("?")) { identifyVerb += "?"; } identifyVerb += "verb=Identify"; URL identifyURL = new URL (url.toString() + identifyVerb); HttpURLConnection connection = (HttpURLConnection)identifyURL.openConnection(); //URLConnection connection = identifyURL.openConnection(); connection.connect(); //BufferedReader inIdentify = new BufferedReader(new InputStreamReader( //connection.getInputStream())); Document identifyDocument; try { DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); DocumentBuilder builder = factory.newDocumentBuilder(); identifyDocument = builder.parse( connection.getInputStream() ); Element oaiElement = identifyDocument.getDocumentElement(); if (oaiElement.getTagName() == "OAI-PMH" ) { //out.println("

OAI-PMH element found...

"); } else { out.println("

Error: OAI-PMH element not found..exiting

"); return; } NodeList identifyNodeList = identifyDocument.getElementsByTagName("Identify"); Node identifyNode = null; if (identifyNodeList.getLength() == 1 ) { identifyNode = identifyNodeList.item(0); System.out.println("\n"); } else { out.println("

Error: Identify node not found... exiting

"); return; } NodeList identifyChildList = identifyNode.getChildNodes(); out.println(""); for (int i=0; i < identifyChildList.getLength(); i++) { if (identifyChildList.item(i).getNodeName() == "repositoryName" ) { out.println(""); } if (identifyChildList.item(i).getNodeName() == "baseURL" ) { String baseURL = identifyChildList.item(i).getTextContent(); if(baseURL.startsWith("http://")){ out.println(""); } else{ out.println(""); } } } out.println("
Repository Name: " + identifyChildList.item(i).getTextContent() + "
Base URL:" + baseURL + "
Base URL:" + oaiURLString + "
"); } catch (SAXParseException spe) { out.println(spe.getMessage()); } catch (SAXException sxe) { out.println(sxe.toString()); } catch (ParserConfigurationException pce) { out.println(pce.toString()); } catch (IOException ioe) { out.println(ioe.toString()); } // now get the metadata prefixes String metadataFormats = ""; if (! url.getPath().endsWith("?")) { metadataFormats += "?"; } metadataFormats += "verb=ListMetadataFormats"; URL metadataFormatsURL = new URL (url.toString() + metadataFormats); URLConnection connection2 = metadataFormatsURL.openConnection(); connection.connect(); Document document2; try { DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); DocumentBuilder builder = factory.newDocumentBuilder(); document2 = builder.parse( connection2.getInputStream() ); NodeList prefixList = document2.getElementsByTagName("metadataPrefix"); // create UI based on response if ( prefixList.getLength() > 0 ) { out.println("

Choose one metadata prefix to use:

"); out.println("
"); out.println(""); for (int i=0; i < prefixList.getLength(); i++) { out.println(""); } //System.out.println(""); out.println("
"); String id = "radioID" + i; String prefix = prefixList.item(i).getTextContent(); if (prefix.equals("oai_dc")) { out.println(""); //System.out.println("Dublin Core<\\MetadataElement>\n"); } else { out.println(""); //System.out.println(""+prefix+"<\\MetadataElement>\n"); } out.print("
"); out.println("
"); // need to pass oaiurl through as well (again) out.println("
"); out.println("Max records:
"); // submit button out.println("

"); out.println("
"); out.println("

Warning: Generating the statistics and visualization will take some time:

"); /* out.println(""); out.println("
No.of Records Estimated Time"); out.println("
100 5 minutes"); out.println("
500 10 minutes"); out.println("
1000 18 minutes"); out.println("
2000 30 minutes"); out.println("
"); */ out.println("

This tool is designed to work with Dublin Core metadata: note that the mapping of qualified Dublin Core to simple Dublin Core (as in oai_dc) may affect the results."); } else { out.println("

Error: no metadata prefixes found... exiting

"); return; } } catch (SAXParseException spe) { out.println(spe.getMessage()); } catch (SAXException sxe) { out.println(sxe.toString()); } catch (ParserConfigurationException pce) { out.println(pce.toString()); } catch (IOException ioe) { out.println(ioe.toString()); } // check we get a valid XML document back // check OAI-PMH element // check we have a and elements // maybe check the baseURL against the query we issued? // if we get here we have a valid non-waikato non-local OAI server // /* process stuff here */ // test with University home page - as it doesn't require // dealing with the proxy server // URL url = new URL("http://waikato.ac.nz"); // URL url = new URL(oaiURL); // need to go through the proxy here // http://dn.codegear.com/article/29783 // http://java.ittoolbox.com/groups/technical-functional/java-l/response-to-proxy-authentication-exception-71438 // http://www.jguru.com/faq/view.jsp?EID=13186 // http://www.developer.com/java/other/article.php/1551421 // System.setProperty("http.proxyHost","http://proxy.scms.waikato.ac.nz"); // System.setProperty("http.proxyPort","80"); // System.setProperty("http.nonProxyHosts", "localhost|*.waikato.ac.nz"); // build a GS 3 collection from the OAI URL // http://www.javaworld.com/javaworld/jw-12-2000/jw-1229-traps.html // http://codon.kribb.re.kr/wiki/display/Java/Using+Runtime.exec+to+invoke+child+process out.print(""); out.close(); } // endif } // end doPost protected void buildCollection( PrintWriter out, String matShell2, String collName, String oaiURLString, String oaiPrefix) throws ServletException, IOException{ out.println("

"); out.println(""); out.println("

Building collection...

"); String host = "http://"+hostName+":"+port_number+"/greenstone3/mat/"; out.println("
"); out.println("
"); out.println("
"); out.println("
"); out.println("
"); out.println("
"); out.println("

" + matShell2 + "

"); out.flush(); Process p2 = processShell(matShell2, out); if (p2.exitValue() == 0){ out.println("

Collection built.

"); } else{ out.println("

Collection not built.

"); } out.print("
"); p2 = null; p2.destroy(); out.close(); } private void downloadCollection( PrintWriter out, HttpServletRequest req, HttpServletResponse res, String oaiPrefix, String Records) throws ServletException, IOException { String oaiURLString = req.getParameter("oaiURL"); String metadataprefix = req.getParameter("metadataPrefix"); String collName = generateCollName(); String host = "http://"+hostName+":"+port_number+"/"; out.println("

"); out.println(""); out.println("

Downloading OAI documents...

"); out.flush(); // move to correct directory String os = "linux"; if(fileSeparator == "\\"){ gsdl3Home.replace("\\", "/"); os = "windows"; } String gsdl3Root = GSPath.removeLastLink(gsdl3Home); if(os.equals("windows")){ gsdl3Root = gsdl3Root.replace("/","\\"); gsdl3Home = gsdl3Home.replace("/", "\\"); } String maxRecords = Records; String cacheDir = gsdl3Root +fileSeparator+"ext"+fileSeparator+"mat"+fileSeparator+"tmp"+fileSeparator + collName; // downloading //String gs3Root = "/research/cc108/greenstone3Project"; String gs3Root = gsdl3Root; String collectDir = gsdl3Home + fileSeparator +"sites"+fileSeparator+"localsite"+ fileSeparator +"collect"; String logFile = gsdl3Root +fileSeparator+"ext"+fileSeparator+"mat"+fileSeparator+"tmp"+fileSeparator + "log.txt"; //source /research/cc108/greenstone3Project/gs2build/bin/script/mat-colbuild-download.bash String matShell = "source "+ gsdl3Root + fileSeparator + "gs2build" + fileSeparator + "bin" + fileSeparator + "script" + fileSeparator + "mat-colbuild-download.bash " + collName + " " + oaiURLString + " " + cacheDir + " " + maxRecords + " " + metadataprefix + " " + gs3Root + " " + collectDir + " " + logFile; out.println("

" + matShell + "

"); out.flush(); Process p = processShell(matShell, out); if (p.exitValue() == 0){ out.println("

Collection downloaded.

"); } else { out.println("

Collection not downloaded properly.

"); } out.flush(); //p = null; p.destroy(); // return the web page //out.println(req.getRequestURL()); // building //source "+ gsdl3Root + fileSeparator+"ext"+fileSeparator+"mat"+fileSeparator + "bin" + fileSeparator + "script"+ fileSeparator + "mat-colbuild-download.bash " //String matShell2 = "source /research/cc108/greenstone3Project/gs2build/bin/script/mat-colbuild.bash " String matShell2 = "source "+ gsdl3Root + fileSeparator + "gs2build" + fileSeparator + "bin" + fileSeparator + "script" + fileSeparator + "mat-colbuild.bash " + collName + " " + oaiURLString + " " + cacheDir + " " + maxRecords + " " + metadataprefix + " " + gs3Root + " " + collectDir + " " + logFile; out.println("
"); out.println(""); out.println(""); out.println(""); out.println(""); out.println(""); //out.println("
"); out.println("
"); out.println("
"); out.print(""); out.flush(); out.close(); // return page //javascript to submit button in 5 secs //form // hidden inputs // collname // host ///////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////// // specify plugins as parameter to mkcol.pl //String makeColl = "perl -S mkcol.pl -creator cc108@cs.waikato.ac.nz " + collName; //out.println("

" + makeColl + "

"); // specify the OAI server in a config file??? // downloadfrom.pl -download_mode OAI -cache_dir -gli -url oaiURL.toString() -get_doc no // what about that 500 doc limit in OAIDownload.pm /* String downloadFrom = "downloadfrom.pl -download_mode OAI -cache_dir " + cacheDir + " -url " + oaiURL.toString() + " -max_records 10" + " -proxy_on" + " -proxy_host " + proxyHost + " -proxy_port " + proxyPort; */ // -proxy_on -proxy_host wwwcache.cs.waikato.ac.nz -proxy_port 80 // out.println("

" + downloadFrom + "

"); // /home/daven/research/greenstone3/gs2build/bin/script/build [options] collection-name /* String build = "build -indextype lucene -download file://" + cacheDir + " -log_events " //event log goes to greenstone3/gs2build/etc/events.txt + collName; */ // out.println("

" + build + "

"); // perl -S importfrom.pl collname // String importFrom = " perl -S importfrom.pl " + collName; // perl -S buildcol.pl collname // String buildColl = "perl -S buildcol.pl " + collName; // need to be in correct directory // String changeDir = "cd $GSDLHOME/collect/" + collName; // build.pl ?? // rebuild ? // rm -r index/* // String removeOld = "rm -r index/*"; // mv building/* index/ // String moveToIndex = "mv building/* index/"; ////////////////////////////////////// /* URLConnection connection = url.openConnection(); connection.connect(); Map headerMap = connection.getHeaderFields(); // gets the HTTP headers out.print("Analysing..."); out.println (""); out.println ("

Metadata Analysis Tool - Alpha

"); out.println("

Analysing the OAI URL: " + url.toString() + "

"); out.println("
");
	
	  Iterator keyValuePairs = headerMap.entrySet().iterator();
	  out.println("size = " + headerMap.size());
	  for (int i = 0; i < headerMap.size(); i++) {
	  out.println("i = " + i);
	  Map.Entry entry = (Map.Entry) keyValuePairs.next();
	  out.println(entry.getKey());
	  out.println(entry.getValue());
	  out.println();
	  }
	  
	  out.println(headerMap.toString());
	  out.println("
"); */ /* // get the source HTML and insert it into the page - messy out.println("

HTML source:

"); out.println("
");
	BufferedReader in = new BufferedReader(new InputStreamReader(
	connection.getInputStream()));
	String inputLine;
	while ((inputLine = in.readLine()) != null)
	out.println(inputLine);
	in.close();
	out.println("
"); */ //out.print(""); } /* produce a random 7 letter collection name */ private String generateCollName () { Random random = new Random(); StringBuffer message = new StringBuffer(); int offset = 97; // = "a" message.append( (char) ( random.nextInt( 26 ) + offset ) ); message.append( (char) ( random.nextInt( 26 ) + offset ) ); message.append( (char) ( random.nextInt( 26 ) + offset ) ); message.append( (char) ( random.nextInt( 26 ) + offset ) ); message.append( (char) ( random.nextInt( 26 ) + offset ) ); message.append( (char) ( random.nextInt( 26 ) + offset ) ); message.append( (char) ( random.nextInt( 26 ) + offset ) ); return message.toString(); } private Process processShell( String command, PrintWriter out) { String s= ""; try { String[] args = new String[]{"sh", "-c", command}; Process p = Runtime.getRuntime().exec(args); BufferedReader stdInput = new BufferedReader(new InputStreamReader(p.getInputStream())); BufferedReader stdError = new BufferedReader(new InputStreamReader(p.getErrorStream())); StringBuffer stdInputBuffer = new StringBuffer(); while ((s = stdInput.readLine()) != null) { stdInputBuffer.append(s+"\n"); if(s.contains("-->")){ s = s.replace("-->", ""); out.println(""); } else{ out.println(""); } out.flush(); } StringBuffer stdErrorBuffer = new StringBuffer(); while ((s = stdError.readLine()) != null) { stdErrorBuffer.append(s+"\n"); if(s.contains("-->")){ s = s.replace("-->", ""); out.println(""); } else{ out.println(""); } out.flush(); } out.println("

Here is the standard output:

\n"); out.println("

" + stdInputBuffer + "

"); out.println("

Here is the standard error (if any):

\n"); out.println("

" + stdErrorBuffer + "

"); out.flush(); //if (p.exitValue() != 0) if (false){ out.println("

An error occurred while building the collection.

"); out.println("

Here is the standard output:

\n"); out.println("

" + stdInputBuffer + "

"); out.println("

Here is the standard error (if any):

\n"); out.println("

" + stdErrorBuffer + "

"); } InputStream is = p.getInputStream(); is.close(); OutputStream os = p.getOutputStream(); os.close(); InputStream es = p.getErrorStream(); es.close(); stdInput.close(); stdError.close(); return p; } catch (IOException e) { out.println("exception happened - here's what I know: "); out.println(e.toString()); out.flush(); } return null; } private void analyzeCollection(PrintWriter out, String collectionName,String collectionURL,String collectionHost, String Prefix){ String collName = collectionName; String oaiURLString = collectionURL; String host = collectionHost; try{ DescribeMessager dm = new DescribeMessager(collName,oaiURLString); out.println("

Generating statistics and visualisations...

"); out.flush(); out.println("

please wait.

"); out.flush(); boolean status = dm.describeMatadata(out,collName,oaiURLString,Prefix); if(status){ out.println(""); out.println("View the report"); } out.println(""); }catch(Exception e){e.printStackTrace(out); out.println("

"+e.toString()+"

"); } out.close(); } private void loadRuntimeSettings(){ fileSeparator = File.separator; gsdl3Home = globalProperty.getGSDL3Home(); globalProperty = new GlobalProperties(); maxRecord ="10"; String os = "linux"; if(fileSeparator == "\\"){ gsdl3Home.replace("\\", "/"); os = "windows"; } String gsdl3Root = GSPath.removeLastLink(gsdl3Home); if(os.equals("windows")){ gsdl3Root = gsdl3Root.replace("/","\\"); gsdl3Home = gsdl3Home.replace("/", "\\"); } try{ Properties prop = new Properties(); FileInputStream fis = new FileInputStream(gsdl3Root+fileSeparator+"ext"+fileSeparator+"mat"+fileSeparator+"properties.xml"); prop.load(fis); titleString =""+ prop.getProperty("Servlet.Title")+""; h1String ="

"+ prop.getProperty("Servlet.Head")+"

"; cssString = ""; javaScript = "";; headerString2 = "" + titleString + "\n" + javaScript + cssString+"\n"; headerString ="" + titleString + "\n" + cssString + "\n"; logoURL =prop.getProperty("Servlet.Logo"); description = prop.getProperty("Servlet.Description"); proxyHostContent = prop.getProperty("Servlet.proxyHost"); proxyPortContent = prop.getProperty("Servlet.proxyPort"); }catch(Exception ex){ ex.printStackTrace(); } try { java.net.InetAddress localMachine = java.net.InetAddress.getLocalHost(); hostName = localMachine.getHostName(); }catch (java.net.UnknownHostException uhe) { uhe.printStackTrace(); } } }