package org.greenstone.atea; import java.io.*; import java.util.Properties; import java.util.zip.GZIPInputStream; import org.apache.log4j.Logger; import org.greenstone.util.SafeProcess; // from GS3's gutil.jar. For safely running wget/any process from Java. /** * Ensure you have OPENNLP_HOME set to apache-opennlp's full path. * Then go into the src folder of this extension before compiling or running. * * Compile as: * maori-lang-detection/src$ javac -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar:../lib/*" org/greenstone/atea/NZTLDProcessor.java * * Run as: * maori-lang-detection/src$ java -cp ".:$OPENNLP_HOME/lib/opennlp-tools-1.9.1.jar:../conf:../lib/*" org.greenstone.atea.NZTLDProcessor ../uniq-tld-nz-WET-urls-2019-08-13.txt */ public class NZTLDProcessor { private static Logger logger = Logger.getLogger(org.greenstone.atea.NZTLDProcessor.class.getName()); private Properties configProps = null; private boolean DEBUG_MODE = true; private final ExtractableTextFileFilter extractableTxtFilesFilter = new ExtractableTextFileFilter(); private final String SITES_DIR = "../dwn-sites"; private static final String COMMONCRAWL_DATA_PREFIX = "https://commoncrawl.s3.amazonaws.com/"; private void log(String msg) { System.err.println(msg); logger.info(msg); } private void debug(String msg) { System.err.println(msg); logger.debug(msg); } private void error(String msg) { System.err.println("### ERROR: " + msg); logger.error(msg); } private void warn(String msg) { System.err.println("*** WARN: " + msg); logger.warn(msg); } public NZTLDProcessor(File seedURLsFile) throws Exception { log("In NZTLDProcessor constructor"); if(DEBUG_MODE) { warn("Debugger is turned ON!!!"); } // TODO: why do I have to provide the path to config.props when this path is already on // the classpath? // In fact, I don't need it on the classpath for the following to work. // How do I get it to work by specifying its path on the classpath and not here? //try (FileInputStream infile = new FileInputStream("../lib/config.properties")) { //try (InputStream infile = Class.forName("org.greenstone.atea.NZTLDProcessor").getClassLoader().getResourceAsStream("config.properties")) { try (InputStream infile = this.getClass().getClassLoader().getResourceAsStream("config.properties")) { configProps = new Properties(); configProps.load(infile); infile.close(); } catch(Exception e) { System.err.println("Exception attempting to read properties from config.properties."); //e.printStackTrace(); throw e; } try(BufferedReader br = new BufferedReader(new FileReader(seedURLsFile))) { String url = null; int maxURLs = 1; int urlCounter = 0; while((url = br.readLine()) != null) { log("Got URL: " + url); // skip urls containing "/crawldiagnostics/" and "/robotstxt/" if(url.indexOf("/robotstxt/") != -1) continue; if(url.indexOf("/crawldiagnostics/") != -1) continue; // convert the remaining WARC urls to WET urls url = url.replace("/warc/CC-MAIN-", "/wet/CC-MAIN-"); url = url.replace(".warc.gz", ".warc.wet.gz"); // add the prefix url = COMMONCRAWL_DATA_PREFIX + url; log("Final WET URL: " + url); processURL(url); if(DEBUG_MODE) { urlCounter++; if(urlCounter == maxURLs) break; } } br.close(); } catch (IOException ioe) { System.err.println("Exception attempting to read from seedURLsFile."); //ioe.printStackTrace(); throw ioe; } } public boolean processURL(String ccWETfileURL) { // launch wget on URL // when download done, recurse through downloaded dir // for each htm/html/no-extension file: parse out text from paragraphs (TODO: later Tika on non html/htm/no-extension files) // For each para, run MaoriTextDetector and store results in db // Remove download directory (or only remove if site doesn't contain Maori txt)? if(DEBUG_MODE) { ccWETfileURL = "https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2019-30/segments/1563195523840.34/wet/CC-MAIN-20190715175205-20190715200204-00034.warc.wet.gz"; } log("Processing WET file URL: " + ccWETfileURL); /* //if(wgetMirrorSite(ccWETfileURL)) { if(!DEBUG_MODE) { wgetMirrorSite(ccWETfileURL); } // TODO: take out debug_mode condition processDownloadedDir(ccWETfileURL); //} */ processWETfile(ccWETfileURL); return true; } public boolean processWETfile(String ccWETfileURL) { String zippedWETfileName = ccWETfileURL.substring(ccWETfileURL.lastIndexOf('/')+1); String unzippedWETfileName = zippedWETfileName.substring(0, zippedWETfileName.lastIndexOf(".gz")); File inZipFile = new File(SITES_DIR, zippedWETfileName); File WETfile = new File(SITES_DIR, unzippedWETfileName); if(WETfile.exists()) { log("Unzipped WET file " + WETfile + " already exists"); } else { if(inZipFile.exists()) { log("Not wgetting " + ccWETfileURL + " as " + inZipFile + " already exists"); } else if(!wgetWETfile(ccWETfileURL)) { return false; } // don't have the WET file yet. Get it from the zip file, which we know we should have by now boolean success = Utility.unzipFile(inZipFile, WETfile); log("Unzipped " + inZipFile + " to " + WETfile); // whether we succeeded or not, get rid of the zipped file: if(!inZipFile.delete()) { warn("Unable to delete zipped WET file: " + zippedWETfileName); } if(!success) { return false; } } // read in the giant WET file and return true; } /* // Run gunzip // To avoid making this linux specific, use Java to unzip, instead of running gunzip as process // https://www.mkyong.com/java/how-to-decompress-file-from-gzip-file/ public boolean unzipFile(File inZipFile, File outFile) { byte[] buffer = new byte[1024]; // try-with-resources will safely close streams/dispose resources on success or error and exceptions try ( GZIPInputStream gzis = new GZIPInputStream(new FileInputStream(inZipFile)); FileOutputStream out = new FileOutputStream(outFile); ) { int len; while ((len = gzis.read(buffer)) > 0) { out.write(buffer, 0, len); } //gzis.close(); //out.close(); log("Unzipped " + inZipFile + " to " + outFile); } catch(IOException ex) { error("Failed to unzip " + inZipFile); ex.printStackTrace(); return false; } return true; } */ // wget will be launched from the specified directory, SITES_DIR // Wget exit status/return codes: https://www.gnu.org/software/wget/manual/html_node/Exit-Status.html public boolean wgetWETfile(String ccWETfileURL) { // use SafeProcess and wget command in lib/config.properties // DONE: set up logging String wgetCmd = configProps.getProperty("wget.file.cmd"); if(wgetCmd == null || wgetCmd.equals("")) { System.err.println("Invalid or empty wget.file.cmd in config.properties"); return false; } // replace the placeholder in the wget cmd for the seed url wgetCmd = wgetCmd.replace("%%FILE_URL%%", ccWETfileURL); log("Will launch wget with the command: " + wgetCmd); String[] wgetCommandArgs = wgetCmd.split(" "); SafeProcess wgetProcess = new SafeProcess(wgetCommandArgs, null, new File(SITES_DIR)); SafeProcess.LineByLineHandler outLineHandler = new ProcessLineHandler(logger, SafeProcess.STDOUT); SafeProcess.LineByLineHandler errLineHandler = new ProcessLineHandler(logger, SafeProcess.STDERR); int success = wgetProcess.runProcess(outLineHandler, errLineHandler); if(success != 0) { System.err.println("Wget cmd \"" + wgetCmd + "\" returned unsuccessfully with the value \"" + success + "\""); return false; // TODO: why is return val = 6 ("Username/password authentication failure") for 3MB of downloads from www.waikato.ac.nz? } return true; } // wget will be launched from the specified directory, SITES_DIR // Wget exit status/return codes: https://www.gnu.org/software/wget/manual/html_node/Exit-Status.html public boolean wgetMirrorSite(String nzDomainURL) { // use SafeProcess and wget command in lib/config.properties // DONE: set up logging String wgetCmd = configProps.getProperty("wget.mirror.cmd"); if(wgetCmd == null || wgetCmd.equals("")) { System.err.println("Invalid or empty wget.mirror.cmd in config.properties"); return false; } // replace the placeholder in the wget cmd for the seed url wgetCmd = wgetCmd.replace("%%BASE_URL%%", nzDomainURL); log("Will launch wget with the command: " + wgetCmd); String[] wgetCommandArgs = wgetCmd.split(" "); SafeProcess wgetProcess = new SafeProcess(wgetCommandArgs, null, new File(SITES_DIR)); SafeProcess.LineByLineHandler outLineHandler = new ProcessLineHandler(logger, SafeProcess.STDOUT); SafeProcess.LineByLineHandler errLineHandler = new ProcessLineHandler(logger, SafeProcess.STDERR); int success = wgetProcess.runProcess(outLineHandler, errLineHandler); if(success != 0) { System.err.println("Wget cmd \"" + wgetCmd + "\" returned unsuccessfully with the value \"" + success + "\""); return false; // TODO: why is return val = 6 ("Username/password authentication failure") for 3MB of downloads from www.waikato.ac.nz? } return true; } // ----------- inner class for SafeProcess to Run Wget ---------- public class ProcessLineHandler extends SafeProcess.LineByLineHandler { //static Logger logger = Logger.getLogger(this.getClass().getName());; Logger logger; public ProcessLineHandler(Logger logger, int src) { super(src); // will set this.source to STDERR or STDOUT //logger = Logger.getLogger(this.getClass().getName()); this.logger = logger; } public void gotLine(String line) { // first non-null line // String form of this.source will be "stderr" or "stdout" String msg = SafeProcess.streamToString(source) + ": " + line; System.err.println(msg); //NZTLDProcessor.this.logger.info(msg); logger.info(msg); } public void gotException(Exception e) { String msg = "Error in reading process' " + SafeProcess.streamToString(source); //NZTLDProcessor.this.logger.error(msg, e); logger.error(msg, e); } } public boolean processDownloadedDir(String ccWETfileURL) { // recurse through the downloaded directory, then process each file File downloadedSiteDir = new File(SITES_DIR, ccWETfileURL); log("*** Will process download dir " + downloadedSiteDir); if(!downloadedSiteDir.exists()) { error("Expected downloaded site " + downloadedSiteDir + ", but it did not exist!"); return false; } /* if(!downloadedSiteDir.isDirectory()) { error("Downloaded site " + downloadedSiteDir + " is not a directory!"); return false; // TODO: or redo wget with "www." prefixed to URL??? } */ recursivelyProcessDir(ccWETfileURL, downloadedSiteDir); debug("Have set filefilter regex to exclude: " + ExtractableTextFileFilter.unacceptableRegex); return true; } private void recursivelyProcessDir(String ccWETfileURL, File file) { if(file.isDirectory()) { // recursive step // children array null iff IO Exception OR if file not a directory, // which it can't be since we tested isDirectory() on it just above File[] children = file.listFiles(extractableTxtFilesFilter); if(children == null) { error("IO error occurred when trying to list children of " + file); } else { // children array will be empty if 'file' dir was empty for(int i = 0; i < children.length; i++) { recursivelyProcessDir(ccWETfileURL, children[i]); } } } else { // base step processFile(ccWETfileURL, file); } } /* public boolean processDownloadedDir(String ccWETfileURL) { // recurse through the downloaded directory, then process each file File downloadedSiteDir = new File(SITES_DIR, ccWETfileURL); if(!downloadedSiteDir.exists()) { error("Expected downloaded site " + downloadedSiteDir + ", but it did not exist!"); return false; } File[] files = downloadedSiteDir.listFiles(extractableTxtFilesFilter); if(files == null) { error("Downloaded site dir " + downloadSiteDir + " was a file or an IO exception occurred"); return false; } recursivelyProcessDir(ccWETfileURL, files); return true; } private void recursivelyProcessDir(String ccWETfileURL, File[] children) { for(int i = 0; i < files.length; i++) { if(files[i].isDirectory()) { // recursive step // children array will be empty if dir empty // children null iff IOException OR if files[i] were a file, which it can't be since we tested isDirectory on it just above File[] children = files[i].listFiles(extractableTxtFilesFilter); if(children == null) { error("IO error trying to list children of " + files[i]); } else { recursivelyProcessDir(ccWETfileURL, children); } } else { // base step processFile(ccWETfileURL, files[i]); } } }*/ public boolean processFile(String ccWETfileURL, File file) { // skip if js, css, robots.txt // if no-extension or htm or html, call processHTMLFile() // else: skip for now, TODO: later call Tika on other file types // TODO: update db with url info, page location and para? log("*** Processing file " + file); return true; } private class ExtractableTextFileFilter implements FilenameFilter { // skip if js, css, robots.txt // For now also skip the image files. Later, with Tika, may be able to extract text from // images though OCR of those imgs representing text? We accept files with no file extension, // e.g. waikato uni has lots of files without extension that contain html. // Examples: Files like *.png but also *.png?v=QE50XMk2oY should be rejected public static final String unacceptableRegex = ".*\\.(json|js|css|svg|ico|jpe?g|png|gif|tif?)($|\\?.*)"; //Pattern unacceptableRegexPattern = Pattern.compile(unacceptableRegex); public boolean accept(File dir, String name) { return !(name.matches("robots.txt") || name.matches(unacceptableRegex)); //return !(name.matches("robots.txt") || name.matches("/\\.png/")); // doesn't work //return name.matches(".*\\.html$"); // works //return name.matches(".*\\.png($|\\?.*)"); // works } } public boolean processHTMLFile() { // Use JSoup to get paras // call processParas(text); return true; } public boolean processNonHTMLFile() { // Use Tika to get text // call processParas(text) return true; } public boolean processParas(String text) { // Split on (double) new line for paras? If too short, then add text from next para unless there's no more. Else add to previous? // for each para, call MaoriTextDetector.java return true; } // --------------------------- End inner class -------------------- public static void main(String[] args) { if(args.length <= 0) { System.err.println("ERROR: Must provide input file of unique nz website addresses"); System.exit(-1); } String infilePath = args[0]; System.err.println("Requested to process URLS file: " + infilePath); File infile = new File(infilePath); if(!infile.exists()) { System.err.println("ERROR: File " + infilePath + " did not exist"); System.exit(-1); } try { NZTLDProcessor processor = new NZTLDProcessor(infile); } catch(Exception e) { e.printStackTrace(); } } }