Changeset 33405

Show
Ignore:
Timestamp:
12.08.2019 20:37:44 (10 days ago)
Author:
ak19
Message:

Even though we're probably not going to use this code after all, I will commit my fixes and the current state. Got the location of the wget-downloaded site working, and also the file filter. Added recursive traversal of the downloaded folder to identify files to process.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NZTLDProcessor.java

    r33402 r33405  
    1111/** 
    1212 * Ensure you have OPENNLP_HOME set to apache-opennlp's full path. 
    13  *  
     13 * Then go into the src folder of this extension before compiling or running. 
    1414 *  
    1515 * Compile as: 
     
    2222{ 
    2323 
    24     static Logger logger = Logger.getLogger(org.greenstone.atea.NZTLDProcessor.class.getName()); 
    25      
    26     Properties configProps = null; 
    27  
    28     boolean DEBUG_MODE = true; 
    29  
     24    private static Logger logger = Logger.getLogger(org.greenstone.atea.NZTLDProcessor.class.getName()); 
     25     
     26    private Properties configProps = null; 
     27    private boolean DEBUG_MODE = true; 
     28 
     29    private final ExtractableTextFileFilter extractableTxtFilesFilter = new ExtractableTextFileFilter();     
     30    private final String sitesDir = "../dwn-sites"; 
    3031 
    3132    private void log(String msg) { 
     
    3435    } 
    3536 
     37    private void debug(String msg) { 
     38    System.err.println(msg); 
     39    logger.debug(msg); 
     40    } 
     41     
    3642    private void error(String msg) { 
    3743    System.err.println("ERROR: " + msg); 
     
    4551     
    4652    public NZTLDProcessor(File seedURLsFile) throws Exception {  
    47  
     53    log("In NZTLDProcessor constructor"); 
     54     
    4855    if(DEBUG_MODE) { 
    4956        warn("**** Debugger is turned ON!!!"); 
     
    109116    log("Processing seed URL: " + nzDomainURL); 
    110117 
    111     wgetURL(nzDomainURL); 
    112      
    113     return true; 
    114     } 
    115  
    116     // TODO: Still need to get wget to be launched from a specified directory (../dwn_sites). 
     118    //if(wgetURL(nzDomainURL)) { 
     119    if(!DEBUG_MODE) { wgetURL(nzDomainURL); } // TODO: take out debug_mode condition 
     120     
     121    processDownloadedDir(nzDomainURL); 
     122     
     123    //} 
     124    return true; 
     125    } 
     126 
     127    // wget will be launched from the specified directory, ../dwn_sites 
     128    // Wget exit status/return codes: https://www.gnu.org/software/wget/manual/html_node/Exit-Status.html 
    117129    public boolean wgetURL(String nzDomainURL) { 
    118130    // use SafeProcess and wget command in lib/config.properties 
     
    132144 
    133145    String[] wgetCommandArgs = wgetCmd.split(" "); 
    134     SafeProcess wgetProcess = new SafeProcess(wgetCommandArgs, null, new File("../dwn_sites")); 
     146    SafeProcess wgetProcess = new SafeProcess(wgetCommandArgs, null, new File(sitesDir)); 
    135147     
    136148    SafeProcess.LineByLineHandler outLineHandler = new ProcessLineHandler(logger, SafeProcess.STDOUT); 
     
    142154        System.err.println("Wget cmd \"" + wgetCmd  + "\" returned unsuccessfully with the value \"" + success + "\""); 
    143155        return false; 
    144     } 
    145      
    146     return true; 
    147     } 
    148  
    149     public boolean processDownloadedDir() { 
    150     // recursion then process each file 
    151  
    152     return true; 
    153     } 
    154  
    155     public boolean processFile() { 
    156     // skip if js, css, robots.txt 
    157     // if no-extension or htm or html, call processHTMLFile() 
    158     // else: skip for now, TODO: later call Tika on other file types 
    159  
    160     // TODO: update db with url info, page location and para? 
    161      
    162     return true; 
    163     } 
    164  
    165  
    166     public boolean processHTMLFile() { 
    167     // Use JSoup to get paras 
    168  
    169     // call processParas(text); 
    170  
    171     return true; 
    172     } 
    173  
    174     public boolean processNonHTMLFile() { 
    175     // Use Tika to get text  
    176  
    177     // call processParas(text) 
    178  
    179     return true; 
    180     } 
    181  
    182     public boolean processParas(String text) { 
    183  
    184     // Split on (double) new line for paras? If too short, then add text from next para unless there's no more. Else add to previous? 
    185      
    186     // for each para, call MaoriTextDetector.java 
    187  
    188     return true; 
    189     } 
     156            // TODO: why is return val = 6 ("Username/password authentication failure") for 3MB of downloads from www.waikato.ac.nz? 
     157    } 
     158     
     159    return true; 
     160    } 
     161 
    190162     
    191163    // ----------- inner class for SafeProcess to Run Wget ---------- 
     
    204176    public void gotLine(String line) { // first non-null line 
    205177 
    206         // String form of "src" will be "stderr" or "stdout" 
     178        // String form of this.source will be "stderr" or "stdout" 
    207179        String msg = SafeProcess.streamToString(source) + ": " + line; 
    208180        System.err.println(msg); 
     
    211183    } 
    212184    public void gotException(Exception e) { 
    213         //NZTLDProcessor.this.logger.error("Error in reading process' " + SafeProcess.streamToString(source), e); 
    214         logger.error("Error in reading process' " + SafeProcess.streamToString(source), e); 
    215     } 
    216  
     185        String msg = "Error in reading process' " + SafeProcess.streamToString(source); 
     186        //NZTLDProcessor.this.logger.error(msg, e); 
     187        logger.error(msg, e); 
     188    } 
     189 
     190    } 
     191 
     192 
     193    public boolean processDownloadedDir(String nzDomainURL) { 
     194    // recurse through the downloaded directory, then process each file 
     195 
     196    File downloadedSiteDir = new File(sitesDir, nzDomainURL); 
     197    log("*** Will process download dir " + downloadedSiteDir); 
     198     
     199    if(!downloadedSiteDir.exists()) { 
     200        error("*** Expected downloaded site " + downloadedSiteDir + ", but it did not exist!"); 
     201        return false; 
     202    } 
     203    /* 
     204    if(!downloadedSiteDir.isDirectory()) { 
     205        error("*** Downloaded site " + downloadedSiteDir + " is not a directory!"); 
     206        return false; // TODO: or redo wget with "www." prefixed to URL??? 
     207    } 
     208    */ 
     209    recursivelyProcessDir(nzDomainURL, downloadedSiteDir); 
     210 
     211    debug("Have set filefilter regex to exclude: " + ExtractableTextFileFilter.unacceptableRegex); 
     212     
     213    return true; 
     214    } 
     215 
     216    private void recursivelyProcessDir(String nzDomainURL, File file) { 
     217 
     218    if(file.isDirectory()) { // recursive step 
     219        // children array null iff IO Exception OR if file not a directory, 
     220        // which it can't be since we tested isDirectory() on it just above 
     221        File[] children = file.listFiles(extractableTxtFilesFilter); 
     222        if(children == null) { 
     223        error("IO error occurred when trying to list children of " + file); 
     224        } else { 
     225        // children array will be empty if 'file' dir was empty 
     226        for(int i = 0; i < children.length; i++) { 
     227            recursivelyProcessDir(nzDomainURL, children[i]); 
     228        } 
     229        } 
     230         
     231    } else { // base step 
     232        processFile(nzDomainURL, file);      
     233    } 
     234    }    
     235     
     236    /* 
     237    public boolean processDownloadedDir(String nzDomainURL) { 
     238    // recurse through the downloaded directory, then process each file 
     239 
     240    File downloadedSiteDir = new File(sitesDir, nzDomainURL); 
     241    if(!downloadedSiteDir.exists()) { 
     242        error("*** Expected downloaded site " + downloadedSiteDir + ", but it did not exist!"); 
     243        return false; 
     244    } 
     245 
     246    File[] files = downloadedSiteDir.listFiles(extractableTxtFilesFilter); 
     247    if(files == null) { 
     248        error("Downloaded site dir " + downloadedSiteDir + " was a file or an IO exception occurred"); 
     249        return false; 
     250    } 
     251    recursivelyProcessDir(nzDomainURL, files); 
     252     
     253    return true; 
     254    } 
     255 
     256    private void recursivelyProcessDir(String nzDomainURL, File[] children) { 
     257    for(int i = 0; i < files.length; i++) { 
     258        if(files[i].isDirectory()) { // recursive step 
     259 
     260        // children array will be empty if dir empty 
     261        // children null iff IOException OR if files[i] were a file, which it can't be since we tested isDirectory on it just above 
     262        File[] children = files[i].listFiles(extractableTxtFilesFilter); 
     263        if(children == null) { 
     264            error("IO error trying to list children of " + files[i]); 
     265        } else { 
     266            recursivelyProcessDir(nzDomainURL, children); 
     267        } 
     268        } 
     269        else { // base step 
     270        processFile(nzDomainURL, files[i]); 
     271        } 
     272    } 
     273    }*/ 
     274     
     275    public boolean processFile(String nzDomainURL, File file) { 
     276    // skip if js, css, robots.txt 
     277    // if no-extension or htm or html, call processHTMLFile() 
     278    // else: skip for now, TODO: later call Tika on other file types 
     279 
     280    // TODO: update db with url info, page location and para? 
     281 
     282 
     283    log("*** Processing file " + file); 
     284     
     285    return true; 
     286    } 
     287 
     288    private class ExtractableTextFileFilter implements FilenameFilter { 
     289    // skip if js, css, robots.txt 
     290     
     291    // For now also skip the image files. Later, with Tika, may be able to extract text from 
     292    // images though OCR of those imgs representing text? We accept files with no file extension, 
     293    // e.g. waikato uni has lots of files without extension that contain html. 
     294    // Examples: Files like *.png but also *.png?v=QE50XMk2oY should be rejected 
     296    public static final String unacceptableRegex = ".*\\.(json|js|css|svg|ico|jpe?g|png|gif|tiff?)($|\\?.*)"; 
     296    //Pattern unacceptableRegexPattern = Pattern.compile(unacceptableRegex); 
     297 
     298     
     299    public boolean accept(File dir, String name) { 
     300         
     301        return !(name.matches("robots.txt") || name.matches(unacceptableRegex)); 
     302        //return !(name.matches("robots.txt") || name.matches("/\\.png/")); // doesn't work 
     303        //return name.matches(".*\\.html$"); // works 
     304        //return name.matches(".*\\.png($|\\?.*)"); // works 
     305    } 
     306    } 
     307 
     308     
     309    public boolean processHTMLFile() { 
     310    // Use JSoup to get paras 
     311 
     312    // call processParas(text); 
     313 
     314    return true; 
     315    } 
     316 
     317    public boolean processNonHTMLFile() { 
     318    // Use Tika to get text  
     319 
     320    // call processParas(text) 
     321 
     322    return true; 
     323    } 
     324 
     325    public boolean processParas(String text) { 
     326 
     327    // Split on (double) new line for paras? If too short, then add text from next para unless there's no more. Else add to previous? 
     328     
     329    // for each para, call MaoriTextDetector.java 
     330 
     331    return true; 
    217332    } 
    218333    // --------------------------- End inner class --------------------