Changeset 33405


Ignore:
Timestamp:
2019-08-12T20:37:44+12:00 (5 years ago)
Author:
ak19
Message:

Even though we're probably not going to use this code after all, I will commit my fixes and the current state. Got the location of the wget-downloaded site working, and also the file filter. Added recursive traversal of the downloaded folder to identify files to process.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NZTLDProcessor.java

    r33402 r33405  
    1111/**
    1212 * Ensure you have OPENNLP_HOME set to apache-opennlp's full path.
    13  *
     13 * Then go into the src folder of this extension before compiling or running.
    1414 *
    1515 * Compile as:
     
    2222{
    2323
    24     static Logger logger = Logger.getLogger(org.greenstone.atea.NZTLDProcessor.class.getName());
    25    
    26     Properties configProps = null;
    27 
    28     boolean DEBUG_MODE = true;
    29 
     24    private static Logger logger = Logger.getLogger(org.greenstone.atea.NZTLDProcessor.class.getName());
     25   
     26    private Properties configProps = null;
     27    private boolean DEBUG_MODE = true;
     28
     29    private final ExtractableTextFileFilter extractableTxtFilesFilter = new ExtractableTextFileFilter();   
     30    private final String sitesDir = "../dwn-sites";
    3031
    3132    private void log(String msg) {
     
    3435    }
    3536
     37    private void debug(String msg) {
     38    System.err.println(msg);
     39    logger.debug(msg);
     40    }
     41   
    3642    private void error(String msg) {
    3743    System.err.println("ERROR: " + msg);
     
    4551   
    4652    public NZTLDProcessor(File seedURLsFile) throws Exception {
    47 
     53    log("In NZTLDProcessor constructor");
     54   
    4855    if(DEBUG_MODE) {
    4956        warn("**** Debugger is turned ON!!!");
     
    109116    log("Processing seed URL: " + nzDomainURL);
    110117
    111     wgetURL(nzDomainURL);
    112    
    113     return true;
    114     }
    115 
    116     // TODO: Still need to get wget to be launched from a specified directory (../dwn_sites).
     118    //if(wgetURL(nzDomainURL)) {
     119    if(!DEBUG_MODE) { wgetURL(nzDomainURL); } // TODO: take out debug_mode condition
     120   
     121    processDownloadedDir(nzDomainURL);
     122   
     123    //}
     124    return true;
     125    }
     126
     127    // wget will be launched from the specified directory, ../dwn_sites
     128    // Wget exit status/return codes: https://www.gnu.org/software/wget/manual/html_node/Exit-Status.html
    117129    public boolean wgetURL(String nzDomainURL) {
    118130    // use SafeProcess and wget command in lib/config.properties
     
    132144
    133145    String[] wgetCommandArgs = wgetCmd.split(" ");
    134     SafeProcess wgetProcess = new SafeProcess(wgetCommandArgs, null, new File("../dwn_sites"));
     146    SafeProcess wgetProcess = new SafeProcess(wgetCommandArgs, null, new File(sitesDir));
    135147   
    136148    SafeProcess.LineByLineHandler outLineHandler = new ProcessLineHandler(logger, SafeProcess.STDOUT);
     
    142154        System.err.println("Wget cmd \"" + wgetCmd  + "\" returned unsuccessfully with the value \"" + success + "\"");
    143155        return false;
    144     }
    145    
    146     return true;
    147     }
    148 
    149     public boolean processDownloadedDir() {
    150     // recursion then process each file
    151 
    152     return true;
    153     }
    154 
    155     public boolean processFile() {
    156     // skip if js, css, robots.txt
    157     // if no-extension or htm or html, call processHTMLFile()
    158     // else: skip for now, TODO: later call Tika on other file types
    159 
    160     // TODO: update db with url info, page location and para?
    161    
    162     return true;
    163     }
    164 
    165 
    166     public boolean processHTMLFile() {
    167     // Use JSoup to get paras
    168 
    169     // call processParas(text);
    170 
    171     return true;
    172     }
    173 
    174     public boolean processNonHTMLFile() {
    175     // Use Tika to get text
    176 
    177     // call processParas(text)
    178 
    179     return true;
    180     }
    181 
    182     public boolean processParas(String text) {
    183 
    184     // Split on (double) new line for paras? If too short, then add text from next para unless there's no more. Else add to previous?
    185    
    186     // for each para, call MaoriTextDetector.java
    187 
    188     return true;
    189     }
     156            // TODO: why is return val = 6 ("Username/password authentication failure") for 3MB of downloads from www.waikato.ac.nz?
     157    }
     158   
     159    return true;
     160    }
     161
    190162   
    191163    // ----------- inner class for SafeProcess to Run Wget ----------
     
    204176    public void gotLine(String line) { // first non-null line
    205177
    206         // String form of "src" will be "stderr" or "stdout"
     178        // String form of this.source will be "stderr" or "stdout"
    207179        String msg = SafeProcess.streamToString(source) + ": " + line;
    208180        System.err.println(msg);
     
    211183    }
    212184    public void gotException(Exception e) {
    213         //NZTLDProcessor.this.logger.error("Error in reading process' " + SafeProcess.streamToString(source), e);
    214         logger.error("Error in reading process' " + SafeProcess.streamToString(source), e);
    215     }
    216 
     185        String msg = "Error in reading process' " + SafeProcess.streamToString(source);
     186        //NZTLDProcessor.this.logger.error(msg, e);
     187        logger.error(msg, e);
     188    }
     189
     190    }
     191
     192
     193    public boolean processDownloadedDir(String nzDomainURL) {
     194    // recurse through the downloaded directory, then process each file
     195
     196    File downloadedSiteDir = new File(sitesDir, nzDomainURL);
     197    log("*** Will process download dir " + downloadedSiteDir);
     198   
     199    if(!downloadedSiteDir.exists()) {
     200        error("*** Expected downloaded site " + downloadedSiteDir + ", but it did not exist!");
     201        return false;
     202    }
     203    /*
     204    if(!downloadedSiteDir.isDirectory()) {
     205        error("*** Downloaded site " + downloadedSiteDir + " is not a directory!");
     206        return false; // TODO: or redo wget with "www." prefixed to URL???
     207    }
     208    */
     209    recursivelyProcessDir(nzDomainURL, downloadedSiteDir);
     210
     211    debug("Have set filefilter regex to exclude: " + ExtractableTextFileFilter.unacceptableRegex);
     212   
     213    return true;
     214    }
     215
     216    private void recursivelyProcessDir(String nzDomainURL, File file) {
     217
     218    if(file.isDirectory()) { // recursive step
     219        // children array null iff IO Exception OR if file not a directory,
     220        // which it can't be since we tested isDirectory() on it just above
     221        File[] children = file.listFiles(extractableTxtFilesFilter);
     222        if(children == null) {
     223        error("IO error occurred when trying to list children of " + file);
     224        } else {
     225        // children array will be empty if 'file' dir was empty
     226        for(int i = 0; i < children.length; i++) {
     227            recursivelyProcessDir(nzDomainURL, children[i]);
     228        }
     229        }
     230       
     231    } else { // base step
     232        processFile(nzDomainURL, file);     
     233    }
     234    }   
     235   
     236    /*
     237    public boolean processDownloadedDir(String nzDomainURL) {
     238    // recurse through the downloaded directory, then process each file
     239
     240    File downloadedSiteDir = new File(sitesDir, nzDomainURL);
     241    if(!downloadedSiteDir.exists()) {
     242        error("*** Expected downloaded site " + downloadedSiteDir + ", but it did not exist!");
     243        return false;
     244    }
     245
     246    File[] files = downloadedSiteDir.listFiles(extractableTxtFilesFilter);
     247    if(files == null) {
     248        error("Downloaded site dir " + downloadSiteDir + " was a file or an IO exception occurred");
     249        return false;
     250    }
     251    recursivelyProcessDir(nzDomainURL, files);
     252   
     253    return true;
     254    }
     255
     256    private void recursivelyProcessDir(String nzDomainURL, File[] children) {
     257    for(int i = 0; i < files.length; i++) {
     258        if(files[i].isDirectory()) { // recursive step
     259
     260        // children array will be empty if dir empty
     261        // children null iff IOException OR if files[i] were a file, which it can't be since we tested isDirectory on it just above
     262        File[] children = files[i].listFiles(extractableTxtFilesFilter);
     263        if(children == null) {
     264            error("IO error trying to list children of " + files[i]);
     265        } else {
     266            recursivelyProcessDir(nzDomainURL, children);
     267        }
     268        }
     269        else { // base step
     270        processFile(nzDomainURL, files[i]);
     271        }
     272    }
     273    }*/
     274   
     275    public boolean processFile(String nzDomainURL, File file) {
     276    // skip if js, css, robots.txt
     277    // if no-extension or htm or html, call processHTMLFile()
     278    // else: skip for now, TODO: later call Tika on other file types
     279
     280    // TODO: update db with url info, page location and para?
     281
     282
     283    log("*** Processing file " + file);
     284   
     285    return true;
     286    }
     287
     288    private class ExtractableTextFileFilter implements FilenameFilter {
     289    // skip if js, css, robots.txt
     290   
     291    // For now also skip the image files. Later, with Tika, may be able to extract text from
     292    // images though OCR of those imgs representing text? We accept files with no file extension,
     293    // e.g. waikato uni has lots of files without extension that contain html.
     294    // Examples: Files like *.png but also *.png?v=QE50XMk2oY should be rejected
     295    public static final String unacceptableRegex = ".*\\.(json|js|css|svg|ico|jpe?g|png|gif|tif?)($|\\?.*)";
     296    //Pattern unacceptableRegexPattern = Pattern.compile(unacceptableRegex);
     297
     298   
     299    public boolean accept(File dir, String name) {
     300       
     301        return !(name.matches("robots.txt") || name.matches(unacceptableRegex));
     302        //return !(name.matches("robots.txt") || name.matches("/\\.png/")); // doesn't work
     303        //return name.matches(".*\\.html$"); // works
     304        //return name.matches(".*\\.png($|\\?.*)"); // works
     305    }
     306    }
     307
     308   
     309    public boolean processHTMLFile() {
     310    // Use JSoup to get paras
     311
     312    // call processParas(text);
     313
     314    return true;
     315    }
     316
     317    public boolean processNonHTMLFile() {
     318    // Use Tika to get text
     319
     320    // call processParas(text)
     321
     322    return true;
     323    }
     324
     325    public boolean processParas(String text) {
     326
     327    // Split on (double) new line for paras? If too short, then add text from next para unless there's no more. Else add to previous?
     328   
     329    // for each para, call MaoriTextDetector.java
     330
     331    return true;
    217332    }
    218333    // --------------------------- End inner class --------------------
Note: See TracChangeset for help on using the changeset viewer.