Changeset 33405

Show
Ignore:
Timestamp:
12.08.2019 20:37:44 (10 days ago)
Author:
ak19
Message:

Even though we're probably not going to use this code after all, I will commit my fixes and the current state. Got the location of the wget-downloaded site working, and also the file filter. Added recursive traversal of the downloaded folder to identify files to process.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/NZTLDProcessor.java

    r33402 r33405  
    1111/** 
    1212 * Ensure you have OPENNLP_HOME set to apache-opennlp's full path. 
    13  *  
     13 * Then go into the src folder of this extension before compiling or running. 
    1414 *  
    1515 * Compile as: 
     
    2222{ 
    2323 
    24     static Logger logger = Logger.getLogger(org.greenstone.atea.NZTLDProcessor.class.getName()); 
    25      
    26     Properties configProps = null; 
    27  
    28     boolean DEBUG_MODE = true; 
    29  
     24    private static Logger logger = Logger.getLogger(org.greenstone.atea.NZTLDProcessor.class.getName()); 
     25     
     26    private Properties configProps = null; 
     27    private boolean DEBUG_MODE = true; 
     28 
     29    private final ExtractableTextFileFilter extractableTxtFilesFilter = new ExtractableTextFileFilter();     
     30    private final String sitesDir = "../dwn-sites"; 
    3031 
    3132    private void log(String msg) { 
     
    3435    } 
    3536 
     37    private void debug(String msg) { 
     38    System.err.println(msg); 
     39    logger.debug(msg); 
     40    } 
     41     
    3642    private void error(String msg) { 
    3743    System.err.println("ERROR: " + msg); 
     
    4551     
    4652    public NZTLDProcessor(File seedURLsFile) throws Exception {  
    47  
     53    log("In NZTLDProcessor constructor"); 
     54     
    4855    if(DEBUG_MODE) { 
    4956        warn("**** Debugger is turned ON!!!"); 
     
    109116    log("Processing seed URL: " + nzDomainURL); 
    110117 
    111     wgetURL(nzDomainURL); 
    112      
    113     return true; 
    114     } 
    115  
    116     // TODO: Still need to get wget to be launched from a specified directory (../dwn_sites). 
     118    //if(wgetURL(nzDomainURL)) { 
     119    if(!DEBUG_MODE) { wgetURL(nzDomainURL); } // TODO: take out debug_mode condition 
     120     
     121    processDownloadedDir(nzDomainURL); 
     122     
     123    //} 
     124    return true; 
     125    } 
     126 
     127    // wget will be launched from the specified directory, ../dwn_sites 
     128    // Wget exit status/return codes: https://www.gnu.org/software/wget/manual/html_node/Exit-Status.html 
    117129    public boolean wgetURL(String nzDomainURL) { 
    118130    // use SafeProcess and wget command in lib/config.properties 
     
    132144 
    133145    String[] wgetCommandArgs = wgetCmd.split(" "); 
    134     SafeProcess wgetProcess = new SafeProcess(wgetCommandArgs, null, new File("../dwn_sites")); 
     146    SafeProcess wgetProcess = new SafeProcess(wgetCommandArgs, null, new File(sitesDir)); 
    135147     
    136148    SafeProcess.LineByLineHandler outLineHandler = new ProcessLineHandler(logger, SafeProcess.STDOUT); 
     
    142154        System.err.println("Wget cmd \"" + wgetCmd  + "\" returned unsuccessfully with the value \"" + success + "\""); 
    143155        return false; 
    144     } 
    145      
    146     return true; 
    147     } 
    148  
    149     public boolean processDownloadedDir() { 
    150     // recursion then process each file 
    151  
    152     return true; 
    153     } 
    154  
    155     public boolean processFile() { 
    156     // skip if js, css, robots.txt 
    157     // if no-extension or htm or html, call processHTMLFile() 
    158     // else: skip for now, TODO: later call Tika on other file types 
    159  
    160     // TODO: update db with url info, page location and para? 
    161      
    162     return true; 
    163     } 
    164  
    165  
    166     public boolean processHTMLFile() { 
    167     // Use JSoup to get paras 
    168  
    169     // call processParas(text); 
    170  
    171     return true; 
    172     } 
    173  
    174     public boolean processNonHTMLFile() { 
    175     // Use Tika to get text  
    176  
    177     // call processParas(text) 
    178  
    179     return true; 
    180     } 
    181  
    182     public boolean processParas(String text) { 
    183  
    184     // Split on (double) new line for paras? If too short, then add text from next para unless there's no more. Else add to previous? 
    185      
    186     // for each para, call MaoriTextDetector.java 
    187  
    188     return true; 
    189     } 
     156            // TODO: why is return val = 6 ("Username/password authentication failure") for 3MB of downloads from www.waikato.ac.nz? 
     157    } 
     158     
     159    return true; 
     160    } 
     161 
    190162     
    191163    // ----------- inner class for SafeProcess to Run Wget ---------- 
     
    204176    public void gotLine(String line) { // first non-null line 
    205177 
    206         // String form of "src" will be "stderr" or "stdout" 
     178        // String form of this.source will be "stderr" or "stdout" 
    207179        String msg = SafeProcess.streamToString(source) + ": " + line; 
    208180        System.err.println(msg); 
     
    211183    } 
    212184    public void gotException(Exception e) { 
    213         //NZTLDProcessor.this.logger.error("Error in reading process' " + SafeProcess.streamToString(source), e); 
    214         logger.error("Error in reading process' " + SafeProcess.streamToString(source), e); 
    215     } 
    216  
     185        String msg = "Error in reading process' " + SafeProcess.streamToString(source); 
     186        //NZTLDProcessor.this.logger.error(msg, e); 
     187        logger.error(msg, e); 
     188    } 
     189 
     190    } 
     191 
     192 
     193    public boolean processDownloadedDir(String nzDomainURL) { 
     194    // recurse through the downloaded directory, then process each file 
     195 
     196    File downloadedSiteDir = new File(sitesDir, nzDomainURL); 
     197    log("*** Will process download dir " + downloadedSiteDir); 
     198     
     199    if(!downloadedSiteDir.exists()) { 
     200        error("*** Expected downloaded site " + downloadedSiteDir + ", but it did not exist!"); 
     201        return false; 
     202    } 
     203    /* 
     204    if(!downloadedSiteDir.isDirectory()) { 
     205        error("*** Downloaded site " + downloadedSiteDir + " is not a directory!"); 
     206        return false; // TODO: or redo wget with "www." prefixed to URL??? 
     207    } 
     208    */ 
     209    recursivelyProcessDir(nzDomainURL, downloadedSiteDir); 
     210 
     211    debug("Have set filefilter regex to exclude: " + ExtractableTextFileFilter.unacceptableRegex); 
     212     
     213    return true; 
     214    } 
     215 
     216    private void recursivelyProcessDir(String nzDomainURL, File file) { 
     217 
     218    if(file.isDirectory()) { // recursive step 
     219        // children array null iff IO Exception OR if file not a directory, 
     220        // which it can't be since we tested isDirectory() on it just above 
     221        File[] children = file.listFiles(extractableTxtFilesFilter); 
     222        if(children == null) { 
     223        error("IO error occurred when trying to list children of " + file); 
     224        } else { 
     225        // children array will be empty if 'file' dir was empty 
     226        for(int i = 0; i < children.length; i++) { 
     227            recursivelyProcessDir(nzDomainURL, children[i]); 
     228        } 
     229        } 
     230         
     231    } else { // base step 
     232        processFile(nzDomainURL, file);      
     233    } 
     234    }    
     235     
     236    /* 
     237    public boolean processDownloadedDir(String nzDomainURL) { 
     238    // recurse through the downloaded directory, then process each file 
     239 
     240    File downloadedSiteDir = new File(sitesDir, nzDomainURL); 
     241    if(!downloadedSiteDir.exists()) { 
     242        error("*** Expected downloaded site " + downloadedSiteDir + ", but it did not exist!"); 
     243        return false; 
     244    } 
     245 
     246    File[] files = downloadedSiteDir.listFiles(extractableTxtFilesFilter); 
     247    if(files == null) { 
     248        error("Downloaded site dir " + downloadedSiteDir + " was a file or an IO exception occurred"); 
     249        return false; 
     250    } 
     251    recursivelyProcessDir(nzDomainURL, files); 
     252     
     253    return true; 
     254    } 
     255 
     256    private void recursivelyProcessDir(String nzDomainURL, File[] children) { 
     257    for(int i = 0; i < files.length; i++) { 
     258        if(files[i].isDirectory()) { // recursive step 
     259 
     260        // children array will be empty if dir empty 
     261        // children null iff IOException OR if files[i] were a file, which it can't be since we tested isDirectory on it just above 
     262        File[] children = files[i].listFiles(extractableTxtFilesFilter); 
     263        if(children == null) { 
     264            error("IO error trying to list children of " + files[i]); 
     265        } else { 
     266            recursivelyProcessDir(nzDomainURL, children); 
     267        } 
     268        } 
     269        else { // base step 
     270        processFile(nzDomainURL, files[i]); 
     271        } 
     272    } 
     273    }*/ 
     274     
     275    public boolean processFile(String nzDomainURL, File file) { 
     276    // skip if js, css, robots.txt 
     277    // if no-extension or htm or html, call processHTMLFile() 
     278    // else: skip for now, TODO: later call Tika on other file types 
     279 
     280    // TODO: update db with url info, page location and para? 
     281 
     282 
     283    log("*** Processing file " + file); 
     284     
     285    return true; 
     286    } 
     287 
     288    private class ExtractableTextFileFilter implements FilenameFilter { 
     289    // skip if js, css, robots.txt 
     290     
     291    // For now also skip the image files. Later, with Tika, may be able to extract text from 
     292    // images though OCR of those imgs representing text? We accept files with no file extension, 
     293    // e.g. waikato uni has lots of files without extension that contain html. 
     294    // Examples: Files like *.png but also *.png?v=QE50XMk2oY should be rejected 
     296    public static final String unacceptableRegex = ".*\\.(json|js|css|svg|ico|jpe?g|png|gif|tiff?)($|\\?.*)"; 
     296    //Pattern unacceptableRegexPattern = Pattern.compile(unacceptableRegex); 
     297 
     298     
     299    public boolean accept(File dir, String name) { 
     300         
     301        return !(name.matches("robots.txt") || name.matches(unacceptableRegex)); 
     302        //return !(name.matches("robots.txt") || name.matches("/\\.png/")); // doesn't work 
     303        //return name.matches(".*\\.html$"); // works 
     304        //return name.matches(".*\\.png($|\\?.*)"); // works 
     305    } 
     306    } 
     307 
     308     
     309    public boolean processHTMLFile() { 
     310    // Use JSoup to get paras 
     311 
     312    // call processParas(text); 
     313 
     314    return true; 
     315    } 
     316 
     317    public boolean processNonHTMLFile() { 
     318    // Use Tika to get text  
     319 
     320    // call processParas(text) 
     321 
     322    return true; 
     323    } 
     324 
     325    public boolean processParas(String text) { 
     326 
     327    // Split on (double) new line for paras? If too short, then add text from next para unless there's no more. Else add to previous? 
     328     
     329    // for each para, call MaoriTextDetector.java 
     330 
     331    return true; 
    217332    } 
    218333    // --------------------------- End inner class --------------------