Changeset 33488

Show
Ignore:
Timestamp:
17.09.2019 14:48:36 (4 weeks ago)
Author:
ak19
Message:

new function createSeedURLsFiles() in WETProcessor that replaces the bash script bin/script/unique_mri_domains_from_cc.sh

Location:
gs3-extensions/maori-lang-detection
Files:
2 modified

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/bin/script/unique_mri_domains_from_cc.sh

    r33471 r33488  
    6969     
    7070     
    71 done < <(cat sorted_MRI_urls.txt)  
     71done < <(cat sorted_MRI_urls.txt) 
    7272 
    7373echo "**************************************" 
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java

    r33480 r33488  
    55import java.util.Properties; 
    66import java.util.zip.GZIPInputStream; 
     7import java.util.Iterator; 
     8import java.util.Set; 
     9import java.util.TreeSet; 
    710 
    811import org.apache.log4j.Logger; 
     
    301304    } 
    302305    } 
     306 
     307 
     308    /** 
     309     * Takes as input the keepURLs.txt file generated by running WETProcessor instances. 
     310     * As output produces the URL seed list and regex-urlfilter text files required by nutch, 
     311     * https://cwiki.apache.org/confluence/display/nutch/NutchTutorial 
     312     */ 
     313    public static void createSeedURLsFiles(File urlsFile, File seedURLsFile, File urlFilterFile) { 
     314    // Maintain Sets of unique domains and urls 
     315    // TreeSet: by default, "the elements are ordered using their natural ordering" 
     316    // (or by a Comparator provided at set creation time). 
     317    // Whereas HashSet doesn't guarantee ordering. 
     318    // So we get alphabetic sorting for free. And guaranteed log(n) for basic operations. 
     319 
     320    Set<String> domainsSet = new TreeSet<String>(); 
     321    Set<String> urlsSet = new TreeSet<String>(); 
     322 
     323    final String FILTER_REGEX_PREFIX = "+https?://([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)* 
     324     
     325    try ( 
     326         BufferedReader reader = new BufferedReader(new FileReader(urlsFile)); 
     327         ) { 
     328         
     329        // read a URL at a time from urlsFile 
     330        String url = null; 
     331        String domain = null; 
     332        while((url = reader.readLine()) != null) { // readLine removes newline separator 
     333         
     334        // work out domain. This retains any www. or subdomain prefix: 
     335        int startIndex = url.indexOf("//"); // http:// or https:// prefix 
     336        startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion 
     337        domain = url.substring(startIndex); 
     338        int endIndex = domain.indexOf("/"); 
     339        if(endIndex == -1) endIndex = domain.length(); 
     340        domain = domain.substring(0, endIndex); 
     341 
     342        //if(!domainsMap.containsKey(domain)) { 
     343        urlsSet.add(url); 
     344        domainsSet.add(domain); 
     345        //} 
     346        } 
     347    } catch (IOException ioe) { 
     348        ioe.printStackTrace(); 
     349        System.err.println("\n@@@@@@@@@ Error reading in urls from file " + urlsFile); 
     350    } 
     351     
     352    try (BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile))) { 
     353        Iterator<String> i = urlsSet.iterator(); 
     354        while(i.hasNext()) { 
     355        String url = i.next(); 
     356        seedURLsWriter.write(url + "\n"); 
     357        } 
     358         
     359    } catch (IOException ioe) { 
     360        ioe.printStackTrace(); 
     361        System.err.println("\n@@@@@@@@@ Error writing to either " + seedURLsFile + " or " + urlFilterFile); 
     362    } 
     363 
     364    try (BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile))) { 
     365        Iterator<String> i = domainsSet.iterator(); 
     366        // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/ 
     367        while(i.hasNext()) { 
     368        String domain = i.next(); 
     369        domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";         
     370        urlFilterWriter.write(domain + "\n"); 
     371        } 
     372         
     373    } catch (IOException ioe) { 
     374        ioe.printStackTrace(); 
     375        System.err.println("\n@@@@@@@@@ Error writing to either " + seedURLsFile + " or " + urlFilterFile); 
     376    } 
     377    } 
     378 
    303379     
    304  
    305380    //public static int getRecordCount() { return recordCount; } 
    306381     
     
    412487         
    413488    } 
     489 
     490    File seedURLsFile = new File(outFolder, "seedURLs.txt"); 
     491    File urlFilterFile = new File(outFolder, "regex-urlfilter.txt"); 
     492    WETProcessor.createSeedURLsFiles(WETProcessor.keepURLsFile, seedURLsFile, urlFilterFile); 
    414493     
    415494    return;