Changeset 33488 for gs3-extensions


Ignore:
Timestamp:
2019-09-17T14:48:36+12:00 (5 years ago)
Author:
ak19
Message:

new function createSeedURLsFiles() in WETProcessor that replaces the bash script bin/script/unique_mri_domains_from_cc.sh

Location:
gs3-extensions/maori-lang-detection
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/bin/script/unique_mri_domains_from_cc.sh

    r33471 r33488  
    6969   
    7070   
    71 done < <(cat sorted_MRI_urls.txt) 
     71done < <(cat sorted_MRI_urls.txt)
    7272
    7373echo "**************************************"
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java

    r33480 r33488  
    55import java.util.Properties;
    66import java.util.zip.GZIPInputStream;
     7import java.util.Iterator;
     8import java.util.Set;
     9import java.util.TreeSet;
    710
    811import org.apache.log4j.Logger;
     
    301304    }
    302305    }
     306
     307
     308    /**
     309     * Takes as input the keepURLs.txt file generated by running WETProcessor instances.
     310     * As output produces the URL seed list and regex-urlfilter text files required by nutch,
     311     * https://cwiki.apache.org/confluence/display/nutch/NutchTutorial
     312     */
     313    public static void createSeedURLsFiles(File urlsFile, File seedURLsFile, File urlFilterFile) {
     314    // Maintain Sets of unique domains and urls
     315    // TreeSet: by default, "the elements are ordered using their natural ordering"
     316    // (or by a Comparator provided at set creation time).
     317    // Whereas HashSet doesn't guarantee ordering.
     318    // So we get alphabetic sorting for free. And guaranteed log(n) for basic operations.
     319
     320    Set<String> domainsSet = new TreeSet<String>();
     321    Set<String> urlsSet = new TreeSet<String>();
     322
     323    final String FILTER_REGEX_PREFIX = "+https?://([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)*
     324   
     325    try (
     326         BufferedReader reader = new BufferedReader(new FileReader(urlsFile));
     327         ) {
     328       
     329        // read a URL at a time from urlsFile
     330        String url = null;
     331        String domain = null;
     332        while((url = reader.readLine()) != null) { // readLine removes newline separator
     333       
     334        // work out domain. This retains any www. or subdomain prefix:
     335        int startIndex = url.indexOf("//"); // http:// or https:// prefix
     336        startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
     337        domain = url.substring(startIndex);
     338        int endIndex = domain.indexOf("/");
     339        if(endIndex == -1) endIndex = domain.length();
     340        domain = domain.substring(0, endIndex);
     341
     342        //if(!domainsMap.containsKey(domain)) {
     343        urlsSet.add(url);
     344        domainsSet.add(domain);
     345        //}
     346        }
     347    } catch (IOException ioe) {
     348        ioe.printStackTrace();
     349        System.err.println("\n@@@@@@@@@ Error reading in urls from file " + urlsFile);
     350    }
     351   
     352    try (BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile))) {
     353        Iterator<String> i = urlsSet.iterator();
     354        while(i.hasNext()) {
     355        String url = i.next();
     356        seedURLsWriter.write(url + "\n");
     357        }
     358       
     359    } catch (IOException ioe) {
     360        ioe.printStackTrace();
     361        System.err.println("\n@@@@@@@@@ Error writing to either " + seedURLsFile + " or " + urlFilterFile);
     362    }
     363
     364    try (BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile))) {
     365        Iterator<String> i = domainsSet.iterator();
     366        // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/
     367        while(i.hasNext()) {
     368        String domain = i.next();
     369        domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";       
     370        urlFilterWriter.write(domain + "\n");
     371        }
     372       
     373    } catch (IOException ioe) {
     374        ioe.printStackTrace();
     375        System.err.println("\n@@@@@@@@@ Error writing to either " + seedURLsFile + " or " + urlFilterFile);
     376    }
     377    }
     378
    303379   
    304 
    305380    //public static int getRecordCount() { return recordCount; }
    306381   
     
    412487       
    413488    }
     489
     490    File seedURLsFile = new File(outFolder, "seedURLs.txt");
     491    File urlFilterFile = new File(outFolder, "regex-urlfilter.txt");
     492    WETProcessor.createSeedURLsFiles(WETProcessor.keepURLsFile, seedURLsFile, urlFilterFile);
    414493   
    415494    return;
Note: See TracChangeset for help on using the changeset viewer.