Changeset 33562 for gs3-extensions


Timestamp:
2019-10-11T21:52:40+13:00
Author:
ak19
Message:
  1. The sites-too-big-to-exhaustively-crawl.txt file is now a CSV file in a semi-custom format, and the Java code now uses the Apache Commons CSV jar (v1.7 for Java 8) to parse its contents.
  2. Tidied up the code to reuse a single ClassLoader reference.
Location:
gs3-extensions/maori-lang-detection
Files:
3 added
2 edited

  • gs3-extensions/maori-lang-detection/conf/sites-too-big-to-exhaustively-crawl.txt

--- conf/sites-too-big-to-exhaustively-crawl.txt (r33561)
+++ conf/sites-too-big-to-exhaustively-crawl.txt (r33562)
@@ -13,8 +13,9 @@
 # FORMAT OF THIS FILE'S CONTENTS:
 #    <topsite-base-url>,<value>
-# where <value> can be empty or one of SUBDOMAIN-COPY, SINGLEPAGE, <url-form-without-protocol>
+# where <value> can or is one of
+#    empty, SUBDOMAIN-COPY, FOLLOW-LINKS-WITHIN-TOPSITE, SINGLEPAGE, <url-form-without-protocol>
 #
-#   - if value is empty: if seedurl contains topsite-base-url, the seedurl will go into the file
-#     unprocessed-topsite-matches.txt and the site/page won't be crawled.
+#   - if value is left empty: if seedurl contains topsite-base-url, the seedurl will go into the
+#     file unprocessed-topsite-matches.txt and the site/page won't be crawled.
 #     The user will be notified to inspect the file unprocessed-topsite-matches.txt.
 #   - SINGLEPAGE: if seedurl matches topsite-base-url, then only download the page at that seedurl.
@@ -45,15 +46,15 @@
 #     crawl to just mi.wikipedia.org.
 #     Remember to leave out any protocol <from url-form-without-protocol>.
-
-# column 3: whether nutch should do fetch all or not
-# column 4: number of crawl iterations
+#
+# TODO If useful:
+#   column 3: whether nutch should do fetch all or not
+#   column 4: number of crawl iterations
 
 # docs.google.com is a special case: not all pages are public and any interlinking is likely to
-# be intentional. But SUBDOMAIN-COPY does not work: as seedURL's domain becomes docs.google.com
-# which, when combined with SUBDOMAIN-COPY, the Java code treats as a special case so that
-# any seedURL on docs.google.com ends up pushed out into the "unprocessed....txt" text file.
-#docs.google.com,SUBDOMAIN-COPY
+# be intentional. Grab all linked pages, for link depth set with nutch's crawl, as long as the
+# links are within the given topsite-base-url
 docs.google.com,FOLLOW-LINKS-WITHIN-TOPSITE
 
+# Just crawl a single page for these:
 drive.google.com,SINGLEPAGE
 forms.office.com,SINGLEPAGE
@@ -63,5 +64,5 @@
 # Special case of yale.edu: its Rapa-Nui pages are on blacklist, but we want this page + its photos
 # The page's containing folder is whitelisted in case the photos are there.
-korora.econ.yale.edu,,SINGLEPAGE
+korora.econ.yale.edu,SINGLEPAGE
 
 000webhost.com
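
For reference, the sketch below is not part of this changeset; the class name and the inline sample rows are made up for illustration. It shows how rows in the format above behave under Apache Commons CSV 1.7 with a comment marker and surrounding-space handling like the changeset's customised CSVFormat: a bare <topsite-base-url> line yields a one-field record, while a <topsite-base-url>,<value> line yields two fields, which is how the Java change maps a missing value to the empty string.

// Minimal sketch (not the committed CCWETProcessor code): parses sample rows in the
// semi-custom format above with Apache Commons CSV 1.7, assumed to be on the classpath.
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;

public class TopSitesCSVSketch {
    public static void main(String[] args) throws IOException {
        // Sample rows in the file's format: a comment, a bare topsite, and topsite,value pairs.
        String sample =
              "# comment lines are skipped\n"
            + "000webhost.com\n"
            + "docs.google.com,FOLLOW-LINKS-WITHIN-TOPSITE\n"
            + "drive.google.com , SINGLEPAGE\n";

        CSVFormat format = CSVFormat.DEFAULT
            .withCommentMarker('#')          // lines starting with # are treated as comments
            .withIgnoreSurroundingSpaces();  // spaces around each field are trimmed

        Map<String, String> topSitesMap = new HashMap<>();
        try (CSVParser parser = CSVParser.parse(sample, format)) {
            for (CSVRecord record : parser) {
                String topsite = record.get(0);
                // A record with only one field means "no special handling value"
                String value = (record.size() >= 2) ? record.get(1) : "";
                topSitesMap.put(topsite, value);
            }
        }
        // Prints the three topsite -> value entries (HashMap order not guaranteed), e.g.
        // docs.google.com=FOLLOW-LINKS-WITHIN-TOPSITE, drive.google.com=SINGLEPAGE, 000webhost.com=
        System.out.println(topSitesMap);
    }
}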
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java

--- src/org/greenstone/atea/CCWETProcessor.java (r33561)
+++ src/org/greenstone/atea/CCWETProcessor.java (r33562)
@@ -3,4 +3,5 @@
 
 import java.io.*;
+import java.nio.charset.StandardCharsets;
 import java.util.Properties;
 import java.util.zip.GZIPInputStream;
@@ -12,5 +13,7 @@
 import java.util.TreeSet;
 
+import org.apache.commons.csv.*; // https://commons.apache.org/proper/commons-csv/download_csv.cgi
 import org.apache.log4j.Logger;
+
 
 /**
@@ -132,5 +135,7 @@
 
     private int wetFileCount = 0;
-
+
+    private static ClassLoader MY_CLASSLOADER = org.greenstone.atea.CCWETProcessor.class.getClassLoader();
+
     public CCWETProcessor(File inFolder, File outFolder) throws Exception {
     this.commoncrawlDir = inFolder;
@@ -138,5 +143,5 @@
 
     // load up the properties from the config file
-    try (InputStream infile = org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResourceAsStream("config.properties")) {
+    try (InputStream infile = MY_CLASSLOADER.getResourceAsStream("config.properties")) {
         configProperties = new Properties();
         configProperties.load(infile);
@@ -201,32 +206,35 @@
     System.err.println("Loading map of topsites with regex of allowable url patterns for each topsite.");
     topSitesMap = new HashMap<String, String>();
-    //File topSitesFile = new File(outFolder, "sites-too-big-to-exhaustively-crawl.txt");
-
+
+    // Read in our csv file of topsites and what to do when one hits a match with a seedURL
+    // and put these in our topSitesMap
+    // https://commons.apache.org/proper/commons-csv/apidocs/index.html
+    // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVParser.html
+    // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVFormat.html
+    CSVFormat customisedCSVFormat = CSVFormat.DEFAULT
+        .withCommentMarker('#')
+        .withSkipHeaderRecord()
+        .withIgnoreSurroundingSpaces();
+
+    File topSitesCSVData = new File(MY_CLASSLOADER.getResource("sites-too-big-to-exhaustively-crawl.txt").getFile());
+    // CSVParser is AutoCloseable and throws exceptions, so putting it in a try-with-resources
     try (
-         BufferedReader reader = new BufferedReader(new InputStreamReader(org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResourceAsStream("sites-too-big-to-exhaustively-crawl.txt"), "UTF-8"));
+         CSVParser parser = CSVParser.parse(topSitesCSVData, StandardCharsets.UTF_8, customisedCSVFormat);
          ) {
-
-        String str = null;
-        while((str = reader.readLine()) != null) {
-        str = str.trim();
-        if(str.equals("") || str.startsWith("#")) {
-            continue;
-        }
-
-        // comma separated list of values
-        int splitindex = str.indexOf(",");
-        if(splitindex == -1) {
-            topSitesMap.put(str, "");
-        } else {
-            String topsite = str.substring(0, splitindex).trim();
-            String allowed_url_pattern = str.substring(splitindex+1).trim();
-            topSitesMap.put(topsite, allowed_url_pattern);
-        }
-        }
-    } catch (IOException ioe) {
-        ioe.printStackTrace();
-        System.err.println("\n@@@@@@@@@ Error reading in from top sites file conf/sites-too-big-to-exhaustively-crawl.txt");
-    }
-
+        for (CSVRecord csvRecord : parser) {
+        String topsite = csvRecord.get(0);
+        String allowed_url_pattern = (csvRecord.size() >= 2) ? csvRecord.get(1) : "";
+        topSitesMap.put(topsite, allowed_url_pattern);
+
+        //System.err.println("@@@@ topsite: " + topsite + " - " + allowed_url_pattern);
+
+        }
+    } catch(Exception e) {
+        e.printStackTrace();
+        System.err.println("\n@@@@@@@@@ Error attempting to parse CSV format of text file " + topSitesCSVData);
+    }
+
+
+
     //System.err.println("Prematurely terminating for testing purposes.");
     //System.exit(-1);
@@ -669,5 +677,5 @@
 
     // if filterListFilename does not exist in the conf folder, just return
-    if(org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResource(filterListFilename) == null) {
+    if(MY_CLASSLOADER.getResource(filterListFilename) == null) {
        System.err.println(filterListFilename + " does not exist");
        return;
@@ -675,5 +683,5 @@
 
    try (
-        BufferedReader reader = new BufferedReader(new InputStreamReader(org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResourceAsStream(filterListFilename), "UTF-8"));
+        BufferedReader reader = new BufferedReader(new InputStreamReader(MY_CLASSLOADER.getResourceAsStream(filterListFilename), "UTF-8"));
         ) {
        String filter = null;
@@ -842,5 +850,5 @@
 
    File[] ccrawlFolders = commoncrawlDir.listFiles(new CCrawlWETFolderFilenameFilter());
-
+
    for(int i = 0; i < ccrawlFolders.length; i++) {
        File ccrawlFolder = ccrawlFolders[i];
@@ -848,5 +856,4 @@
        ccWETFilesProcessor.processAllWETFilesOfCrawl(ccrawlFolder);
    }
-
 
    // create the global files of all domains, seedURLs and regex-urlfilters across all wet files of all commoncrawls
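
The second half of the commit message, reusing the ClassLoader reference, amounts to the pattern sketched below. This is a simplified, self-contained sketch rather than the committed CCWETProcessor code: the class name and helper methods are illustrative, and only the classpath-lookup calls (getResource/getResourceAsStream on a single static ClassLoader field, as introduced in the diff above) mirror the changeset.

// Simplified sketch of the ClassLoader tidy-up (not the full CCWETProcessor class):
// one ClassLoader reference is looked up once and reused for every classpath resource,
// instead of repeating <SomeClass>.class.getClassLoader() at each call site.
import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;

public class ClassLoaderReuseSketch {
    private static final ClassLoader MY_CLASSLOADER =
        ClassLoaderReuseSketch.class.getClassLoader();

    // Loads a properties file from the classpath, as the constructor in the diff
    // does for config.properties.
    static Properties loadProperties(String resourceName) throws IOException {
        Properties props = new Properties();
        try (InputStream in = MY_CLASSLOADER.getResourceAsStream(resourceName)) {
            if (in == null) {
                throw new IOException(resourceName + " not found on classpath");
            }
            props.load(in);
        }
        return props;
    }

    // Mirrors the null check the diff uses before reading a filter list file.
    static boolean resourceExists(String resourceName) {
        return MY_CLASSLOADER.getResource(resourceName) != null;
    }

    public static void main(String[] args) throws IOException {
        if (resourceExists("config.properties")) {
            System.out.println(loadProperties("config.properties"));
        } else {
            System.out.println("config.properties not on classpath; nothing to load.");
        }
    }
}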