Changeset 33562

Timestamp: 11.10.2019 21:52:40
Author: ak19
Message:

1. The sites-too-big-to-exhaustively-crawl.txt file is now a CSV file of a semi-custom format, and the Java code now uses the Apache Commons CSV jar (v1.7 for Java 8) to parse its contents.
2. Tidied up the code to reuse a single reference to the ClassLoader.

Location: gs3-extensions/maori-lang-detection
Files: 3 added, 2 modified

  • gs3-extensions/maori-lang-detection/conf/sites-too-big-to-exhaustively-crawl.txt

    --- r33561
    +++ r33562

     # FORMAT OF THIS FILE'S CONTENTS:
     #    <topsite-base-url>,<value>
    -# where <value> can be empty or one of SUBDOMAIN-COPY, SINGLEPAGE, <url-form-without-protocol>
    +# where <value> can or is one of
    +#    empty, SUBDOMAIN-COPY, FOLLOW-LINKS-WITHIN-TOPSITE, SINGLEPAGE, <url-form-without-protocol>
     #
    -#   - if value is empty: if seedurl contains topsite-base-url, the seedurl will go into the file
    -#     unprocessed-topsite-matches.txt and the site/page won't be crawled.
    +#   - if value is left empty: if seedurl contains topsite-base-url, the seedurl will go into the
    +#     file unprocessed-topsite-matches.txt and the site/page won't be crawled.
     #     The user will be notified to inspect the file unprocessed-topsite-matches.txt.
     #   - SINGLEPAGE: if seedurl matches topsite-base-url, then only download the page at that seedurl.
    …
     #     crawl to just mi.wikipedia.org.
     #     Remember to leave out any protocol <from url-form-without-protocol>.
    -
    -# column 3: whether nutch should do fetch all or not
    -# column 4: number of crawl iterations
    +#
    +# TODO If useful:
    +#   column 3: whether nutch should do fetch all or not
    +#   column 4: number of crawl iterations
     
     # docs.google.com is a special case: not all pages are public and any interlinking is likely to
    -# be intentional. But SUBDOMAIN-COPY does not work: as seedURL's domain becomes docs.google.com
    -# which, when combined with SUBDOMAIN-COPY, the Java code treats as a special case so that
    -# any seedURL on docs.google.com ends up pushed out into the "unprocessed....txt" text file.
    -#docs.google.com,SUBDOMAIN-COPY
    +# be intentional. Grab all linked pages, for link depth set with nutch's crawl, as long as the
    +# links are within the given topsite-base-url
     docs.google.com,FOLLOW-LINKS-WITHIN-TOPSITE
     
    +# Just crawl a single page for these:
     drive.google.com,SINGLEPAGE
     forms.office.com,SINGLEPAGE
    …
     # Special case of yale.edu: its Rapa-Nui pages are on blacklist, but we want this page + its photos
     # The page's containing folder is whitelisted in case the photos are there.
    -korora.econ.yale.edu,,SINGLEPAGE
    +korora.econ.yale.edu,SINGLEPAGE
     
     000webhost.com
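
    As a sanity check of the format above, here is a minimal standalone sketch (not part of this changeset; the class name is hypothetical, and it assumes the conf file is on the classpath as CCWETProcessor's other resources are). It uses the same Apache Commons CSV 1.7 settings introduced below — '#' as the comment marker and surrounding spaces ignored — and prints how each row would populate topSitesMap:

        import java.io.InputStreamReader;
        import java.io.Reader;
        import java.nio.charset.StandardCharsets;
        import org.apache.commons.csv.CSVFormat;
        import org.apache.commons.csv.CSVParser;
        import org.apache.commons.csv.CSVRecord;

        public class TopSitesFormatCheck {
            public static void main(String[] args) throws Exception {
                // Same semi-custom settings as the changeset: '#' starts a comment line,
                // and spaces around the comma-separated values are ignored.
                CSVFormat format = CSVFormat.DEFAULT
                    .withCommentMarker('#')
                    .withIgnoreSurroundingSpaces();

                try (Reader in = new InputStreamReader(
                         TopSitesFormatCheck.class.getClassLoader()
                             .getResourceAsStream("sites-too-big-to-exhaustively-crawl.txt"),
                         StandardCharsets.UTF_8);
                     CSVParser parser = new CSVParser(in, format)) {
                    for (CSVRecord record : parser) {
                        String topsite = record.get(0);
                        // The second column is optional: an empty value means the seedurl is
                        // diverted to unprocessed-topsite-matches.txt rather than crawled.
                        String value = (record.size() >= 2) ? record.get(1) : "";
                        System.out.println(topsite + " -> " + (value.isEmpty() ? "<empty>" : value));
                    }
                }
            }
        }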
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java

    --- r33561
    +++ r33562

     
     import java.io.*;
    +import java.nio.charset.StandardCharsets;
     import java.util.Properties;
     import java.util.zip.GZIPInputStream;
    …
     import java.util.TreeSet;
     
    +import org.apache.commons.csv.*; // https://commons.apache.org/proper/commons-csv/download_csv.cgi
     import org.apache.log4j.Logger;
    +
     
     /**
    …
     
         private int wetFileCount = 0;
    -
    +
    +    private static ClassLoader MY_CLASSLOADER = org.greenstone.atea.CCWETProcessor.class.getClassLoader();
    +
         public CCWETProcessor(File inFolder, File outFolder) throws Exception {
             this.commoncrawlDir = inFolder;
    …
     
             // load up the properties from the config file
    -        try (InputStream infile = org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResourceAsStream("config.properties")) {
    +        try (InputStream infile = MY_CLASSLOADER.getResourceAsStream("config.properties")) {
                 configProperties = new Properties();
                 configProperties.load(infile);
    …
             System.err.println("Loading map of topsites with regex of allowable url patterns for each topsite.");
             topSitesMap = new HashMap<String, String>();
    -        //File topSitesFile = new File(outFolder, "sites-too-big-to-exhaustively-crawl.txt");
    -
    +
    +        // Read in our csv file of topsites and what to do when one hits a match with a seedURL
    +        // and put these in our topSitesMap
    +        // https://commons.apache.org/proper/commons-csv/apidocs/index.html
    +        // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVParser.html
    +        // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVFormat.html
    +        CSVFormat customisedCSVFormat = CSVFormat.DEFAULT
    +            .withCommentMarker('#')
    +            .withSkipHeaderRecord()
    +            .withIgnoreSurroundingSpaces();
    +
    +        File topSitesCSVData = new File(MY_CLASSLOADER.getResource("sites-too-big-to-exhaustively-crawl.txt").getFile());
    +        // CSVParser is AutoCloseable and throws exceptions, so putting it in a try-with-resources
             try (
    -             BufferedReader reader = new BufferedReader(new InputStreamReader(org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResourceAsStream("sites-too-big-to-exhaustively-crawl.txt"), "UTF-8"));
    +             CSVParser parser = CSVParser.parse(topSitesCSVData, StandardCharsets.UTF_8, customisedCSVFormat);
                 ) {
    -
    -            String str = null;
    -            while((str = reader.readLine()) != null) {
    -                str = str.trim();
    -                if(str.equals("") || str.startsWith("#")) {
    -                    continue;
    -                }
    -
    -                // comma separated list of values
    -                int splitindex = str.indexOf(",");
    -                if(splitindex == -1) {
    -                    topSitesMap.put(str, "");
    -                } else {
    -                    String topsite = str.substring(0, splitindex).trim();
    -                    String allowed_url_pattern = str.substring(splitindex+1).trim();
    -                    topSitesMap.put(topsite, allowed_url_pattern);
    -                }
    -            }
    -        } catch (IOException ioe) {
    -            ioe.printStackTrace();
    -            System.err.println("\n@@@@@@@@@ Error reading in from top sites file conf/sites-too-big-to-exhaustively-crawl.txt");
    -        }
    -
    +            for (CSVRecord csvRecord : parser) {
    +                String topsite = csvRecord.get(0);
    +                String allowed_url_pattern = (csvRecord.size() >= 2) ? csvRecord.get(1) : "";
    +                topSitesMap.put(topsite, allowed_url_pattern);
    +
    +                //System.err.println("@@@@ topsite: " + topsite + " - " + allowed_url_pattern);
    +            }
    +        } catch(Exception e) {
    +            e.printStackTrace();
    +            System.err.println("\n@@@@@@@@@ Error attempting to parse CSV format of text file " + topSitesCSVData);
    +        }
    +
             //System.err.println("Prematurely terminating for testing purposes.");
             //System.exit(-1);
    …
     
             // if filterListFilename does not exist in the conf folder, just return
    -        if(org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResource(filterListFilename) == null) {
    +        if(MY_CLASSLOADER.getResource(filterListFilename) == null) {
                 System.err.println(filterListFilename + " does not exist");
                 return;
    …
     
             try (
    -             BufferedReader reader = new BufferedReader(new InputStreamReader(org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResourceAsStream(filterListFilename), "UTF-8"));
    +             BufferedReader reader = new BufferedReader(new InputStreamReader(MY_CLASSLOADER.getResourceAsStream(filterListFilename), "UTF-8"));
                 ) {
                 String filter = null;
    …
     
             File[] ccrawlFolders = commoncrawlDir.listFiles(new CCrawlWETFolderFilenameFilter());
    -
    +
             for(int i = 0; i < ccrawlFolders.length; i++) {
                 File ccrawlFolder = ccrawlFolders[i];
    …
                 ccWETFilesProcessor.processAllWETFilesOfCrawl(ccrawlFolder);
             }
    -
     
             // create the global files of all domains, seedURLs and regex-urlfilters across all wet files of all commoncrawls
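
    Point 2 of the commit message amounts to the pattern below: look the ClassLoader up once in a static field and reuse it for every getResource/getResourceAsStream call, instead of repeating org.greenstone.atea.CCWETProcessor.class.getClassLoader(). A minimal illustration (the class and resource names here are hypothetical, not from the changeset):

        import java.io.BufferedReader;
        import java.io.InputStreamReader;
        import java.nio.charset.StandardCharsets;

        public class ClassLoaderReuseSketch {
            // Looked up once and reused everywhere, as CCWETProcessor now does with MY_CLASSLOADER.
            private static final ClassLoader MY_CLASSLOADER =
                ClassLoaderReuseSketch.class.getClassLoader();

            public static void main(String[] args) throws Exception {
                String filterListFilename = "example-filter-list.txt"; // hypothetical resource name

                // getResource() returns null when the resource is not on the classpath
                if (MY_CLASSLOADER.getResource(filterListFilename) == null) {
                    System.err.println(filterListFilename + " does not exist");
                    return;
                }

                try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                         MY_CLASSLOADER.getResourceAsStream(filterListFilename),
                         StandardCharsets.UTF_8))) {
                    String filter = null;
                    while ((filter = reader.readLine()) != null) {
                        System.out.println(filter);
                    }
                }
            }
        }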