Changeset 33562

Timestamp: 11.10.2019 21:52:40
Author: ak19
Message:

1. The sites-too-big-to-exhaustively-crawl.txt file is now a CSV file of a semi-custom format, and the Java code now uses the Apache Commons CSV jar (v1.7 for Java 8) to parse its contents.
2. Tidied up the code to reuse a single reference to the ClassLoader.

Location: gs3-extensions/maori-lang-detection
Files: 3 added, 2 modified

  • gs3-extensions/maori-lang-detection/conf/sites-too-big-to-exhaustively-crawl.txt

    --- r33561
    +++ r33562

     # FORMAT OF THIS FILE'S CONTENTS:
     #    <topsite-base-url>,<value>
    -# where <value> can be empty or one of SUBDOMAIN-COPY, SINGLEPAGE, <url-form-without-protocol>
    +# where <value> can or is one of
    +#    empty, SUBDOMAIN-COPY, FOLLOW-LINKS-WITHIN-TOPSITE, SINGLEPAGE, <url-form-without-protocol>
     #
    -#   - if value is empty: if seedurl contains topsite-base-url, the seedurl will go into the file
    -#     unprocessed-topsite-matches.txt and the site/page won't be crawled.
    +#   - if value is left empty: if seedurl contains topsite-base-url, the seedurl will go into the
    +#     file unprocessed-topsite-matches.txt and the site/page won't be crawled.
     #     The user will be notified to inspect the file unprocessed-topsite-matches.txt.
     #   - SINGLEPAGE: if seedurl matches topsite-base-url, then only download the page at that seedurl.
    …
     #     crawl to just mi.wikipedia.org.
     #     Remember to leave out any protocol <from url-form-without-protocol>.
    -
    -# column 3: whether nutch should do fetch all or not
    -# column 4: number of crawl iterations
    +#
    +# TODO If useful:
    +#   column 3: whether nutch should do fetch all or not
    +#   column 4: number of crawl iterations
     
     # docs.google.com is a special case: not all pages are public and any interlinking is likely to
    -# be intentional. But SUBDOMAIN-COPY does not work: as seedURL's domain becomes docs.google.com
    -# which, when combined with SUBDOMAIN-COPY, the Java code treats as a special case so that
    -# any seedURL on docs.google.com ends up pushed out into the "unprocessed....txt" text file.
    -#docs.google.com,SUBDOMAIN-COPY
    +# be intentional. Grab all linked pages, for link depth set with nutch's crawl, as long as the
    +# links are within the given topsite-base-url
     docs.google.com,FOLLOW-LINKS-WITHIN-TOPSITE
     
    +# Just crawl a single page for these:
     drive.google.com,SINGLEPAGE
     forms.office.com,SINGLEPAGE
    …
     # Special case of yale.edu: its Rapa-Nui pages are on blacklist, but we want this page + its photos
     # The page's containing folder is whitelisted in case the photos are there.
    -korora.econ.yale.edu,,SINGLEPAGE
    +korora.econ.yale.edu,SINGLEPAGE
     
     000webhost.com
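
    As a sanity check of the format above, here is a minimal standalone sketch (not part of this changeset; the class name is hypothetical, and it assumes the conf file is on the classpath as CCWETProcessor's other resources are). It uses the same Apache Commons CSV 1.7 settings introduced below — '#' as the comment marker and surrounding spaces ignored — and prints how each row would populate topSitesMap:

        import java.io.InputStreamReader;
        import java.io.Reader;
        import java.nio.charset.StandardCharsets;
        import org.apache.commons.csv.CSVFormat;
        import org.apache.commons.csv.CSVParser;
        import org.apache.commons.csv.CSVRecord;

        public class TopSitesFormatCheck {
            public static void main(String[] args) throws Exception {
                // Same semi-custom settings as the changeset: '#' starts a comment line,
                // and spaces around the comma-separated values are ignored.
                CSVFormat format = CSVFormat.DEFAULT
                    .withCommentMarker('#')
                    .withIgnoreSurroundingSpaces();

                try (Reader in = new InputStreamReader(
                         TopSitesFormatCheck.class.getClassLoader()
                             .getResourceAsStream("sites-too-big-to-exhaustively-crawl.txt"),
                         StandardCharsets.UTF_8);
                     CSVParser parser = new CSVParser(in, format)) {
                    for (CSVRecord record : parser) {
                        String topsite = record.get(0);
                        // The second column is optional: an empty value means the seedurl is
                        // diverted to unprocessed-topsite-matches.txt rather than crawled.
                        String value = (record.size() >= 2) ? record.get(1) : "";
                        System.out.println(topsite + " -> " + (value.isEmpty() ? "<empty>" : value));
                    }
                }
            }
        }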
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java

    --- r33561
    +++ r33562

     
     import java.io.*;
    +import java.nio.charset.StandardCharsets;
     import java.util.Properties;
     import java.util.zip.GZIPInputStream;
    …
     import java.util.TreeSet;
     
    +import org.apache.commons.csv.*; // https://commons.apache.org/proper/commons-csv/download_csv.cgi
     import org.apache.log4j.Logger;
    +
     
     /**
    …
     
         private int wetFileCount = 0;
    -
    +
    +    private static ClassLoader MY_CLASSLOADER = org.greenstone.atea.CCWETProcessor.class.getClassLoader();
    +
         public CCWETProcessor(File inFolder, File outFolder) throws Exception {
             this.commoncrawlDir = inFolder;
    …
     
             // load up the properties from the config file
    -        try (InputStream infile = org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResourceAsStream("config.properties")) {
    +        try (InputStream infile = MY_CLASSLOADER.getResourceAsStream("config.properties")) {
                 configProperties = new Properties();
                 configProperties.load(infile);
    …
             System.err.println("Loading map of topsites with regex of allowable url patterns for each topsite.");
             topSitesMap = new HashMap<String, String>();
    -        //File topSitesFile = new File(outFolder, "sites-too-big-to-exhaustively-crawl.txt");
    -
    +
    +        // Read in our csv file of topsites and what to do when one hits a match with a seedURL
    +        // and put these in our topSitesMap
    +        // https://commons.apache.org/proper/commons-csv/apidocs/index.html
    +        // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVParser.html
    +        // https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVFormat.html
    +        CSVFormat customisedCSVFormat = CSVFormat.DEFAULT
    +            .withCommentMarker('#')
    +            .withSkipHeaderRecord()
    +            .withIgnoreSurroundingSpaces();
    +
    +        File topSitesCSVData = new File(MY_CLASSLOADER.getResource("sites-too-big-to-exhaustively-crawl.txt").getFile());
    +        // CSVParser is AutoCloseable and throws exceptions, so putting it in a try-with-resources
             try (
    -             BufferedReader reader = new BufferedReader(new InputStreamReader(org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResourceAsStream("sites-too-big-to-exhaustively-crawl.txt"), "UTF-8"));
    +             CSVParser parser = CSVParser.parse(topSitesCSVData, StandardCharsets.UTF_8, customisedCSVFormat);
                 ) {
    -
    -            String str = null;
    -            while((str = reader.readLine()) != null) {
    -                str = str.trim();
    -                if(str.equals("") || str.startsWith("#")) {
    -                    continue;
    -                }
    -
    -                // comma separated list of values
    -                int splitindex = str.indexOf(",");
    -                if(splitindex == -1) {
    -                    topSitesMap.put(str, "");
    -                } else {
    -                    String topsite = str.substring(0, splitindex).trim();
    -                    String allowed_url_pattern = str.substring(splitindex+1).trim();
    -                    topSitesMap.put(topsite, allowed_url_pattern);
    -                }
    -            }
    -        } catch (IOException ioe) {
    -            ioe.printStackTrace();
    -            System.err.println("\n@@@@@@@@@ Error reading in from top sites file conf/sites-too-big-to-exhaustively-crawl.txt");
    -        }
    -
    +            for (CSVRecord csvRecord : parser) {
    +                String topsite = csvRecord.get(0);
    +                String allowed_url_pattern = (csvRecord.size() >= 2) ? csvRecord.get(1) : "";
    +                topSitesMap.put(topsite, allowed_url_pattern);
    +
    +                //System.err.println("@@@@ topsite: " + topsite + " - " + allowed_url_pattern);
    +            }
    +        } catch(Exception e) {
    +            e.printStackTrace();
    +            System.err.println("\n@@@@@@@@@ Error attempting to parse CSV format of text file " + topSitesCSVData);
    +        }
    +
             //System.err.println("Prematurely terminating for testing purposes.");
             //System.exit(-1);
    …
     
             // if filterListFilename does not exist in the conf folder, just return
    -        if(org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResource(filterListFilename) == null) {
    +        if(MY_CLASSLOADER.getResource(filterListFilename) == null) {
                 System.err.println(filterListFilename + " does not exist");
                 return;
    …
     
             try (
    -             BufferedReader reader = new BufferedReader(new InputStreamReader(org.greenstone.atea.CCWETProcessor.class.getClassLoader().getResourceAsStream(filterListFilename), "UTF-8"));
    +             BufferedReader reader = new BufferedReader(new InputStreamReader(MY_CLASSLOADER.getResourceAsStream(filterListFilename), "UTF-8"));
                 ) {
                 String filter = null;
    …
     
             File[] ccrawlFolders = commoncrawlDir.listFiles(new CCrawlWETFolderFilenameFilter());
    -
    +
             for(int i = 0; i < ccrawlFolders.length; i++) {
                 File ccrawlFolder = ccrawlFolders[i];
    …
                 ccWETFilesProcessor.processAllWETFilesOfCrawl(ccrawlFolder);
             }
    -
     
             // create the global files of all domains, seedURLs and regex-urlfilters across all wet files of all commoncrawls
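
    Point 2 of the commit message amounts to the pattern below: look the ClassLoader up once in a static field and reuse it for every getResource/getResourceAsStream call, instead of repeating org.greenstone.atea.CCWETProcessor.class.getClassLoader(). A minimal illustration (the class and resource names here are hypothetical, not from the changeset):

        import java.io.BufferedReader;
        import java.io.InputStreamReader;
        import java.nio.charset.StandardCharsets;

        public class ClassLoaderReuseSketch {
            // Looked up once and reused everywhere, as CCWETProcessor now does with MY_CLASSLOADER.
            private static final ClassLoader MY_CLASSLOADER =
                ClassLoaderReuseSketch.class.getClassLoader();

            public static void main(String[] args) throws Exception {
                String filterListFilename = "example-filter-list.txt"; // hypothetical resource name

                // getResource() returns null when the resource is not on the classpath
                if (MY_CLASSLOADER.getResource(filterListFilename) == null) {
                    System.err.println(filterListFilename + " does not exist");
                    return;
                }

                try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                         MY_CLASSLOADER.getResourceAsStream(filterListFilename),
                         StandardCharsets.UTF_8))) {
                    String filter = null;
                    while ((filter = reader.readLine()) != null) {
                        System.out.println(filter);
                    }
                }
            }
        }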