Timestamp: 2019-10-16T20:00:09+13:00
Author: ak19
Message:

1. batchcrawl.sh now does what it should have done from the start: it moves the log.out and UNFINISHED files into the output folder instead of leaving them in the input folder, since the input to_crawl folder can and does get replaced every time I regenerate it after black/white/greylisting more URLs (see the sketch below).
2. Blacklisted more adult sites; greylisted more product sites and .ru, .pl and .tk domains, with exceptions whitelisted in the whitelist file.
3. CCWETProcessor now looks out for additional adult sites based on the URL, adds them to its in-memory blacklist (not the file), and logs the domain so it can be checked and manually added to the blacklist file.
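The batchcrawl.sh change itself is not part of this changeset (only CCWETProcessor.java is edited), but the behaviour described in item 1 amounts to something like the following minimal shell sketch. The variable names (siteDir, outputDir) are assumptions for illustration, not identifiers from the actual script.

    # Hypothetical sketch only -- variable names are assumed, not taken from batchcrawl.sh.
    # After a site's crawl finishes, move the crawl log and the UNFINISHED marker
    # out of the input (to_crawl) folder, which gets regenerated after re-listing URLs,
    # and into that site's output folder, so they survive the next regeneration.
    if [ -f "$siteDir/log.out" ]; then
        mv "$siteDir/log.out" "$outputDir/"
    fi
    if [ -f "$siteDir/UNFINISHED" ]; then
        mv "$siteDir/UNFINISHED" "$outputDir/"
    fi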
File: 1 edited

Legend:

  ' ' Unmodified
  '+' Added
  '-' Removed
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java

--- gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java (r33568)
+++ gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java (r33569)
@@ -150,11 +150,9 @@
 
     } catch(Exception e) {
-        System.err.println("Exception attempting to read properties from config.properties.");
-        logger.error("Exception attempting to read properties from config.properties.");
-        e.printStackTrace();
+        error("Exception attempting to read properties from config.properties.", e);
     }
 
     if(configProperties.size() == 0) {
-        System.err.println("*** Warning: no values read into config properties. Using defaults.");
+        warn("*** Warning: no values read into config properties. Using defaults.");
     }
 
@@ -192,18 +190,18 @@
 
     // prepare our blacklist, greylist (for inspection) and whitelist
-    System.err.println("Loading blacklist.");
+    info("Loading blacklist.");
     blackList = new HashMap<String, Integer>();
     initURLFilterList(blackList, "url-blacklist-filter.txt");
 
-    System.err.println("Loading greylist.");
+    info("Loading greylist.");
     greyList = new HashMap<String, Integer>();
     initURLFilterList(greyList, "url-greylist-filter.txt");
 
-    System.err.println("Loading whitelist.");
+    info("Loading whitelist.");
     whiteList = new HashMap<String, Integer>();
     initURLFilterList(whiteList, "url-whitelist-filter.txt");
 
     // Create the map of topSites
-    System.err.println("Loading map of topsites with regex of allowable url patterns for each topsite.");
+    info("Loading map of topsites with regex of allowable url patterns for each topsite.");
     topSitesMap = new HashMap<String, String>();
 
@@ -228,15 +226,12 @@
         topSitesMap.put(topsite, allowed_url_pattern);
 
-        //System.err.println("@@@@ topsite: " + topsite + " - " + allowed_url_pattern);
+        //debug("@@@@ topsite: " + topsite + " - " + allowed_url_pattern);
 
         }
     } catch(Exception e) {
-        e.printStackTrace();
-        System.err.println("\n@@@@@@@@@ Error attempting to parse CSV format of text file " + topSitesCSVData);
-    }
-
-
+        error("@@@@@@@@@ Error attempting to parse CSV format of text file " + topSitesCSVData, e);
+    }
 
-    //System.err.println("Prematurely terminating for testing purposes.");
+    //debug("Prematurely terminating for testing purposes.");
     //System.exit(-1);
     }
@@ -309,5 +304,5 @@
             urlsSet = new TreeSet<String>();
             urlsSet.add(url);
-            domainsToURLsMap.put(domainWithProtocol, urlsSet);
+            domainsToURLsMap.put(domainWithProtocol, urlsSet);
         } else {
             urlsSet = domainsToURLsMap.get(domainWithProtocol);
@@ -317,6 +312,5 @@
         }
     } catch (IOException ioe) {
-        ioe.printStackTrace();
-        System.err.println("\n@@@@@@@@@ Error reading in urls from file " + this.keepURLsFile);
+        error("@@@@@@@@@ Error reading in urls from file " + this.keepURLsFile, ioe);
     }
 
@@ -356,7 +350,7 @@
     String value = topSitesMap.get("wikipedia.org");
     if(value == null) {
-        System.err.println("### wikipedia.org had null value");
+        debug("### wikipedia.org had null value");
     } else {
-        System.err.println("### wikipedia.org had value: " + value);
+        debug("### wikipedia.org had value: " + value);
     } // DEBUG
         */
@@ -370,6 +364,6 @@
 
         /*if(domain.contains("docs.google.com")) {
-            System.err.println("domain with protocol: " + domainWithProtocol);
-            System.err.println("domain: " + domain);
+            debug("domain with protocol: " + domainWithProtocol);
+            debug("domain: " + domain);
             }*/
 
@@ -495,6 +489,5 @@
 
         } catch (IOException ioe) {
-            ioe.printStackTrace();
-            System.err.println("\n@@@@@@@@@ Error writing to one of:" + siteSeedsFile + " or " + siteRegexFile);
+            error("@@@@@@@@@ Error writing to one of:" + siteSeedsFile + " or " + siteRegexFile, ioe);
         }
 
@@ -502,19 +495,17 @@
 
     } catch (IOException ioe) {
-        ioe.printStackTrace();
-        System.err.println("\n@@@@@@@@@ Error writing to one of: ");
-        System.err.println("\t" + seedURLsFile);
-        System.err.println("\t" + urlFilterFile);
-        System.err.println("\t" + domainURLsFile);
-        System.err.println("\t" + topSiteMatchesFile);
+        error("\n@@@@@@@@@ Error writing to one of:\n\t" + seedURLsFile
+              + "\n\t" + urlFilterFile
+              + "\n\t" + domainURLsFile
+              + "\n\t" + topSiteMatchesFile, ioe);
     }
 
     /*
     // BEGIN DEBUG
-    System.err.println("@@@@ TopSitesMap contains: ");
+    debug("@@@@ TopSitesMap contains: ");
     for(Map.Entry<String, String> entry : topSitesMap.entrySet()) {
         String topSite = entry.getKey();
         String urlPattern = entry.getValue();
-        System.err.println(topSite + " - " + urlPattern);
+        debug(topSite + " - " + urlPattern);
     } // END DEBUG
     */
@@ -587,5 +578,5 @@
         if(allowed_url_pattern != null) { // if topSitesMap.containsKey(domain);
         // there's an entry for the URL in the topSitesMap
-        System.err.println("##### A top site matches URL domain " + domain);
+        debug("##### A top site matches URL domain " + domain);
 
         // if we're dealing with SUBDOMAIN-COPY, then the fullSeedDomain, with or without
@@ -648,5 +639,14 @@
      */
     public boolean isBlacklisted(String url) {
-    return isListedInFilterList(blackList, url);
+    boolean isBlackListed = isListedInFilterList(blackList, url);
+
+    // if any portion of the URL contains the word "livejasmin", or even "jasmin" actually,
+    // then it's an adult site, so blacklist the entire domain if it wasn't already blacklisted
+    String domainWithoutProtocol = getDomainForURL(url, false); // remove protocol
+    if(!isBlackListed && url.contains("jasmin")) {
+        warn("### Blacklisting additional domain (likely an adult site): " + domainWithoutProtocol);
+        blackList.put(domainWithoutProtocol, LIST_ENTRY_CONTAINS);
+    }
+    return isBlackListed;
     }
 
@@ -680,5 +680,5 @@
     // if filterListFilename does not exist in the conf folder, just return
     if(MY_CLASSLOADER.getResource(filterListFilename) == null) {
-        System.err.println(filterListFilename + " does not exist");
+        warn("Filter list filename: " + filterListFilename + " does not exist");
         return;
     }
@@ -702,19 +702,19 @@
             filter = filter.substring(1);
             list.put(filter, LIST_ENTRY_STARTSWITH);
-            System.err.println("Match filter startswith: " + filter);
+            //debug("Match filter startswith: " + filter);
         }
         else if(filter.endsWith("$")) {
             filter = filter.substring(0, filter.length()-1);
             list.put(filter, LIST_ENTRY_ENDSWITH);
+            //debug("@@@ Match filter endswith: " + filter);
         }
         else {
            list.put(filter, LIST_ENTRY_CONTAINS);
        }
-        //System.err.println("Got filter: " + filter);
+        //debug("Got filter: " + filter);
        }
 
    } catch (IOException ioe) {
-        ioe.printStackTrace();
-        System.err.println("\n@@@@@@@@@ Error reading into map from file " + filterListFilename);
+        error("@@@@@@@@@ Error reading into map from file " + filterListFilename, ioe);
    }
 
@@ -739,5 +739,5 @@
     for(int i = 0; i < WETFiles.length; i++) {
         File WETFile = WETFiles[i];
-        logger.debug("Processing WETfile: " + WETFile);
+        debug("Processing WETfile: " + WETFile);
 
         // Any .gz files listed means they haven't been unzipped yet. So unzip.
@@ -754,7 +754,6 @@
         // Check the unzipped WETFile exists
 
-        if(!WETFile.exists() || !WETFile.isFile()) {
-        System.err.println("Error: " + WETFile + " does not exist (failure to unzip?)");
-        logger.error("Error: " + WETFile + " does not exist (failure to unzip?)");
+        if(!WETFile.exists() || !WETFile.isFile()) {
+        error("Error: " + WETFile + " does not exist (failure to unzip?)");
         return;
         }
@@ -777,7 +776,29 @@
 
     // --------------- STATIC METHODS AND INNER CLASSED USED BY MAIN -------------- //
+    public static void info(String msg) {
+    System.err.println(msg);
+    logger.info(msg);
+    }
+    public static void debug(String msg) {
+    System.err.println(msg);
+    logger.debug(msg);
+    }
+    public static void warn(String msg) {
+    System.err.println(msg);
+    logger.warn(msg);
+    }
+    public static void error(String msg) {
+    System.err.println(msg);
+    logger.error(msg);
+    }
+    public static void error(String msg, Exception e) {
+    logger.error(msg, e);
+    System.err.println(msg);
+    e.printStackTrace();
+    }
+
     public static void printUsage() {
-    System.err.println("Run this program as:");
-    System.err.println("\tWetProcessor <folder containing wet(.gz) files> <output folder path>");
+    info("Run this program as:");
+    info("\tWetProcessor <folder containing wet(.gz) files> <output folder path>");
     }
 
@@ -824,5 +845,5 @@
         }
         else {
-        System.err.println("File " + f + " is not a directory");
+        info("File " + f + " is not a directory");
         }
         return false;
@@ -838,5 +859,5 @@
     File commoncrawlDir = new File(args[0]);
     if(!commoncrawlDir.exists() || !commoncrawlDir.isDirectory()) {
-        System.out.println("Error: " + args[0] + " does not exist or is not a directory");
+        error("Error: " + args[0] + " does not exist or is not a directory");
         return;
     }
@@ -844,5 +865,5 @@
     File outFolder = new File(args[1]);
     if(!outFolder.exists() || !outFolder.isDirectory()) {
-        System.out.println("Error: " + args[1] + " does not exist or is not a directory.");
+        error("Error: " + args[1] + " does not exist or is not a directory.");
         return;
     }
@@ -855,5 +876,5 @@
     for(int i = 0; i < ccrawlFolders.length; i++) {
         File ccrawlFolder = ccrawlFolders[i];
-        System.err.println("About to process commoncrawl WET files folder: " + ccrawlFolder);
+        info("About to process commoncrawl WET files folder: " + ccrawlFolder);
         ccWETFilesProcessor.processAllWETFilesOfCrawl(ccrawlFolder);
     }
@@ -869,13 +890,12 @@
     ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile, domainURLsFile, topSitesMatchedFile);
 
-    System.out.println("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n");
-
-    System.out.println("\n*** Check " + topSitesMatchedFile + " for sites not prepared for crawling because they matched top sites for which no regex of allowed url patterns were specified in sites-too-big-to-exhaustively-crawl.txt.\n");
+    info("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n");
+
+    info("\n*** Check " + topSitesMatchedFile + " for sites not prepared for crawling because they matched top sites for which no regex of allowed url patterns were specified in sites-too-big-to-exhaustively-crawl.txt.\n");
 
 
     } catch(Exception e) {
     // can get an exception when instantiating CCWETProcessor instance
-    e.printStackTrace();
-    System.err.println(e.getMessage());
+    error(e.getMessage(), e);
     }
 