Timestamp: 2019-10-16T20:00:09+13:00
Author: ak19
Message:

1. batchcrawl.sh now does what it should have done from the start: it moves the log.out and UNFINISHED files into the output folder instead of leaving them in the input folder, since the input to_crawl folder can and does get replaced every time I regenerate it after black/white/greylisting more URLs (see the sketch below).
2. Blacklisted more adult sites; greylisted more product sites and .ru, .pl and .tk domains, with exceptions whitelisted in the whitelist file.
3. CCWETProcessor now looks out for additional adult sites based on the URL, adds them to its in-memory blacklist (not the file), and logs the domain so it can be checked and manually added to the blacklist file.
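The batchcrawl.sh change itself is not part of this changeset (only CCWETProcessor.java is edited), but the behaviour described in item 1 amounts to something like the following minimal shell sketch. The variable names (siteDir, outputDir) are assumptions for illustration, not identifiers from the actual script.

    # Hypothetical sketch only -- variable names are assumed, not taken from batchcrawl.sh.
    # After a site's crawl finishes, move the crawl log and the UNFINISHED marker
    # out of the input (to_crawl) folder, which gets regenerated after re-listing URLs,
    # and into that site's output folder, so they survive the next regeneration.
    if [ -f "$siteDir/log.out" ]; then
        mv "$siteDir/log.out" "$outputDir/"
    fi
    if [ -f "$siteDir/UNFINISHED" ]; then
        mv "$siteDir/UNFINISHED" "$outputDir/"
    fi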
File: 1 edited

Legend:

  ' ' Unmodified
  '+' Added
  '-' Removed
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java

--- gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java (r33568)
+++ gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java (r33569)
@@ -150,11 +150,9 @@
 
     } catch(Exception e) {
-        System.err.println("Exception attempting to read properties from config.properties.");
-        logger.error("Exception attempting to read properties from config.properties.");
-        e.printStackTrace();
+        error("Exception attempting to read properties from config.properties.", e);
     }
 
     if(configProperties.size() == 0) {
-        System.err.println("*** Warning: no values read into config properties. Using defaults.");
+        warn("*** Warning: no values read into config properties. Using defaults.");
     }
 
@@ -192,18 +190,18 @@
 
     // prepare our blacklist, greylist (for inspection) and whitelist
-    System.err.println("Loading blacklist.");
+    info("Loading blacklist.");
     blackList = new HashMap<String, Integer>();
     initURLFilterList(blackList, "url-blacklist-filter.txt");
 
-    System.err.println("Loading greylist.");
+    info("Loading greylist.");
     greyList = new HashMap<String, Integer>();
     initURLFilterList(greyList, "url-greylist-filter.txt");
 
-    System.err.println("Loading whitelist.");
+    info("Loading whitelist.");
     whiteList = new HashMap<String, Integer>();
     initURLFilterList(whiteList, "url-whitelist-filter.txt");
 
     // Create the map of topSites
-    System.err.println("Loading map of topsites with regex of allowable url patterns for each topsite.");
+    info("Loading map of topsites with regex of allowable url patterns for each topsite.");
     topSitesMap = new HashMap<String, String>();
 
@@ -228,15 +226,12 @@
         topSitesMap.put(topsite, allowed_url_pattern);
 
-        //System.err.println("@@@@ topsite: " + topsite + " - " + allowed_url_pattern);
+        //debug("@@@@ topsite: " + topsite + " - " + allowed_url_pattern);
 
         }
     } catch(Exception e) {
-        e.printStackTrace();
-        System.err.println("\n@@@@@@@@@ Error attempting to parse CSV format of text file " + topSitesCSVData);
-    }
-
-
+        error("@@@@@@@@@ Error attempting to parse CSV format of text file " + topSitesCSVData, e);
+    }
 
-    //System.err.println("Prematurely terminating for testing purposes.");
+    //debug("Prematurely terminating for testing purposes.");
     //System.exit(-1);
     }
@@ -309,5 +304,5 @@
             urlsSet = new TreeSet<String>();
             urlsSet.add(url);
-            domainsToURLsMap.put(domainWithProtocol, urlsSet);
+            domainsToURLsMap.put(domainWithProtocol, urlsSet);
         } else {
             urlsSet = domainsToURLsMap.get(domainWithProtocol);
@@ -317,6 +312,5 @@
         }
     } catch (IOException ioe) {
-        ioe.printStackTrace();
-        System.err.println("\n@@@@@@@@@ Error reading in urls from file " + this.keepURLsFile);
+        error("@@@@@@@@@ Error reading in urls from file " + this.keepURLsFile, ioe);
     }
 
@@ -356,7 +350,7 @@
     String value = topSitesMap.get("wikipedia.org");
     if(value == null) {
-        System.err.println("### wikipedia.org had null value");
+        debug("### wikipedia.org had null value");
     } else {
-        System.err.println("### wikipedia.org had value: " + value);
+        debug("### wikipedia.org had value: " + value);
     } // DEBUG
         */
@@ -370,6 +364,6 @@
 
         /*if(domain.contains("docs.google.com")) {
-            System.err.println("domain with protocol: " + domainWithProtocol);
-            System.err.println("domain: " + domain);
+            debug("domain with protocol: " + domainWithProtocol);
+            debug("domain: " + domain);
             }*/
 
@@ -495,6 +489,5 @@
 
         } catch (IOException ioe) {
-            ioe.printStackTrace();
-            System.err.println("\n@@@@@@@@@ Error writing to one of:" + siteSeedsFile + " or " + siteRegexFile);
+            error("@@@@@@@@@ Error writing to one of:" + siteSeedsFile + " or " + siteRegexFile, ioe);
         }
 
@@ -502,19 +495,17 @@
 
     } catch (IOException ioe) {
-        ioe.printStackTrace();
-        System.err.println("\n@@@@@@@@@ Error writing to one of: ");
-        System.err.println("\t" + seedURLsFile);
-        System.err.println("\t" + urlFilterFile);
-        System.err.println("\t" + domainURLsFile);
-        System.err.println("\t" + topSiteMatchesFile);
+        error("\n@@@@@@@@@ Error writing to one of:\n\t" + seedURLsFile
+              + "\n\t" + urlFilterFile
+              + "\n\t" + domainURLsFile
+              + "\n\t" + topSiteMatchesFile, ioe);
     }
 
     /*
     // BEGIN DEBUG
-    System.err.println("@@@@ TopSitesMap contains: ");
+    debug("@@@@ TopSitesMap contains: ");
     for(Map.Entry<String, String> entry : topSitesMap.entrySet()) {
         String topSite = entry.getKey();
         String urlPattern = entry.getValue();
-        System.err.println(topSite + " - " + urlPattern);
+        debug(topSite + " - " + urlPattern);
     } // END DEBUG
     */
@@ -587,5 +578,5 @@
         if(allowed_url_pattern != null) { // if topSitesMap.containsKey(domain);
         // there's an entry for the URL in the topSitesMap
-        System.err.println("##### A top site matches URL domain " + domain);
+        debug("##### A top site matches URL domain " + domain);
 
         // if we're dealing with SUBDOMAIN-COPY, then the fullSeedDomain, with or without
@@ -648,5 +639,14 @@
      */
     public boolean isBlacklisted(String url) {
-    return isListedInFilterList(blackList, url);
+    boolean isBlackListed = isListedInFilterList(blackList, url);
+
+    // if any portion of the URL contains the word "livejasmin", or even "jasmin" actually,
+    // then it's an adult site, so blacklist the entire domain if it wasn't already blacklisted
+    String domainWithoutProtocol = getDomainForURL(url, false); // remove protocol
+    if(!isBlackListed && url.contains("jasmin")) {
+        warn("### Blacklisting additional domain (likely an adult site): " + domainWithoutProtocol);
+        blackList.put(domainWithoutProtocol, LIST_ENTRY_CONTAINS);
+    }
+    return isBlackListed;
     }
 
@@ -680,5 +680,5 @@
     // if filterListFilename does not exist in the conf folder, just return
     if(MY_CLASSLOADER.getResource(filterListFilename) == null) {
-        System.err.println(filterListFilename + " does not exist");
+        warn("Filter list filename: " + filterListFilename + " does not exist");
         return;
     }
@@ -702,19 +702,19 @@
             filter = filter.substring(1);
             list.put(filter, LIST_ENTRY_STARTSWITH);
-            System.err.println("Match filter startswith: " + filter);
+            //debug("Match filter startswith: " + filter);
         }
         else if(filter.endsWith("$")) {
             filter = filter.substring(0, filter.length()-1);
             list.put(filter, LIST_ENTRY_ENDSWITH);
+            //debug("@@@ Match filter endswith: " + filter);
         }
         else {
            list.put(filter, LIST_ENTRY_CONTAINS);
        }
-        //System.err.println("Got filter: " + filter);
+        //debug("Got filter: " + filter);
        }
 
    } catch (IOException ioe) {
-        ioe.printStackTrace();
-        System.err.println("\n@@@@@@@@@ Error reading into map from file " + filterListFilename);
+        error("@@@@@@@@@ Error reading into map from file " + filterListFilename, ioe);
    }
 
@@ -739,5 +739,5 @@
     for(int i = 0; i < WETFiles.length; i++) {
         File WETFile = WETFiles[i];
-        logger.debug("Processing WETfile: " + WETFile);
+        debug("Processing WETfile: " + WETFile);
 
         // Any .gz files listed means they haven't been unzipped yet. So unzip.
@@ -754,7 +754,6 @@
         // Check the unzipped WETFile exists
 
-        if(!WETFile.exists() || !WETFile.isFile()) {
-        System.err.println("Error: " + WETFile + " does not exist (failure to unzip?)");
-        logger.error("Error: " + WETFile + " does not exist (failure to unzip?)");
+        if(!WETFile.exists() || !WETFile.isFile()) {
+        error("Error: " + WETFile + " does not exist (failure to unzip?)");
         return;
         }
@@ -777,7 +776,29 @@
 
     // --------------- STATIC METHODS AND INNER CLASSED USED BY MAIN -------------- //
+    public static void info(String msg) {
+    System.err.println(msg);
+    logger.info(msg);
+    }
+    public static void debug(String msg) {
+    System.err.println(msg);
+    logger.debug(msg);
+    }
+    public static void warn(String msg) {
+    System.err.println(msg);
+    logger.warn(msg);
+    }
+    public static void error(String msg) {
+    System.err.println(msg);
+    logger.error(msg);
+    }
+    public static void error(String msg, Exception e) {
+    logger.error(msg, e);
+    System.err.println(msg);
+    e.printStackTrace();
+    }
+
     public static void printUsage() {
-    System.err.println("Run this program as:");
-    System.err.println("\tWetProcessor <folder containing wet(.gz) files> <output folder path>");
+    info("Run this program as:");
+    info("\tWetProcessor <folder containing wet(.gz) files> <output folder path>");
     }
 
@@ -824,5 +845,5 @@
         }
         else {
-        System.err.println("File " + f + " is not a directory");
+        info("File " + f + " is not a directory");
         }
         return false;
@@ -838,5 +859,5 @@
     File commoncrawlDir = new File(args[0]);
     if(!commoncrawlDir.exists() || !commoncrawlDir.isDirectory()) {
-        System.out.println("Error: " + args[0] + " does not exist or is not a directory");
+        error("Error: " + args[0] + " does not exist or is not a directory");
         return;
     }
@@ -844,5 +865,5 @@
     File outFolder = new File(args[1]);
     if(!outFolder.exists() || !outFolder.isDirectory()) {
-        System.out.println("Error: " + args[1] + " does not exist or is not a directory.");
+        error("Error: " + args[1] + " does not exist or is not a directory.");
         return;
     }
@@ -855,5 +876,5 @@
     for(int i = 0; i < ccrawlFolders.length; i++) {
         File ccrawlFolder = ccrawlFolders[i];
-        System.err.println("About to process commoncrawl WET files folder: " + ccrawlFolder);
+        info("About to process commoncrawl WET files folder: " + ccrawlFolder);
         ccWETFilesProcessor.processAllWETFilesOfCrawl(ccrawlFolder);
     }
@@ -869,13 +890,12 @@
     ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile, domainURLsFile, topSitesMatchedFile);
 
-    System.out.println("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n");
-
-    System.out.println("\n*** Check " + topSitesMatchedFile + " for sites not prepared for crawling because they matched top sites for which no regex of allowed url patterns were specified in sites-too-big-to-exhaustively-crawl.txt.\n");
+    info("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n");
+
+    info("\n*** Check " + topSitesMatchedFile + " for sites not prepared for crawling because they matched top sites for which no regex of allowed url patterns were specified in sites-too-big-to-exhaustively-crawl.txt.\n");
 
 
     } catch(Exception e) {
     // can get an exception when instantiating CCWETProcessor instance
-    e.printStackTrace();
-    System.err.println(e.getMessage());
+    error(e.getMessage(), e);
     }
 