Changeset 33569 for gs3-extensions


Timestamp: 2019-10-16T20:00:09+13:00 (5 years ago)
Author: ak19
Message:
  1. batchcrawl.sh now does what it should have from the start, which is to move the log.out and UNFINISHED files into the output folder instead of leaving them in the input folder, since the input to_crawl folder can and does get replaced every time I regenerate it after black/white/greylisting more urls.
  2. Blacklisted more adult sites, greylisted more product sites and the .ru, .pl and .tk domains, with exceptions whitelisted in the whitelist file.
  3. CCWETProcessor now looks out for additional adult sites based on URL, adds them to its in-memory blacklist (not the file) and logs the domain for checking and manual addition to the blacklist file.
Location: gs3-extensions/maori-lang-detection
Files: 6 edited

  • gs3-extensions/maori-lang-detection/conf/sites-too-big-to-exhaustively-crawl.txt

    r33568 r33569  
    5454# NOT TOP SITES, BUT SITES WE INSPECTED AND WANT TO CONTROL SIMILARLY TO TOP SITES
    555500.gs,SINGLEPAGE
    56 
    57 # May be a large site
     56# May be a large site with only seedURLs of real relevance
    5857topographic-map.com,SINGLEPAGE
     58ami-media.net,SINGLEPAGE
     59# 2 pages of declarations of human rights in Maori, rest in other languages
     60anitra.net,SINGLEPAGE
     61# special case
     62mi.centr-zashity.ru,SINGLEPAGE
     63
     64# TOP SITE BUT NOT TOP 500
     65www.tumblr.com,SINGLEPAGE
     66
    5967
    6068# TOP SITES
     
    7482# The page's containing folder is whitelisted in case the photos are there.
    7583korora.econ.yale.edu,SINGLEPAGE
     84
    7685
    7786000webhost.com
  • gs3-extensions/maori-lang-detection/conf/url-blacklist-filter.txt

    r33568 r33569  
    2828zh-min-nan.wiktionary.org
    2929
     30######
    3031# unwanted domains
    3132.video-chat.
     
    6970acba.osb-land.com
    7071
     72
     73# just get rid of any URL containing "livejasmin"
     74## livejasmin
     75# Actually: do that in the code (CCWETProcessor) with a log message,
     76# since we actually need to get rid of any sites in their entirety that contain
     77# any url with the string "livejasmin"
     78# So run the program once, check the log for messages mentioning "additional"
     79# adult sites found and add their domains in here.
     80anigma-beauty.com
     81adultfeet.com
     82atopian.org
     83bellydancingvideo.net
     84bmmodelsagency.com
     85brucknergallery.com
     86fuckvidz.org
     87photobattle.net
     88votekat.info
     89
     90# Similar to above, the following contained the string "jasmin" in the URL
     91teenycuties.com
     92a.tiles.mapbox.com
     93blazingteens.net
     94redtubeporn.info
     95osb-land.com
     96totallyhotmales.com
     97babeevents.com
     98talkserver.de
     99hehechat.org
     100fetish-nights.com
     101lesslove.com
     102hebertsvideo.com
     103
    71104# sounds like some pirating site
    72105^http://pirateguides.com/
     
    85118# not sure about the domain name and/or full url seems like it belongs here
    86119abcutie.com
     120
     121# only had a single seedURL and it quickly redirected to an adult site
     122apparactes.gq
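
The comment block added to url-blacklist-filter.txt above describes a manual round trip: run CCWETProcessor once, check its log for the new "additional adult site" warnings, and paste the flagged domains back into this file. Below is a minimal sketch of the log-checking step, assuming the program's stderr was captured to a file such as ccwet.log (the file name and the capture method are assumptions; only the warning prefix comes from the warn() call added to isBlacklisted() in this changeset):

    // FindFlaggedDomains.java - hypothetical helper, not part of this changeset.
    // Scans a captured CCWETProcessor log for the "Blacklisting additional domain"
    // warnings and prints just the domains, ready to be added to url-blacklist-filter.txt.
    import java.nio.file.Files;
    import java.nio.file.Paths;

    public class FindFlaggedDomains {
        public static void main(String[] args) throws Exception {
            String logFile = args.length > 0 ? args[0] : "ccwet.log"; // assumed log capture
            // Prefix taken from the warn() message added to CCWETProcessor.isBlacklisted()
            String prefix = "### Blacklisting additional domain (likely an adult site): ";
            for (String line : Files.readAllLines(Paths.get(logFile))) {
                int at = line.indexOf(prefix);
                if (at >= 0) {
                    System.out.println(line.substring(at + prefix.length()).trim());
                }
            }
        }
    }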
  • gs3-extensions/maori-lang-detection/conf/url-greylist-filter.txt

    r33568 r33569  
    1818abacre.com
    1919cn-huafu.net
     20apteka.social
     21
    2022
    2123# not product stores but autotranslated?
     
    25271videosmusica.com
    2628256file.com
    27 7773033.ru
    28 abali.ru
    29 allbeautyone.ru
     29# already in greylisting of all .ru
     30#7773033.ru
     31#abali.ru
     32#allbeautyone.ru
     33aqualuz.org
    3034
    3135# if page doesn't load and can't be tested
     
    3337www.kiterewa.pl
    3438
    35 # license plate site?
    36 eba.com.ru
     39
     40
     41# MANUALLY INSPECTED URLS AND ADDED TO GREYLIST
     42
     43# license plate site? - already in greylisting of all .ru
     44#eba.com.ru
    3745
    3846# As per archive.org, there's just a photo on the defunct page at this site
     
    4250# seems to be Indonesian or Malaysian Bible rather than in Maori or any Polynesian language
    4351alkitab.life:2022
     52
     53# appears defunct
     54alixira.com
     55
     56# single seedURL was not a page in Maori, but one about global languages.
     57# And the rest of the domain appears to be in English
     58anglican.org
     59
     60
     61### TLDs that we greylist - any exceptions will be in the whitelist
     62# Our list of .ru and .pl domains were not relevant
     63.ru/
     64.pl/
     65.tk/
  • gs3-extensions/maori-lang-detection/conf/url-whitelist-filter.txt

    r33559 r33569  
    1010# some particular other urls on yale.edu
    1111http://korora.econ.yale.edu/phillips/archive/hauraki.htm
     12
     13# We've greylisted all .ru sites, but the following
     14# Russian websites contain actual Maori language content
     15http://www.krassotkin.ru/sites/prayer.su/maori/
     16https://mi.centr-zashity.ru/
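
The new greylist and whitelist entries work together through the filter syntax handled by initURLFilterList() further down in this changeset: a filter starting with "^" is a starts-with match, one ending with "$" is an ends-with match, and everything else, including the new ".ru/", ".pl/" and ".tk/" lines, is a plain substring match. The sketch below illustrates that interplay using URLs taken from the edited filter files; the order in which the lists are consulted is an assumption, since isListedInFilterList() itself is not changed here:

    // TldGreylistSketch.java - hypothetical illustration, not part of this changeset.
    // Shows how contains-style filters such as ".ru/" catch whole TLDs while the two
    // whitelisted Russian URLs are still kept. The keep/greylist decision order is an
    // assumption about how CCWETProcessor consults its lists.
    import java.util.Arrays;
    import java.util.List;

    public class TldGreylistSketch {
        static final List<String> GREYLIST = Arrays.asList(".ru/", ".pl/", ".tk/");
        static final List<String> WHITELIST = Arrays.asList(
                "http://www.krassotkin.ru/sites/prayer.su/maori/",
                "https://mi.centr-zashity.ru/");

        // LIST_ENTRY_CONTAINS semantics: a filter matches if it occurs anywhere in the URL
        static boolean matchesAny(List<String> filters, String url) {
            for (String filter : filters) {
                if (url.contains(filter)) {
                    return true;
                }
            }
            return false;
        }

        public static void main(String[] args) {
            String[] urls = {
                "http://7773033.ru/catalog",         // caught by the ".ru/" greylist entry
                "https://mi.centr-zashity.ru/",      // matches ".ru/" but is whitelisted
                "http://www.kiterewa.pl/index.html"  // caught by the ".pl/" greylist entry
            };
            for (String url : urls) {
                boolean keep = matchesAny(WHITELIST, url) || !matchesAny(GREYLIST, url);
                System.out.println(url + " -> " + (keep ? "keep" : "greylist for inspection"));
            }
        }
    }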
  • gs3-extensions/maori-lang-detection/hdfs-cc-work/scripts/batchcrawl.sh

    r33567 r33569  
    7474        echo "CRAWL FAILED." 2>&1 | tee -a ${siteDir}log.out
    7575    fi
    76    
     76
     77
     78    # move the peripheral crawl products (the log.out and UNFINISHED files)
     79    # from the input to the output folder. This way we can re-run the crawl and
     80    # the original output will still have been preserved
     81    mv ${siteDir}log.out $outputDir/$crawlId/log.out
     82    mv ${siteDir}UNFINISHED $outputDir/$crawlId/UNFINISHED
    7783}
    7884
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java

    r33568 r33569  
    150150       
    151151    } catch(Exception e) {
    152         System.err.println("Exception attempting to read properties from config.properties.");
    153         logger.error("Exception attempting to read properties from config.properties.");
    154         e.printStackTrace();
     152        error("Exception attempting to read properties from config.properties.", e);
    155153    }
    156154
    157155    if(configProperties.size() == 0) {
    158         System.err.println("*** Warning: no values read into config properties. Using defaults.");
     156        warn("*** Warning: no values read into config properties. Using defaults.");
    159157    }
    160158   
     
    192190
    193191    // prepare our blacklist, greylist (for inspection) and whitelist
    194     System.err.println("Loading blacklist.");
     192    info("Loading blacklist.");
    195193    blackList = new HashMap<String, Integer>();
    196194    initURLFilterList(blackList, "url-blacklist-filter.txt");
    197195   
    198     System.err.println("Loading greylist.");
     196    info("Loading greylist.");
    199197    greyList = new HashMap<String, Integer>();
    200198    initURLFilterList(greyList, "url-greylist-filter.txt");
    201199   
    202     System.err.println("Loading whitelist.");
     200    info("Loading whitelist.");
    203201    whiteList = new HashMap<String, Integer>();
    204202    initURLFilterList(whiteList, "url-whitelist-filter.txt");
    205203
    206204    // Create the map of topSites
    207     System.err.println("Loading map of topsites with regex of allowable url patterns for each topsite.");
     205    info("Loading map of topsites with regex of allowable url patterns for each topsite.");
    208206    topSitesMap = new HashMap<String, String>();
    209207   
     
    228226        topSitesMap.put(topsite, allowed_url_pattern);
    229227
    230         //System.err.println("@@@@ topsite: " + topsite + " - " + allowed_url_pattern);
     228        //debug("@@@@ topsite: " + topsite + " - " + allowed_url_pattern);
    231229       
    232230        }
    233231    } catch(Exception e) {
    234         e.printStackTrace();
    235         System.err.println("\n@@@@@@@@@ Error attempting to parse CSV format of text file " + topSitesCSVData);
    236     }
    237    
    238    
     232        error("@@@@@@@@@ Error attempting to parse CSV format of text file " + topSitesCSVData, e);
     233    }   
    239234 
    240     //System.err.println("Prematurely terminating for testing purposes.");
     235    //debug("Prematurely terminating for testing purposes.");
    241236    //System.exit(-1);
    242237    }
     
    309304            urlsSet = new TreeSet<String>();
    310305            urlsSet.add(url);
    311             domainsToURLsMap.put(domainWithProtocol, urlsSet);
     306            domainsToURLsMap.put(domainWithProtocol, urlsSet);         
    312307        } else {
    313308            urlsSet = domainsToURLsMap.get(domainWithProtocol);
     
    317312        }
    318313    } catch (IOException ioe) {
    319         ioe.printStackTrace();
    320         System.err.println("\n@@@@@@@@@ Error reading in urls from file " + this.keepURLsFile);
     314        error("@@@@@@@@@ Error reading in urls from file " + this.keepURLsFile, ioe);
    321315    }
    322316
     
    356350    String value = topSitesMap.get("wikipedia.org");
    357351    if(value == null) {
    358         System.err.println("### wikipedia.org had null value");
     352        debug("### wikipedia.org had null value");
    359353    } else {
    360         System.err.println("### wikipedia.org had value: " + value);
     354        debug("### wikipedia.org had value: " + value);
    361355    } // DEBUG
    362356        */
     
    370364
    371365        /*if(domain.contains("docs.google.com")) {
    372             System.err.println("domain with protocol: " + domainWithProtocol);
    373             System.err.println("domain: " + domain);
     366            debug("domain with protocol: " + domainWithProtocol);
     367            debug("domain: " + domain);
    374368            }*/
    375369       
     
    495489           
    496490        } catch (IOException ioe) {
    497             ioe.printStackTrace();
    498             System.err.println("\n@@@@@@@@@ Error writing to one of:" + siteSeedsFile + " or " + siteRegexFile);
     491            error("@@@@@@@@@ Error writing to one of:" + siteSeedsFile + " or " + siteRegexFile, ioe);
    499492        }
    500493       
     
    502495
    503496    } catch (IOException ioe) {
    504         ioe.printStackTrace();
    505         System.err.println("\n@@@@@@@@@ Error writing to one of: ");
    506         System.err.println("\t" + seedURLsFile);
    507         System.err.println("\t" + urlFilterFile);
    508         System.err.println("\t" + domainURLsFile); 
    509         System.err.println("\t" + topSiteMatchesFile);   
     497        error("\n@@@@@@@@@ Error writing to one of:\n\t" + seedURLsFile
     498                   + "\n\t" + urlFilterFile
     499                   + "\n\t" + domainURLsFile
     500                   + "\n\t" + topSiteMatchesFile, ioe);
    510501    }   
    511502
    512503    /*
    513504    // BEGIN DEBUG
    514     System.err.println("@@@@ TopSitesMap contains: ");
     505    debug("@@@@ TopSitesMap contains: ");
    515506    for(Map.Entry<String, String> entry : topSitesMap.entrySet()) {
    516507        String topSite = entry.getKey();
    517508        String urlPattern = entry.getValue();       
    518         System.err.println(topSite + " - " + urlPattern);
     509        debug(topSite + " - " + urlPattern);
    519510    } // END DEBUG
    520511    */
     
    587578        if(allowed_url_pattern != null) { // if topSitesMap.containsKey(domain);
    588579        // there's an entry for the URL in the topSitesMap
    589         System.err.println("##### A top site matches URL domain " + domain);
     580        debug("##### A top site matches URL domain " + domain);
    590581
    591582        // if we're dealing with SUBDOMAIN-COPY, then the fullSeedDomain, with or without
     
    648639     */
    649640    public boolean isBlacklisted(String url) {
    650     return isListedInFilterList(blackList, url);
     641    boolean isBlackListed = isListedInFilterList(blackList, url);
     642
     643    // if any portion of the URL contains the word "livejasmin", or even "jasmin" actually,
     644    // then it's an adult site, so blacklist the entire domain if it wasn't already blacklisted
     645    String domainWithoutProtocol = getDomainForURL(url, false); // remove protocol
     646    if(!isBlackListed && url.contains("jasmin")) {
     647        warn("### Blacklisting additional domain (likely an adult site): " + domainWithoutProtocol);
     648        blackList.put(domainWithoutProtocol, LIST_ENTRY_CONTAINS);
     649    }
     650    return isBlackListed;
    651651    }
    652652   
     
    680680    // if filterListFilename does not exist in the conf folder, just return
    681681    if(MY_CLASSLOADER.getResource(filterListFilename) == null) {
    682         System.err.println(filterListFilename + " does not exist");
     682        warn("Filter list filename: " + filterListFilename + " does not exist");
    683683        return;     
    684684    }
     
    702702            filter = filter.substring(1);
    703703            list.put(filter, LIST_ENTRY_STARTSWITH);
    704             System.err.println("Match filter startswith: " + filter);
     704            //debug("Match filter startswith: " + filter);
    705705        }
    706706        else if(filter.endsWith("$")) {
    707707            filter = filter.substring(0, filter.length()-1);
    708708            list.put(filter, LIST_ENTRY_ENDSWITH);
     709            //debug("@@@ Match filter endswith: " + filter);
    709710        }
    710711        else {
    711712            list.put(filter, LIST_ENTRY_CONTAINS);
    712713        }
    713         //System.err.println("Got filter: " + filter);
     714        //debug("Got filter: " + filter);
    714715        }
    715716       
    716717    } catch (IOException ioe) {
    717         ioe.printStackTrace();
    718         System.err.println("\n@@@@@@@@@ Error reading into map from file " + filterListFilename);
     718        error("@@@@@@@@@ Error reading into map from file " + filterListFilename, ioe);
    719719    }
    720720   
     
    739739    for(int i = 0; i < WETFiles.length; i++) {
    740740        File WETFile = WETFiles[i];     
    741         logger.debug("Processing WETfile: " + WETFile);
     741        debug("Processing WETfile: " + WETFile);
    742742
    743743        // Any .gz files listed means they haven't been unzipped yet. So unzip.
     
    754754        // Check the unzipped WETFile exists       
    755755
    756         if(!WETFile.exists() || !WETFile.isFile()) {
    757         System.err.println("Error: " + WETFile + " does not exist (failure to unzip?)");
    758         logger.error("Error: " + WETFile + " does not exist (failure to unzip?)");
     756        if(!WETFile.exists() || !WETFile.isFile()) {       
     757        error("Error: " + WETFile + " does not exist (failure to unzip?)");
    759758        return;
    760759        }
     
    777776
    778777    // --------------- STATIC METHODS AND INNER CLASSED USED BY MAIN -------------- //
     778    public static void info(String msg) {
     779    System.err.println(msg);
     780    logger.info(msg);
     781    }
     782    public static void debug(String msg) {
     783    System.err.println(msg);
     784    logger.debug(msg);
     785    }
     786    public static void warn(String msg) {
     787    System.err.println(msg);
     788    logger.warn(msg);
     789    }
     790    public static void error(String msg) {
     791    System.err.println(msg);
     792    logger.error(msg);
     793    }
     794    public static void error(String msg, Exception e) {
     795    logger.error(msg, e);
     796    System.err.println(msg);
     797    e.printStackTrace();
     798    }
     799   
    779800    public static void printUsage() {
    780     System.err.println("Run this program as:");
    781     System.err.println("\tWetProcessor <folder containing wet(.gz) files> <output folder path>");   
     801    info("Run this program as:");
     802    info("\tWetProcessor <folder containing wet(.gz) files> <output folder path>");
    782803    }
    783804
     
    824845        }
    825846        else {
    826         System.err.println("File " + f + " is not a directory");
     847        info("File " + f + " is not a directory");
    827848        }
    828849        return false;                 
     
    838859    File commoncrawlDir = new File(args[0]);
    839860    if(!commoncrawlDir.exists() || !commoncrawlDir.isDirectory()) {
    840         System.out.println("Error: " + args[0] + " does not exist or is not a directory");
     861        error("Error: " + args[0] + " does not exist or is not a directory");
    841862        return;
    842863    }
     
    844865    File outFolder = new File(args[1]);
    845866    if(!outFolder.exists() || !outFolder.isDirectory()) {
    846         System.out.println("Error: " + args[1] + " does not exist or is not a directory.");
     867        error("Error: " + args[1] + " does not exist or is not a directory.");
    847868        return;
    848869    }   
     
    855876    for(int i = 0; i < ccrawlFolders.length; i++) {
    856877        File ccrawlFolder = ccrawlFolders[i];
    857         System.err.println("About to process commoncrawl WET files folder: " + ccrawlFolder);
     878        info("About to process commoncrawl WET files folder: " + ccrawlFolder);
    858879        ccWETFilesProcessor.processAllWETFilesOfCrawl(ccrawlFolder);       
    859880    }
     
    869890    ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile, domainURLsFile, topSitesMatchedFile);
    870891
    871     System.out.println("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n");
    872 
    873     System.out.println("\n*** Check " + topSitesMatchedFile + " for sites not prepared for crawling because they matched top sites for which no regex of allowed url patterns were specified in sites-too-big-to-exhaustively-crawl.txt.\n");
     892    info("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n");
     893
     894    info("\n*** Check " + topSitesMatchedFile + " for sites not prepared for crawling because they matched top sites for which no regex of allowed url patterns were specified in sites-too-big-to-exhaustively-crawl.txt.\n");
    874895   
    875896   
    876897    } catch(Exception e) {
    877898    // can get an exception when instantiating CCWETProcessor instance
    878     e.printStackTrace();
    879     System.err.println(e.getMessage());
     899    error(e.getMessage(), e);
    880900    }
    881901   