Changeset 33624


Ignore:
Timestamp:
2019-11-05T21:48:50+13:00 (4 years ago)
Author:
ak19
Message:

Some cleanup surrounding the function createSeedURLsFiles(), which has been renamed to prepareSitesForNutchCrawling(). The main method now also takes an optional flag (--check-for-product-sites) indicating whether it should prepare the possible-product-sites.txt file, as that step takes a long time.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java

    r33623 r33624  
    251251     * as output the URL seed list and regex-urlfilter text files required by nutch, see
    252252     * https://cwiki.apache.org/confluence/display/nutch/NutchTutorial
     253     *
     254     * This method creates seedURLs files and url-regexfilter files needed by nutch, instructing
     255     * it what portion to crawl of each site.
     256     *
     257     * The topSiteMatches file also gets created, listing sites excluded from crawling as
     258     * they're too large to exhaustively crawl. The user will be told to inspect this file
     259     * after this program has finished running.
     260     *
     261     * If checkForPossibleProductSites, then any urls containing /mi(/) that are outside of NZ
     262     * or whose geolocation isn't known will end up in the file denoted by possibleProductSitesFile
     263     *
    253264     */
    254     public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile,
     265    public void prepareSitesForNutchCrawling(File seedURLsFile, File urlFilterFile,
    255266                    File domainURLsFile, File topSiteMatchesFile,
    256                     File possibleProductSitesFile) {
     267                    boolean checkForPossibleProductSites, File possibleProductSitesFile) {
    257268    // Maintain a Map of unique domains mapped to seed urls at that domain
    258269    // TreeSet: by default, "the elements are ordered using their natural ordering"
     
    294305        }
    295306
    296         /*
    297         // Dr Nichols said that a url that was located outside the country and
    298         // which had /mi/ URLs was more likely to be an autotranslated (product) site.
    299         // Following Dr Nichols' idea, let's keep a look out for more product sites:
    300         // if any URL contains /mi AND the tld of its domain is outside of New Zealand
    301         // then add that domain (if not already added) and that url into a file
    302         // for later manual inspection
    303         if(!domainWithProtocol.endsWith(".nz")
    304            && (url.contains("/mi/") || url.endsWith("/mi"))) {
    305            
    306             if(!possibleProductDomains.contains(domainWithProtocol)) {
    307 
    308             String countryCode = "";
    309             try {
    310                 // more expensive test, so do this only if above conditions are true:
    311                 countryCode = Utility.getCountryCodeOfDomain(domainWithProtocol, geoLiteCityDatFile);
    312                 System.err.println("@@@@ Got country code: " + countryCode);
    313             } catch(Exception exceptObj) {
    314                 countryCode = ""; // forces domain to be included for inspection
     307        if(checkForPossibleProductSites) {         
     308            // Dr Nichols said that a url that was located outside the country and
     309            // which had /mi/ URLs was more likely to be an autotranslated (product) site.
     310            // Following Dr Nichols' idea, let's keep a look out for more product sites:
     311            // if any URL contains /mi AND the tld of its domain is outside of New Zealand
     312            // then add that domain (if not already added) and that url into a file
     313            // for later manual inspection
     314            if(!domainWithProtocol.endsWith(".nz")
     315               && (url.contains("/mi/") || url.endsWith("/mi"))) {
     316           
     317            if(!possibleProductDomains.contains(domainWithProtocol)) {
    315318               
    316                 logger.error("Could not check if domain " + domainWithProtocol
    317                   + " was in country: " + countryCode,
    318                   exceptObj);
     319                String countryCode = "";
     320                try {
     321                // more expensive test, so do this only if above conditions are true:
     322                countryCode = Utility.getCountryCodeOfDomain(domainWithProtocol, geoLiteCityDatFile);
     323                System.err.println("@@@@ Got country code: " + countryCode);
     324                } catch(Exception exceptObj) {
     325                countryCode = ""; // forces domain to be included for inspection
     326               
     327                logger.error("Could not check if domain " + domainWithProtocol
     328                         + " was in country: " + countryCode,
     329                         exceptObj);
     330                }
     331               
     332                boolean isInNZ = countryCode.toLowerCase().equals("nz");
     333               
     334               
     335                //if(!Utility.isDomainInCountry(domainWithProtocol, "nz", geoLiteCityDatFile)) {
     336                if(!isInNZ) {
     337                possibleProductDomains.add(domainWithProtocol);
     338                // write both domain and a sample seedURL on that site out to file
     339                possibleProductSitesWriter.write(countryCode + " : " + domainWithProtocol + "\n");             
     340                possibleProductSitesWriter.write("\t" + url + "\n");
     341                }
    319342            }
    320 
    321             boolean isInNZ = countryCode.toLowerCase().equals("nz");
    322            
    323 
    324             //if(!Utility.isDomainInCountry(domainWithProtocol, "nz", geoLiteCityDatFile)) {
    325             if(!isInNZ) {
    326                 possibleProductDomains.add(domainWithProtocol);
    327                 // write both domain and a sample seedURL on that site out to file
    328                 possibleProductSitesWriter.write(countryCode + " : " + domainWithProtocol + "\n");             
    329                 possibleProductSitesWriter.write("\t" + url + "\n");
    330             }
     343            //else {
     344            // already wrote out domain to file at some point, write just the URL out to file
     345            //possibleProductSitesWriter.write("\t" + url + "\n");
     346            //}         
    331347            }
    332             //else {
    333             // already wrote out domain to file at some point, write just the URL out to file
    334             //possibleProductSitesWriter.write("\t" + url + "\n");
    335             //}
    336         }
    337         */
     348        }
    338349        }
    339350    } catch (IOException ioe) {
     
    803814    public static void printUsage() {
    804815    System.err.println("Run this program as:");
    805     System.err.println("\tCCWetProcessor <path to 'ccrawl-data' input folder> <output folder path>");   
     816    System.err.println("\tCCWetProcessor <path to 'ccrawl-data' input folder> <output folder path> [--check-for-product-sites]");   
    806817    }
    807818
     
    855866
    856867    public static void main(String[] args) {
    857     if(args.length != 2) {
     868    if(args.length < 2 || args.length > 3) {
    858869        printUsage();
    859870        return;
     871    }
     872   
     873    boolean checkForPossibleProductSites = false;
     874    if(args.length == 3) {
     875        if(!args[2].equals("--check-for-product-sites")) {
     876        printUsage();
     877        return;
     878        } else {
     879        checkForPossibleProductSites = true;
     880        }
    860881    }
    861882   
     
    870891        logger.error("Error: " + args[1] + " does not exist or is not a directory.");
    871892        return;
    872     }   
     893    }
     894
    873895
    874896    try {
     
    891913    File topSitesMatchedFile = new File(outFolder, "unprocessed-topsite-matches.txt");
    892914    File possibleProductSitesFile = new File(outFolder, "possible-product-sites.txt");
    893    
    894     ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile, domainURLsFile, topSitesMatchedFile, possibleProductSitesFile);
     915
     916   
     917    ccWETFilesProcessor.prepareSitesForNutchCrawling(seedURLsFile, urlFilterFile, domainURLsFile, topSitesMatchedFile, checkForPossibleProductSites, possibleProductSitesFile);
    895918
    896919    logger.info("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n");
    897920
    898     logger.info("\n*** Check " + topSitesMatchedFile + " for sites not prepared for crawling because they matched top sites for which no regex of allowed url patterns were specified in sites-too-big-to-exhaustively-crawl.txt.\n");
     921    if(checkForPossibleProductSites) {
     922        logger.info("\n*** Check " + topSitesMatchedFile + " for sites not prepared for crawling because they matched top sites for which no regex of allowed url patterns were specified in sites-too-big-to-exhaustively-crawl.txt.\n");
     923    } else {
     924        possibleProductSitesFile.delete();
     925    }
    899926   
    900927   
Note: See TracChangeset for help on using the changeset viewer.