Changeset 33624

Show
Ignore:
Timestamp:
05.11.2019 21:48:50 (9 days ago)
Author:
ak19
Message:

Some cleanup surrounding the function createSeedURLsFiles(), now renamed to prepareSitesForNutchCrawling(). The main method now also takes a flag indicating whether it should prepare the possible-product-sites.txt file, as that is a step that takes a long time.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java

    r33623 r33624  
    251251     * as output the URL seed list and regex-urlfilter text files required by nutch, see 
    252252     * https://cwiki.apache.org/confluence/display/nutch/NutchTutorial 
     253     * 
     254     * This method creates seedURLs files and url-regexfilter files needed by nutch, instructing 
     255     * it what portion to crawl of each site. 
     256     * 
     257     * The topSiteMatches file also gets created, listing sites excluded from crawling as 
     258     * they're too large to exhaustively crawl. The user will be told to inspect this file 
     259     * after this program has finished running. 
     260     * 
     261     * If checkForPossibleProductSites, then any urls containing /mi(/) that are outside of NZ 
     262     * or whose geolocation isn't known will end up in the file denoted by possibleProductSitesFile 
     263     * 
    253264     */ 
    254     public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile, 
     265    public void prepareSitesForNutchCrawling(File seedURLsFile, File urlFilterFile, 
    255266                    File domainURLsFile, File topSiteMatchesFile, 
    256                     File possibleProductSitesFile) { 
     267                    boolean checkForPossibleProductSites, File possibleProductSitesFile) { 
    257268    // Maintain a Map of unique domains mapped to seed urls at that domain 
    258269    // TreeSet: by default, "the elements are ordered using their natural ordering" 
     
    294305        } 
    295306 
    296         /* 
    297         // Dr Nichols said that a url that was located outside the country and 
    298         // which had /mi/ URLs was more likely to be an autotranslated (product) site. 
    299         // Following Dr Nichols' idea, let's keep a look out for more product sites: 
    300         // if any URL contains /mi AND the tld of its domain is outside of New Zealand 
    301         // then add that domain (if not already added) and that url into a file 
    302         // for later manual inspection 
    303         if(!domainWithProtocol.endsWith(".nz") 
    304            && (url.contains("/mi/") || url.endsWith("/mi"))) { 
    305              
    306             if(!possibleProductDomains.contains(domainWithProtocol)) { 
    307  
    308             String countryCode = ""; 
    309             try { 
    310                 // more expensive test, so do this only if above conditions are true: 
    311                 countryCode = Utility.getCountryCodeOfDomain(domainWithProtocol, geoLiteCityDatFile); 
    312                 System.err.println("@@@@ Got country code: " + countryCode); 
    313             } catch(Exception exceptObj) { 
    314                 countryCode = ""; // forces domain to be included for inspection 
     307        if(checkForPossibleProductSites) {           
     308            // Dr Nichols said that a url that was located outside the country and 
     309            // which had /mi/ URLs was more likely to be an autotranslated (product) site. 
     310            // Following Dr Nichols' idea, let's keep a look out for more product sites: 
     311            // if any URL contains /mi AND the tld of its domain is outside of New Zealand 
     312            // then add that domain (if not already added) and that url into a file 
     313            // for later manual inspection 
     314            if(!domainWithProtocol.endsWith(".nz") 
     315               && (url.contains("/mi/") || url.endsWith("/mi"))) { 
     316             
     317            if(!possibleProductDomains.contains(domainWithProtocol)) { 
    315318                 
    316                 logger.error("Could not check if domain " + domainWithProtocol 
    317                   + " was in country: " + countryCode, 
    318                   exceptObj); 
     319                String countryCode = ""; 
     320                try { 
     321                // more expensive test, so do this only if above conditions are true: 
     322                countryCode = Utility.getCountryCodeOfDomain(domainWithProtocol, geoLiteCityDatFile); 
     323                System.err.println("@@@@ Got country code: " + countryCode); 
     324                } catch(Exception exceptObj) { 
     325                countryCode = ""; // forces domain to be included for inspection 
     326                 
     327                logger.error("Could not check if domain " + domainWithProtocol 
     328                         + " was in country: " + countryCode, 
     329                         exceptObj); 
     330                } 
     331                 
     332                boolean isInNZ = countryCode.toLowerCase().equals("nz"); 
     333                 
     334                 
     335                //if(!Utility.isDomainInCountry(domainWithProtocol, "nz", geoLiteCityDatFile)) { 
     336                if(!isInNZ) { 
     337                possibleProductDomains.add(domainWithProtocol); 
     338                // write both domain and a sample seedURL on that site out to file 
     339                possibleProductSitesWriter.write(countryCode + " : " + domainWithProtocol + "\n");               
     340                possibleProductSitesWriter.write("\t" + url + "\n"); 
     341                } 
    319342            } 
    320  
    321             boolean isInNZ = countryCode.toLowerCase().equals("nz"); 
    322              
    323  
    324             //if(!Utility.isDomainInCountry(domainWithProtocol, "nz", geoLiteCityDatFile)) { 
    325             if(!isInNZ) { 
    326                 possibleProductDomains.add(domainWithProtocol); 
    327                 // write both domain and a sample seedURL on that site out to file 
    328                 possibleProductSitesWriter.write(countryCode + " : " + domainWithProtocol + "\n");               
    329                 possibleProductSitesWriter.write("\t" + url + "\n"); 
    330             } 
     343            //else { 
     344            // already wrote out domain to file at some point, write just the URL out to file 
     345            //possibleProductSitesWriter.write("\t" + url + "\n"); 
     346            //}          
    331347            } 
    332             //else { 
    333             // already wrote out domain to file at some point, write just the URL out to file 
    334             //possibleProductSitesWriter.write("\t" + url + "\n"); 
    335             //} 
    336         } 
    337         */ 
     348        } 
    338349        } 
    339350    } catch (IOException ioe) { 
     
    803814    public static void printUsage() { 
    804815    System.err.println("Run this program as:"); 
    805     System.err.println("\tCCWetProcessor <path to 'ccrawl-data' input folder> <output folder path>");    
     816    System.err.println("\tCCWetProcessor <path to 'ccrawl-data' input folder> <output folder path> [--check-for-product-sites]");    
    806817    } 
    807818 
     
    855866 
    856867    public static void main(String[] args) { 
    857     if(args.length != 2) { 
     868    if(args.length < 2 || args.length > 3) { 
    858869        printUsage(); 
    859870        return; 
     871    } 
     872     
     873    boolean checkForPossibleProductSites = false; 
     874    if(args.length == 3) { 
     875        if(!args[2].equals("--check-for-product-sites")) { 
     876        printUsage(); 
     877        return; 
     878        } else { 
     879        checkForPossibleProductSites = true; 
     880        } 
    860881    } 
    861882     
     
    870891        logger.error("Error: " + args[1] + " does not exist or is not a directory."); 
    871892        return; 
    872     }    
     893    } 
     894 
    873895 
    874896    try { 
     
    891913    File topSitesMatchedFile = new File(outFolder, "unprocessed-topsite-matches.txt"); 
    892914    File possibleProductSitesFile = new File(outFolder, "possible-product-sites.txt"); 
    893      
    894     ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile, domainURLsFile, topSitesMatchedFile, possibleProductSitesFile); 
     915 
     916     
     917    ccWETFilesProcessor.prepareSitesForNutchCrawling(seedURLsFile, urlFilterFile, domainURLsFile, topSitesMatchedFile, checkForPossibleProductSites, possibleProductSitesFile); 
    895918 
    896919    logger.info("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n"); 
    897920 
    898     logger.info("\n*** Check " + topSitesMatchedFile + " for sites not prepared for crawling because they matched top sites for which no regex of allowed url patterns were specified in sites-too-big-to-exhaustively-crawl.txt.\n"); 
     921    if(checkForPossibleProductSites) { 
     922        logger.info("\n*** Check " + topSitesMatchedFile + " for sites not prepared for crawling because they matched top sites for which no regex of allowed url patterns were specified in sites-too-big-to-exhaustively-crawl.txt.\n"); 
     923    } else { 
     924        possibleProductSitesFile.delete(); 
     925    } 
    899926     
    900927