Context Navigation

← Previous Changeset
Next Changeset →

Changeset 33624

Timestamp:

2019-11-05T21:48:50+13:00 (4 years ago)

Author:

ak19

Message:

Some cleanup surrounding the function createSeedURLsFiles(), which has been renamed to prepareSitesForNutchCrawling(). The main method now also takes a flag indicating whether it should prepare the possible-product-sites.txt file, as that step takes a long time.

File:

: 1 edited

gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java (modified) (6 diffs)

Legend:

: Unmodified
: Added
: Removed

gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java

-              r33623
+              r33624
      * as output the URL seed list and regex-urlfilter text files required by nutch, see
      * https://cwiki.apache.org/confluence/display/nutch/NutchTutorial
+     *
+     * This method creates seedURLs files and url-regexfilter files needed by nutch, instructing
+     * it what portion to crawl of each site.
+     *
+     * The topSiteMatches file also gets created, listing sites excluded from crawling as
+     * they're too large to exhaustively crawl. The user will be told to inspect this file
+     * after this program has finished running.
+     *
+     * If checkForPossibleProductSites is true, then any urls containing /mi(/) that are outside of NZ
+     * or whose geolocation isn't known will end up in the file denoted by possibleProductSitesFile.
+     *
      */
     public void createSeedURLsFiles(File seedURLsFile, File urlFilterFile,
+    public void prepareSitesForNutchCrawling(File seedURLsFile, File urlFilterFile,
                     File domainURLsFile, File topSiteMatchesFile,
                     File possibleProductSitesFile) {
+                    boolean checkForPossibleProductSites, File possibleProductSitesFile) {
     // Maintain a Map of unique domains mapped to seed urls at that domain
     // TreeSet: by default, "the elements are ordered using their natural ordering"
 …
+        }
+        /*
+        // Dr Nichols said that a url that was located outside the country and
+        // which had /mi/ URLs was more likely to be an autotranslated (product) site.
+        // Following Dr Nichols' idea, let's keep a look out for more product sites:
+        // if any URL contains /mi AND the tld of its domain is outside of New Zealand
+        // then add that domain (if not already added) and that url into a file
+        // for later manual inspection
+        if(!domainWithProtocol.endsWith(".nz")
+           && (url.contains("/mi/") || url.endsWith("/mi"))) {
+            if(!possibleProductDomains.contains(domainWithProtocol)) {
+            String countryCode = "";
+            try {
+                // more expensive test, so do this only if above conditions are true:
+                countryCode = Utility.getCountryCodeOfDomain(domainWithProtocol, geoLiteCityDatFile);
+                System.err.println("@@@@ Got country code: " + countryCode);
+            } catch(Exception exceptObj) {
+                countryCode = ""; // forces domain to be included for inspection
+        if(checkForPossibleProductSites) {
+            // Dr Nichols said that a url that was located outside the country and
+            // which had /mi/ URLs was more likely to be an autotranslated (product) site.
+            // Following Dr Nichols' idea, let's keep a look out for more product sites:
+            // if any URL contains /mi AND the tld of its domain is outside of New Zealand
+            // then add that domain (if not already added) and that url into a file
+            // for later manual inspection
+            if(!domainWithProtocol.endsWith(".nz")
+               && (url.contains("/mi/") || url.endsWith("/mi"))) {
+            if(!possibleProductDomains.contains(domainWithProtocol)) {
+                logger.error("Could not check if domain " + domainWithProtocol
+                  + " was in country: " + countryCode,
+                  exceptObj);
+                String countryCode = "";
+                try {
+                // more expensive test, so do this only if above conditions are true:
+                countryCode = Utility.getCountryCodeOfDomain(domainWithProtocol, geoLiteCityDatFile);
+                System.err.println("@@@@ Got country code: " + countryCode);
+                } catch(Exception exceptObj) {
+                countryCode = ""; // forces domain to be included for inspection
+                logger.error("Could not check if domain " + domainWithProtocol
+                         + " was in country: " + countryCode,
+                         exceptObj);
+                }
+                boolean isInNZ = countryCode.toLowerCase().equals("nz");
+                //if(!Utility.isDomainInCountry(domainWithProtocol, "nz", geoLiteCityDatFile)) {
+                if(!isInNZ) {
+                possibleProductDomains.add(domainWithProtocol);
+                // write both domain and a sample seedURL on that site out to file
+                possibleProductSitesWriter.write(countryCode + " : " + domainWithProtocol + "\n");
+                possibleProductSitesWriter.write("\t" + url + "\n");
+                }
+            }
+            boolean isInNZ = countryCode.toLowerCase().equals("nz");
+            //if(!Utility.isDomainInCountry(domainWithProtocol, "nz", geoLiteCityDatFile)) {
+            if(!isInNZ) {
+                possibleProductDomains.add(domainWithProtocol);
+                // write both domain and a sample seedURL on that site out to file
+                possibleProductSitesWriter.write(countryCode + " : " + domainWithProtocol + "\n");
+                possibleProductSitesWriter.write("\t" + url + "\n");
+            }
+            //else {
+            // already wrote out domain to file at some point, write just the URL out to file
+            //possibleProductSitesWriter.write("\t" + url + "\n");
+            //}
+            }
+            //else {
+            // already wrote out domain to file at some point, write just the URL out to file
+            //possibleProductSitesWriter.write("\t" + url + "\n");
+            //}
+        }
+        */
+        }
+        }
     } catch (IOException ioe) {
 …
     public static void printUsage() {
     System.err.println("Run this program as:");
     System.err.println("\tCCWetProcessor <path to 'ccrawl-data' input folder> <output folder path>");
+    System.err.println("\tCCWetProcessor <path to 'ccrawl-data' input folder> <output folder path> [--check-for-product-sites]");
+    }
 …
     public static void main(String[] args) {
     if(args.length != 2) {
+    if(args.length < 2 || args.length > 3) {
         printUsage();
         return;
+    }
+    boolean checkForPossibleProductSites = false;
+    if(args.length == 3) {
+        if(!args[2].equals("--check-for-product-sites")) {
+        printUsage();
+        return;
+        } else {
+        checkForPossibleProductSites = true;
+        }
+    }
 …
         logger.error("Error: " + args[1] + " does not exist or is not a directory.");
         return;
+    }
+    }
     try {
 …
     File topSitesMatchedFile = new File(outFolder, "unprocessed-topsite-matches.txt");
     File possibleProductSitesFile = new File(outFolder, "possible-product-sites.txt");
-    ccWETFilesProcessor.createSeedURLsFiles(seedURLsFile, urlFilterFile, domainURLsFile, topSitesMatchedFile, possibleProductSitesFile);
+    ccWETFilesProcessor.prepareSitesForNutchCrawling(seedURLsFile, urlFilterFile, domainURLsFile, topSitesMatchedFile, checkForPossibleProductSites, possibleProductSitesFile);
     logger.info("\n*** Inspect urls in greylist at " + ccWETFilesProcessor.greyListedFile + "\n");
+    logger.info("\n*** Check " + topSitesMatchedFile + " for sites not prepared for crawling because they matched top sites for which no regex of allowed url patterns were specified in sites-too-big-to-exhaustively-crawl.txt.\n");
+    if(checkForPossibleProductSites) {
+        logger.info("\n*** Check " + topSitesMatchedFile + " for sites not prepared for crawling because they matched top sites for which no regex of allowed url patterns were specified in sites-too-big-to-exhaustively-crawl.txt.\n");
+    } else {
+        possibleProductSitesFile.delete();
+    }

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 33624

Legend:

gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java

Download in other formats: