Ignore:
Timestamp:
2019-11-05T21:04:09+13:00 (4 years ago)
Author:
ak19
Message:
  1. Incorporated Dr Nichols' earlier suggestion of storing the page modified time and char-encoding metadata when they are present in the crawl dump output. This is now done, but neither the modifiedTime nor the fetchTime metadata of the dump file appears to be a webpage's actual modified time, as both are from 2019 and fall within the period we've been crawling. 2. Moved the getDomainForURL() function from CCWETProcessor.java to Utility.java since it is now being reused. 3. The MongoDBAccess class successfully connects (at least, no exceptions are thrown) and uses the newly added properties in config.properties to make the connection.
File:
1 edited

Legend:

Unmodified
Added
Removed
  • gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java

    r33615 r33623  
    237237    }
    238238
    239     /** Work out the 'domain' for a given url.
    240      * This retains any www. or subdomain prefix.
    241      */
    242     public static String getDomainForURL(String url, boolean withProtocol) {
    243     int startIndex = startIndex = url.indexOf("//"); // for http:// or https:// prefix
    244     startIndex = (startIndex == -1) ? 0 : (startIndex+2); // skip past the protocol's // portion
    245     // the keep the URL around in case param withProtocol=true
    246     String protocol = (startIndex == -1) ? "" : url.substring(0, startIndex);
    247    
    248     String domain = url.substring(startIndex);
    249     int endIndex = domain.indexOf("/");
    250     if(endIndex == -1) endIndex = domain.length();
    251     domain = domain.substring(0, endIndex);
    252 
    253     if(withProtocol) {
    254         // now that we have the domain (everything to the first / when there is no protocol)
    255         // can glue the protocol back on
    256         domain = protocol + domain;
    257     }
    258    
    259     return domain;
    260     }
    261239
    262240    /** Utility function to help escape regex characters in URL to go into regex-urlfilter.txt */
     
    304282        // work out domain. This retains any www. or subdomain prefix
    305283        // passing true to further also retain the http(s) protocol
    306         domainWithProtocol = getDomainForURL(url, true);
     284        domainWithProtocol = Utility.getDomainForURL(url, true);
    307285
    308286        Set<String> urlsSet;
     
    316294        }
    317295
     296        /*
    318297        // Dr Nichols said that a url that was located outside the country and
    319298        // which had /mi/ URLs was more likely to be an autotranslated (product) site.
     
    322301        // then add that domain (if not already added) and that url into a file
    323302        // for later manual inspection
    324         if(!domainWithProtocol.endsWith(".nz") && (url.contains("/mi/") || url.endsWith("/mi"))) {
    325             /*
     303        if(!domainWithProtocol.endsWith(".nz")
     304           && (url.contains("/mi/") || url.endsWith("/mi"))) {
     305           
    326306            if(!possibleProductDomains.contains(domainWithProtocol)) {
    327307
     
    345325            if(!isInNZ) {
    346326                possibleProductDomains.add(domainWithProtocol);
    347                 // write both domain and a sample URL on that site out to file
     327                // write both domain and a sample seedURL on that site out to file
    348328                possibleProductSitesWriter.write(countryCode + " : " + domainWithProtocol + "\n");             
    349329                possibleProductSitesWriter.write("\t" + url + "\n");
    350330            }
    351             }*/ /*else {
    352             // already wrote out domain to file at some point, write just the URL out to file
    353             possibleProductSitesWriter.write("\t" + url + "\n");
    354             }*/
    355         }
     331            }
     332            //else {
     333            // already wrote out domain to file at some point, write just the URL out to file
     334            //possibleProductSitesWriter.write("\t" + url + "\n");
     335            //}
     336        }
     337        */
    356338        }
    357339    } catch (IOException ioe) {
     
    686668    // if any portion of the URL contains the word "livejasmin", or even "jasmin" actually,
    687669    // then it's an adult site, so blacklist the entire domain if it wasn't already blacklisted
    688     String domainWithoutProtocol = getDomainForURL(url, false); // remove protocol
     670    String domainWithoutProtocol = Utility.getDomainForURL(url, false); // remove protocol
    689671    if(!isBlackListed && url.contains("jasmin")) {
    690672        logger.warn("### Blacklisting additional domain (likely an adult site): " + domainWithoutProtocol);
     
    821803    public static void printUsage() {
    822804    System.err.println("Run this program as:");
    823     System.err.println("\tCCWetProcessor <path to 'ccrawl-data' folder> <output folder path>");
     805    System.err.println("\tCCWetProcessor <path to 'ccrawl-data' input folder> <output folder path>");   
    824806    }
    825807
Note: See TracChangeset for help on using the changeset viewer.