Changeset 34000


Ignore:
Timestamp:
2020-03-09T18:55:01+13:00 (4 years ago)
Author:
ak19
Message:

Some debugging and other minor changes

Location:
other-projects/maori-lang-detection/src/org/greenstone/atea
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/src/org/greenstone/atea/AllDomainCount.java

    r33984 r34000  
    3030    private Set<String> uniqueURLs = new TreeSet<String>(); // stripped of protocol and www
    3131
     32    // just for debugging
     33    private Map<String, String> domainToInfile = new HashMap<String,String>();
     34   
    3235    private int lineCount = 0;
    3336    private final String[] filenames = {"discardURLs.txt", "greyListed.txt", "keepURLs.txt"};;
     
    6467        // don't care about real uniqueness of domains - Set just ensures that each
    6568        // domain is listed once. Keeps http(s) and www distinct
    66         String domainWithProtocol = Utility.getDomainForURL(url, true);     
     69        String domainWithProtocol = Utility.getDomainForURL(url, true);
    6770        domains.add(domainWithProtocol);
     71       
     72        // START DEBUGGING - to inspect why domains tend to be repeated
     73        // between (a) discardURLs and greylisted/keepURLs and
     74        // (b) greylisted and keepURLs.
     75        // Explanation: (a) some pages on greylisted or even accepted sites contain
     76        // less than the minimum content and so end up in the discarded URLs list;
     77        // (b) some domains contain URLs with a /product-... suffix, which are
     78        // greylisted, while the same domain can also contain URLs without a /product
     79        // pattern, which end up as part of keepURLs.
     80        if(!domainToInfile.containsKey(domainWithProtocol)) {
     81            domainToInfile.put(domainWithProtocol, infile.getPath());
     82        } else {
     83            String oldInfile = domainToInfile.get(domainWithProtocol);
     84            if(!oldInfile.equals(infile.getPath())) {
     85            System.err.println("\t*** " + domainWithProtocol + " is repeated between old " + oldInfile + " and new " + infile.getPath());
     86            }
     87        }
     88        // END DEBUGGING
    6889       
    6990        // unique domains - so stripped of http(s) and www
  • other-projects/maori-lang-detection/src/org/greenstone/atea/CountryCodeCountsMapData.java

    r33981 r34000  
    708708    // https://developer.mozilla.org/en-US/docs/Tools/Browser_Console
    709709    String webConsoleCommand = ":screenshot --selector \".map\" --file --filename \"" + outputFilePath + "\""; 
    710     System.out.println("The following command should be copied to your clipboard. Else copy it.");
     710    System.out.println("The following command should have been copied to your clipboard. Else copy it.");
    711711    System.out.println("Run it in the opened Firefox tab's Web Console command line (press F12 and undock web console first):\n");
    712712    System.out.println(webConsoleCommand);
  • other-projects/maori-lang-detection/src/org/greenstone/atea/Utility.java

    r33887 r34000  
    9393    url = stripProtocolFromURL(url);
    9494   
    95     if(url.startsWith("www.")) { // strip any "wwww." at start as well too
     95    if(url.startsWith("www.")) { // also strip any "www." at start
    9696        url = url.substring(4);
    9797    }
Note: See TracChangeset for help on using the changeset viewer.