Changeset 34000

Show
Ignore:
Timestamp:
09.03.2020 18:55:01 (3 weeks ago)
Author:
ak19
Message:

Some debugging and other minor changes

Location:
other-projects/maori-lang-detection/src/org/greenstone/atea
Files:
3 modified

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/src/org/greenstone/atea/AllDomainCount.java

    r33984 r34000  
    30 30    private Set<String> uniqueURLs = new TreeSet<String>(); // stripped of protocol and www 
    31 31 
     32    // just for debugging 
     33    private Map<String, String> domainToInfile = new HashMap<String,String>(); 
     34     
    32 35    private int lineCount = 0; 
    33 36    private final String[] filenames = {"discardURLs.txt", "greyListed.txt", "keepURLs.txt"};; 
     
    64 67        // don't care about real uniqueness of domains - Set just ensures that each 
    65 68        // domain is listed once. Keeps http(s) and www distinct 
    66         String domainWithProtocol = Utility.getDomainForURL(url, true);      
     69        String domainWithProtocol = Utility.getDomainForURL(url, true); 
    67 70        domains.add(domainWithProtocol); 
     71         
     72        // START DEBUGGING - to inspect why domains tend to be repeated 
     73        // between (a) discardURLs and greylisted/keepURLs and 
     74        // (b) greylisted and keepURLs. 
     75        // Because (a) some pages on greylisted or even accepted sites contain 
     76        // less than minimum content and end up in discarded urls list. 
     77        // And (b) some domains contain URLs with /product-... suffix 
     78        // and are greylisted but the same domain can contain URLs without a /product 
     79        // pattern and end up as part of keepURLs. 
     80        if(!domainToInfile.containsKey(domainWithProtocol)) { 
     81            domainToInfile.put(domainWithProtocol, infile.getPath()); 
     82        } else { 
     83            String oldInfile = domainToInfile.get(domainWithProtocol); 
     84            if(!oldInfile.equals(infile.getPath())) { 
     85            System.err.println("\t*** " + domainWithProtocol + " is repeated between old " + oldInfile + " and new " + infile.getPath()); 
     86            } 
     87        } 
     88        // END DEBUGGING 
    68 89         
    69 90        // unique domains - so stripped of http(s) and www 
  • other-projects/maori-lang-detection/src/org/greenstone/atea/CountryCodeCountsMapData.java

    r33981 r34000  
    708 708    // https://developer.mozilla.org/en-US/docs/Tools/Browser_Console 
    709 709    String webConsoleCommand = ":screenshot --selector \".map\" --file --filename \"" + outputFilePath + "\"";   
    710     System.out.println("The following command should be copied to your clipboard. Else copy it."); 
     710    System.out.println("The following command should have been copied to your clipboard. Else copy it."); 
    711 711    System.out.println("Run it in the opened Firefox tab's Web Console command line (press F12 and undock web console first):\n"); 
    712 712    System.out.println(webConsoleCommand); 
  • other-projects/maori-lang-detection/src/org/greenstone/atea/Utility.java

    r33887 r34000  
    93 93    url = stripProtocolFromURL(url); 
    94 94     
    95     if(url.startsWith("www.")) { // strip any "wwww." at start as well too 
     95    if(url.startsWith("www.")) { // also strip any "www." at start 
    96 96        url = url.substring(4); 
    97 97    }