Changeset 34000
- Timestamp:
- 2020-03-09T18:55:01+13:00 (4 years ago)
- Location:
- other-projects/maori-lang-detection/src/org/greenstone/atea
- Files:
-
- 3 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/maori-lang-detection/src/org/greenstone/atea/AllDomainCount.java
r33984 -> r34000  (columns: old line, new line, marker, text)
 30  30     private Set<String> uniqueURLs = new TreeSet<String>(); // stripped of protocol and www
 31  31
     32  +  // just for debugging
     33  +  private Map<String, String> domainToInfile = new HashMap<String,String>();
     34  +
 32  35     private int lineCount = 0;
 33  36     private final String[] filenames = {"discardURLs.txt", "greyListed.txt", "keepURLs.txt"};;
  …   …
 64  67     // don't care about real uniqueness of domains - Set just ensures that each
 65  68     // domain is listed once. Keeps http(s) and www distinct
 66      -  String domainWithProtocol = Utility.getDomainForURL(url, true);
     69  +  String domainWithProtocol = Utility.getDomainForURL(url, true);
 67  70     domains.add(domainWithProtocol);
     71  +
     72  +  // START DEBUGGING - to inspect why domains tend to be repeated
     73  +  // between (a) discardURLs and greylisted/keepURLs and
     74  +  // (b) greylisted and keepURLs.
     75  +  // Because (a) some pages on greylisted or even accepted sites contain
     76  +  // less than minimum content and end up in discarded urls list.
     77  +  // And (b) some domains contain URLs with /product-... suffix
     78  +  // and are greylisted but the same domain can contain URLs without a /product
     79  +  // pattern and end up as part of keepURLs.
     80  +  if(!domainToInfile.containsKey(domainWithProtocol)) {
     81  +      domainToInfile.put(domainWithProtocol, infile.getPath());
     82  +  } else {
     83  +      String oldInfile = domainToInfile.get(domainWithProtocol);
     84  +      if(!oldInfile.equals(infile.getPath())) {
     85  +          System.err.println("\t*** " + domainWithProtocol + " is repeated between old " + oldInfile + " and new " + infile.getPath());
     86  +      }
     87  +  }
     88  +  // END DEBUGGING
 68  89
 69  90     // unique domains - so stripped of http(s) and www -
other-projects/maori-lang-detection/src/org/greenstone/atea/CountryCodeCountsMapData.java
r33981 -> r34000  (columns: old line, new line, marker, text)
708 708     // https://developer.mozilla.org/en-US/docs/Tools/Browser_Console
709 709     String webConsoleCommand = ":screenshot --selector \".map\" --file --filename \"" + outputFilePath + "\"";
710      -  System.out.println("The following command should becopied to your clipboard. Else copy it.");
    710  +  System.out.println("The following command should have been copied to your clipboard. Else copy it.");
711 711     System.out.println("Run it in the opened Firefox tab's Web Console command line (press F12 and undock web console first):\n");
712 712     System.out.println(webConsoleCommand);
other-projects/maori-lang-detection/src/org/greenstone/atea/Utility.java
r33887 -> r34000  (columns: old line, new line, marker, text)
 93  93     url = stripProtocolFromURL(url);
 94  94
 95      -  if(url.startsWith("www.")) { // strip any "wwww." at start as well too
     95  +  if(url.startsWith("www.")) { // also strip any "www." at start
 96  96         url = url.substring(4);
 97  97     }
Note: See TracChangeset for help on using the changeset viewer.