- Timestamp: 2020-03-09T18:55:01+13:00 (4 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/maori-lang-detection/src/org/greenstone/atea/AllDomainCount.java
r33984 r34000 30 30 private Set<String> uniqueURLs = new TreeSet<String>(); // stripped of protocol and www 31 31 32 // just for debugging 33 private Map<String, String> domainToInfile = new HashMap<String,String>(); 34 32 35 private int lineCount = 0; 33 36 private final String[] filenames = {"discardURLs.txt", "greyListed.txt", "keepURLs.txt"};; … … 64 67 // don't care about real uniqueness of domains - Set just ensures that each 65 68 // domain is listed once. Keeps http(s) and www distinct 66 String domainWithProtocol = Utility.getDomainForURL(url, true); 69 String domainWithProtocol = Utility.getDomainForURL(url, true); 67 70 domains.add(domainWithProtocol); 71 72 // START DEBUGGING - to inspect why domains tend to be repeated 73 // between (a) discardURLs and greylisted/keepURLs and 74 // (b) greylisted and keepURLs. 75 // Because (a) some pages on greylisted or even accepted sites contain 76 // less than minimum content and end up in discarded urls list. 77 // And (b) some domains contain URLs with /product-... suffix 78 // and are greylisted but the same domain can contain URLs without a /product 79 // pattern and end up as part of keepURLs. 80 if(!domainToInfile.containsKey(domainWithProtocol)) { 81 domainToInfile.put(domainWithProtocol, infile.getPath()); 82 } else { 83 String oldInfile = domainToInfile.get(domainWithProtocol); 84 if(!oldInfile.equals(infile.getPath())) { 85 System.err.println("\t*** " + domainWithProtocol + " is repeated between old " + oldInfile + " and new " + infile.getPath()); 86 } 87 } 88 // END DEBUGGING 68 89 69 90 // unique domains - so stripped of http(s) and www
Note:
See TracChangeset
for help on using the changeset viewer.