Changeset 33518
- Timestamp: 2019-09-24T21:13:47+12:00 (5 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/CCWETProcessor.java
r33517 r33518 9 9 import java.util.Map; 10 10 import java.util.Set; 11 import java.util.TreeMap; 11 12 import java.util.TreeSet; 12 13 … … 92 93 93 94 /** Map of domains we keep and the full urls we're keeping that are of that domain. 94 * Choosing a TreeMap to preserve natural (alphabetical) ordering of keys, 95 * since a HashMap has no notion of ordering. 96 */ 97 private TreeMap<String, TreeSet<String>> domainsToURLsMap; 95 * No need to use a TreeMap which preserves natural (alphabetical) ordering of keys, 96 * while a HashMap has no notion of ordering, because we just need to store urls with 97 * their domains. Whether the domains are sorted or the urls per domain are sorted becomes 98 * irrelevant. (Does it really? What if we have urls followed vs preceded by urls with the 99 * same prefix, e.g. pinky.com/toto/index.html and pinky.com/toto/nono/file.html 100 * Is there any benefit to nutch when crawling if these seedURLs are ordered or not?) 101 */ 102 private Map<String, Set<String>> domainsToURLsMap; 98 103 99 104 // Keep a count of all the records that all WETProcessors instantiated … … 184 189 // So we get alphabetic sorting for free. And guaranteed log(n) for basic operations. 
185 190 186 Set<String> domainsSet = new TreeSet<String>(); 187 Set<String> urlsSet = new TreeSet<String>(); 188 191 //Set<String> domainsSet = new TreeSet<String>(); 192 //Set<String> urlsSet = new TreeSet<String>(); 193 domainsToURLsMap = new TreeMap<String, Set<String>>(); 194 189 195 final String FILTER_REGEX_PREFIX = "+https?://([a-z0-9-]+\\.)*"; // https?://([a-z0-9-]+\.)* 190 196 … … 206 212 domain = domain.substring(0, endIndex); 207 213 208 //if(!domainsMap.containsKey(domain)) { 209 urlsSet.add(url); 210 domainsSet.add(domain); 211 //} 214 //urlsSet.add(url); 215 //domainsSet.add(domain); 216 Set<String> urlsSet; 217 if(!domainsToURLsMap.containsKey(domain)) { 218 urlsSet = new TreeSet<String>(); 219 urlsSet.add(url); 220 domainsToURLsMap.put(domain, urlsSet); 221 } else { 222 urlsSet = domainsToURLsMap.get(domain); 223 urlsSet.add(url); 224 } 225 212 226 } 213 227 } catch (IOException ioe) { … … 215 229 System.err.println("\n@@@@@@@@@ Error reading in urls from file " + this.keepURLsFile); 216 230 } 217 231 232 /* 218 233 try (BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile))) { 219 234 Iterator<String> i = urlsSet.iterator(); … … 227 242 System.err.println("\n@@@@@@@@@ Error writing to " + seedURLsFile); 228 243 } 229 244 */ 245 246 // write out each domain followed in sequence by all urls we found in that domain 247 // (urls with tab up front) 248 try (BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile))) { 249 //Set<Map.Entry<String, Set<String>>> domainsSet = domainsToURLsMap.keySet(); 250 Set<String> domainsSet = domainsToURLsMap.keySet(); 251 Iterator<String> domainIterator = domainsSet.iterator(); 252 253 while(domainIterator.hasNext()) { 254 // write out the domain 255 String domain = domainIterator.next(); 256 seedURLsWriter.write(domain + "\n"); 257 258 // next write out the urls for the domain with a tab prefixed to each 259 Set<String> urlsForDomainSet = domainsToURLsMap.get(domain); 
260 Iterator<String> urlIterator = urlsForDomainSet.iterator(); 261 while(urlIterator.hasNext()) { 262 String url = urlIterator.next(); 263 seedURLsWriter.write("\t" + url + "\n"); 264 } 265 } 266 267 } catch (IOException ioe) { 268 ioe.printStackTrace(); 269 System.err.println("\n@@@@@@@@@ Error writing to " + urlFilterFile); 270 } 271 272 // write out domains as regular expressions into "regex-urlfilter.txt" file 230 273 try (BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile))) { 274 Set<String> domainsSet = domainsToURLsMap.keySet(); 231 275 Iterator<String> i = domainsSet.iterator(); 232 276 // nutch.apache.org => +^https?://([a-z0-9-]+\.)*nutch\.apache\.org/
Note: See TracChangeset for help on using the changeset viewer.