Changeset 33488 for gs3-extensions
- Timestamp: 2019-09-17T14:48:36+12:00
- Location: gs3-extensions/maori-lang-detection
- Files: 2 edited
gs3-extensions/maori-lang-detection/bin/script/unique_mri_domains_from_cc.sh
r33471 r33488 69 69 70 70 71 done < <(cat sorted_MRI_urls.txt) 71 done < <(cat sorted_MRI_urls.txt) 72 72 73 73 echo "**************************************" -
gs3-extensions/maori-lang-detection/src/org/greenstone/atea/WETProcessor.java
r33480 → r33488

Added imports (alongside the existing java.util.Properties and java.util.zip.GZIPInputStream imports):

    import java.util.Iterator;
    import java.util.Set;
    import java.util.TreeSet;

Added method:

    /**
     * Takes as input the keepURLs.txt file generated by running WETProcessor instances.
     * As output, produces the URL seed list and regex-urlfilter text files required by nutch,
     * https://cwiki.apache.org/confluence/display/nutch/NutchTutorial
     */
    public static void createSeedURLsFiles(File urlsFile, File seedURLsFile, File urlFilterFile) {
        // Maintain Sets of unique domains and urls.
        // TreeSet: by default, "the elements are ordered using their natural ordering"
        // (or by a Comparator provided at set creation time), whereas HashSet doesn't
        // guarantee ordering. So we get alphabetic sorting for free, and guaranteed
        // log(n) time for the basic operations.
        Set<String> domainsSet = new TreeSet<String>();
        Set<String> urlsSet = new TreeSet<String>();

        final String FILTER_REGEX_PREFIX = "+https?://([a-z0-9-]+\\.)*";

        try (
            BufferedReader reader = new BufferedReader(new FileReader(urlsFile));
        ) {
            // Read a URL at a time from urlsFile.
            String url = null;
            String domain = null;
            while ((url = reader.readLine()) != null) { // readLine removes the newline separator

                // Work out the domain. This retains any www. or subdomain prefix:
                int startIndex = url.indexOf("//"); // end of any http:// or https:// prefix
                startIndex = (startIndex == -1) ? 0 : (startIndex + 2); // skip past the protocol's // portion
                domain = url.substring(startIndex);
                int endIndex = domain.indexOf("/");
                if (endIndex == -1) endIndex = domain.length();
                domain = domain.substring(0, endIndex);

                urlsSet.add(url);
                domainsSet.add(domain);
            }
        } catch (IOException ioe) {
            ioe.printStackTrace();
            System.err.println("\n@@@@@@@@@ Error reading in urls from file " + urlsFile);
        }

        try (BufferedWriter seedURLsWriter = new BufferedWriter(new FileWriter(seedURLsFile))) {
            Iterator<String> i = urlsSet.iterator();
            while (i.hasNext()) {
                String url = i.next();
                seedURLsWriter.write(url + "\n");
            }
        } catch (IOException ioe) {
            ioe.printStackTrace();
            System.err.println("\n@@@@@@@@@ Error writing to " + seedURLsFile);
        }

        try (BufferedWriter urlFilterWriter = new BufferedWriter(new FileWriter(urlFilterFile))) {
            Iterator<String> i = domainsSet.iterator();
            // e.g. nutch.apache.org => +https?://([a-z0-9-]+\.)*nutch\.apache\.org/
            while (i.hasNext()) {
                String domain = i.next();
                domain = FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/";
                urlFilterWriter.write(domain + "\n");
            }
        } catch (IOException ioe) {
            ioe.printStackTrace();
            System.err.println("\n@@@@@@@@@ Error writing to " + urlFilterFile);
        }
    }

Added at the call site, immediately before the existing return:

    File seedURLsFile = new File(outFolder, "seedURLs.txt");
    File urlFilterFile = new File(outFolder, "regex-urlfilter.txt");
    WETProcessor.createSeedURLsFiles(WETProcessor.keepURLsFile, seedURLsFile, urlFilterFile);
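To make the transformation concrete, below is a minimal standalone sketch, not part of the changeset, that runs the same domain-extraction and filter-line logic as createSeedURLsFiles over a few URLs and prints the resulting regex-urlfilter.txt lines. The class name and sample URLs are hypothetical, chosen only for illustration.

    import java.util.Set;
    import java.util.TreeSet;

    // Hypothetical demo class; not in the repository.
    public class SeedURLsSketch {
        public static void main(String[] args) {
            // Hypothetical keepURLs.txt contents:
            String[] keepURLs = {
                "https://nutch.apache.org/downloads.html",
                "http://www.example.org/page.html",
                "example.org/index.html" // a URL without a protocol prefix
            };

            final String FILTER_REGEX_PREFIX = "+https?://([a-z0-9-]+\\.)*";
            Set<String> domainsSet = new TreeSet<String>();

            for (String url : keepURLs) {
                // Same domain extraction as in WETProcessor.createSeedURLsFiles:
                int startIndex = url.indexOf("//");
                startIndex = (startIndex == -1) ? 0 : (startIndex + 2);
                String domain = url.substring(startIndex);
                int endIndex = domain.indexOf("/");
                if (endIndex == -1) endIndex = domain.length();
                domainsSet.add(domain.substring(0, endIndex));
            }

            // Prints one filter line per unique domain, alphabetically, e.g.
            //   +https?://([a-z0-9-]+\.)*nutch\.apache\.org/
            for (String domain : domainsSet) {
                System.out.println(FILTER_REGEX_PREFIX + domain.replace(".", "\\.") + "/");
            }
        }
    }

Per the Nutch tutorial linked in the Javadoc, seedURLs.txt serves as the seed list Nutch injects into its crawldb, while each regex-urlfilter.txt line confines the crawl to one of the collected domains; the ([a-z0-9-]+\.)* prefix also admits that domain's subdomains.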