Changeset 33884 for other-projects/maori-lang-detection/src/org/greenstone/atea/WebPageURLsListing.java
- Timestamp: 2020-01-31T22:21:40+13:00 (4 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/maori-lang-detection/src/org/greenstone/atea/WebPageURLsListing.java
r33883 r33884 30 30 private File outFolder; 31 31 32 33 34 public static class Tuple { 35 public final String url; 36 public final String countryCode; 37 38 public Tuple(String url, String countryCode) { 39 this.url = url; 40 this.countryCode = countryCode; 41 } 42 43 public String toString() { 44 return this.url + "," + countryCode; 45 } 46 } 47 32 48 33 49 public WebPageURLsListing(MongoDBAccess mongodbAccess, File outFolder) … … 46 62 47 63 public void produceURLsForPagesInMRI(File domainsFile) { 48 ArrayList< String> urlsList = getURLsForWebPages(MongoDBAccess.IS_MRI, domainsFile);64 ArrayList<Tuple> urlsList = getURLsForWebPages(MongoDBAccess.IS_MRI, domainsFile); 49 65 File outFile = new File(outFolder, "isMRI_"+domainsFile.getName()); 50 66 writeURLsToFile(urlsList, outFile, urlsList.size()); … … 55 71 56 72 public void produceURLsForPagesContainingMRI(File domainsFile) { 57 ArrayList< String> urlsList = getURLsForWebPages(MongoDBAccess.CONTAINS_MRI, domainsFile);73 ArrayList<Tuple> urlsList = getURLsForWebPages(MongoDBAccess.CONTAINS_MRI, domainsFile); 58 74 File outFile = new File(outFolder, "containsMRI_"+domainsFile.getName()); 59 75 writeURLsToFile(urlsList, outFile, urlsList.size()); … … 63 79 } 64 80 65 private ArrayList< String> getURLsForWebPages(int filterType, File domainsFile) {66 ArrayList< String> urlsList = new ArrayList<String>();81 private ArrayList<Tuple> getURLsForWebPages(int filterType, File domainsFile) { 82 ArrayList<Tuple> urlsList = new ArrayList<Tuple>(); 67 83 68 84 // 1. 
read each url from the domainsFile … … 79 95 domain = domain.trim(); 80 96 if(!domain.equals("")) { 97 98 String countryCode = ""; 99 int index = domain.lastIndexOf(","); 100 if(index != -1) { 101 countryCode = domain.substring(index+1).trim(); 102 domain = domain.substring(0, index); 103 } 81 104 ArrayList<String> moreURLs = mongodbAccess.queryAllMatchingURLsFilteredBy(domain, filterType); 82 urlsList.addAll(moreURLs); 105 106 // Print out whether there were no isMRI pages for the domain (only containsMRI). A useful thing to know 83 107 if(moreURLs.size() == 0 && filterType == MongoDBAccess.IS_MRI) { 84 System.out.println(" Domain " + domain + " had no isMRI webpages (only containsMRI).");108 System.out.println(" " + countryCode + " domain " + domain + " had no isMRI webpages - only containsMRI."); 85 109 } 110 111 //urlsList.addAll(moreURLs); 112 for(int i = 0; i < moreURLs.size(); i++) { 113 urlsList.add(new Tuple(moreURLs.get(i), countryCode)); 114 } 115 86 116 } 87 117 } 118 System.err.println(""); 88 119 } catch(Exception e) { 89 120 logger.error("Unable to read URLs from file " + getFilePath(domainsFile)); … … 112 143 113 144 // 0. get a list of all the web pages in the given domain listing where isMRI = true 114 ArrayList< String> urlsList = getURLsForWebPages(MongoDBAccess.IS_MRI, domainsFile);145 ArrayList<Tuple> urlsList = getURLsForWebPages(MongoDBAccess.IS_MRI, domainsFile); 115 146 // produceURLsForPagesInMRI(domainsFile); 116 147 … … 174 205 * Writes out the first n URLs in urlsList into outFile. 
175 206 */ 176 private void writeURLsToFile(ArrayList< String> urlsList, File outFile, final int n) {207 private void writeURLsToFile(ArrayList<Tuple> urlsList, File outFile, final int n) { 177 208 try ( 178 209 Writer writer = new BufferedWriter(new FileWriter(outFile)); … … 180 211 181 212 for (int i=0; i < n; i++) { 182 String url = urlsList.get(i); 213 Tuple urlInfo = urlsList.get(i); 214 183 215 //System.out.println(list.get(i)); 184 writer.write(url + "\n");216 writer.write(urlInfo + "\n"); // calls toString() on tuple of url -> countryCode 185 217 } 186 218 } catch(Exception e) { … … 281 313 WebPageURLsListing listing = new WebPageURLsListing(mongodb, outFolder); 282 314 315 System.out.println("*************************************"); 316 283 317 284 318 if(args.length >= 1) { … … 311 345 // TODO: generate the tables 312 346 } 313 347 348 System.out.println("*************************************"); 314 349 } catch(Exception e) { 315 350 logger.error(e.getMessage(), e);
Note: See TracChangeset for help on using the changeset viewer.