Changeset 33876 for other-projects/maori-lang-detection/src/org/greenstone/atea/WebPageURLsListing.java
- Timestamp: 2020-01-29T21:48:52+13:00 (4 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/maori-lang-detection/src/org/greenstone/atea/WebPageURLsListing.java
r33873 r33876 18 18 * TO RUN: 19 19 * maori-lang-detection/src$ 20 * java -cp ".:../conf:../lib/*" org/greenstone/atea/WebPageURLsListing ../mongodb-data/domainsNZ_IsMRI.txt 20 * java -cp ".:../conf:../lib/*" org/greenstone/atea/WebPageURLsListing ../mongodb-data/domainsNZ_IsMRI.txt 255 21 21 * 22 22 */ … … 25 25 26 26 private final MongoDBAccess mongodbAccess; 27 private final int numURLs; 28 29 //private String[] urls; 30 ArrayList<String> urlsList = new ArrayList<String>(); 27 private int numURLs; 28 private File domainsFile; 29 30 public WebPageURLsListing(MongoDBAccess mongodbAccess, 31 File domainsFile) 32 { 33 this.mongodbAccess = mongodbAccess; 34 this.domainsFile = domainsFile; 35 } 31 36 32 37 public WebPageURLsListing(MongoDBAccess mongodbAccess, … … 34 39 int numURLs) 35 40 { 36 this .mongodbAccess = mongodbAccess;41 this(mongodbAccess, domainsFile); 37 42 this.numURLs = numURLs; 43 } 38 44 45 public String produceURLsForPagesInMRI() { 46 return writeFile(MongoDBAccess.IS_MRI); 47 } 48 49 public String produceURLsForPagesContainingMRI() { 50 return writeFile(MongoDBAccess.CONTAINS_MRI); 51 } 52 53 54 public String writeFile(int filterType) { 55 56 ArrayList<String> urlsList = new ArrayList<String>(); 57 39 58 // 1. read each url from the domainsFile 40 41 42 59 // 1a. do the query 60 // 1b. 
add the arraylist result to urls 61 43 62 try ( 44 63 BufferedReader reader = new BufferedReader(new FileReader(domainsFile)); … … 51 70 domain = domain.trim(); 52 71 if(!domain.equals("")) { 53 ArrayList<String> moreURLs = mongodbAccess.queryAllMatching IsMRIURLs(domain);72 ArrayList<String> moreURLs = mongodbAccess.queryAllMatchingURLsFilteredBy(domain, filterType); 54 73 urlsList.addAll(moreURLs); 55 74 } … … 63 82 // https://stackoverflow.com/questions/5505927/how-to-generate-a-random-permutation-in-java 64 83 File parentFolder = domainsFile.getParentFile(); 65 String fileName = domainsFile.getName();66 //File outFile = new File(parentFolder, "random"+numURLs+"_"+fileName);67 File fullSetOutFile = new File(parentFolder, "allPages_"+fileName);84 //File outFile = new File(parentFolder, "random"+numURLs+"_"+domainsFile.getName()); 85 String fileName = (filterType == MongoDBAccess.IS_MRI) ? "isMRI_" : "containsMRI_"; 86 File outFile = new File(parentFolder, fileName+domainsFile.getName()); 68 87 69 88 // write out ALL the URLs 70 89 try ( 71 Writer writer = new BufferedWriter(new FileWriter( fullSetOutFile));90 Writer writer = new BufferedWriter(new FileWriter(outFile)); 72 91 ) { 73 92 … … 78 97 } 79 98 } catch(Exception e) { 80 logger.error("Unable to write to file " + fullSetOutFile.getAbsolutePath());99 logger.error("Unable to write to file " + outFile.getAbsolutePath()); 81 100 logger.error(e.getMessage(), e); 82 101 } … … 98 117 } 99 118 */ 119 120 return outFile.getAbsolutePath(); 100 121 } 122 101 123 102 124 public static void printUsage() { … … 107 129 // 1. 
UNFILTERED: all (NZ + overseas) AND takes manually curated domain list file for overseas and adds all NZ 108 130 109 110 131 111 132 public static void main(String args[]) { … … 124 145 } 125 146 126 int genNumURLs = Integer.parseInt(args[1]);147 //int genNumURLs = Integer.parseInt(args[1]); 127 148 128 149 mongodb.connectToDB(); 129 150 130 151 WebPageURLsListing listing = new WebPageURLsListing(mongodb, domainsFile); 152 //String isMRIFile = listing.produceURLsForPagesInMRI(); 153 //String containsMRIFile = listing.produceURLsForPagesContainingMRI(); 154 mongodb.aggregateContainsMRIForOverseas(); 131 155 132 156 } catch(Exception e) {
Note: See TracChangeset for help on using the changeset viewer.