Changeset 33882 for other-projects/maori-lang-detection/src/org/greenstone/atea/WebPageURLsListing.java
- Timestamp:
- 2020-01-30T22:54:39+13:00 (4 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/maori-lang-detection/src/org/greenstone/atea/WebPageURLsListing.java
r33880 r33882 25 25 26 26 private final MongoDBAccess mongodbAccess; 27 private int numURLs; 28 private File domainsFile; 29 30 31 32 public WebPageURLsListing(MongoDBAccess mongodbAccess, 33 File domainsFile) 27 private File outFolder; 28 29 public WebPageURLsListing(MongoDBAccess mongodbAccess, File outFolder) 34 30 { 35 31 this.mongodbAccess = mongodbAccess; 36 this.domainsFile = domainsFile; 37 } 38 39 public WebPageURLsListing(MongoDBAccess mongodbAccess, 40 File domainsFile, 41 int numURLs) 42 { 43 this(mongodbAccess, domainsFile); 44 this.numURLs = numURLs; 45 } 46 47 public String produceURLsForPagesInMRI() { 48 return writeFile(MongoDBAccess.IS_MRI); 49 } 50 51 public String produceURLsForPagesContainingMRI() { 52 return writeFile(MongoDBAccess.CONTAINS_MRI); 53 } 54 55 56 public String writeFile(int filterType) { 32 this.outFolder = outFolder; 33 } 34 35 public String produceURLsForPagesInMRI(File domainsFile) { 36 return writeFile(MongoDBAccess.IS_MRI, domainsFile); 37 } 38 39 public String produceURLsForPagesContainingMRI(File domainsFile) { 40 return writeFile(MongoDBAccess.CONTAINS_MRI, domainsFile); 41 } 42 43 44 public String writeFile(int filterType, File domainsFile/*, int numURLs*/) { 57 45 58 46 ArrayList<String> urlsList = new ArrayList<String>(); … … 126 114 127 115 /** 128 * Create the file 116 * Create the file 5counts_tentativeNonAutotranslatedSites.json 117 * that contains the count and domains for NZ sites (NZ origin or nz TLD) that CONTAIN_MRI 118 * followed by counts and domain listing for overseas sites that are either from Australia 119 * or don't contain mi in their URL path. 129 120 * @return full path of file generated 130 121 */ 131 122 public String writeTentativeNonAutotranslatedSites() { 132 File outFolder = new File("../mongodb-data/").getAbsoluteFile();133 File outFile = new File(outFolder, "5 counts_tentativeNonAutotranslatedSites.json");123 124 File outFile = new File(outFolder, "5a_counts_tentativeNonAutotranslatedSites.json"); 134 125 135 126 String filename = outFile.getAbsolutePath(); … … 139 130 ) { 140 131 // first write out NZ sites and .nz TLD count and domains 141 mongodbAccess.aggregateContainsMRIForNZ(writer); 142 // next write out all overseas sites and .nz TLD count and domains 143 mongodbAccess.aggregateContainsMRIForOverseas(writer); 132 mongodbAccess.aggregateContainsMRIForNZ(writer, MongoDBAccess.CONTAINS_MRI); 133 // next write out all overseas sites (not NZ origin or .nz TLD) 134 // that have no "mi" in the URL path as mi.* or */mi 135 boolean isMiInURLPath = false; 136 mongodbAccess.aggregateContainsMRIForOverseas(writer, MongoDBAccess.CONTAINS_MRI, isMiInURLPath); 144 137 145 138 filename = outFile.getCanonicalPath(); … … 148 141 logger.error(e.getMessage(), e); 149 142 } 143 144 System.err.println("*** Wrote file: " + filename); 150 145 151 146 return filename; 152 147 } 153 148 149 /** 150 * Listing of the remainder of overseas sites that CONTAIN_MRI not included by 151 * writeTentativeNonAutotranslatedSites(): those that have mi in their URL path 152 */ 153 public String writeOverseasSitesWithMiInURLPath() { 154 File outFile = new File(outFolder, "5b_counts_overseasSitesWithMiInPath.json"); 155 156 String filename = outFile.getAbsolutePath(); 157 try ( 158 Writer writer = new BufferedWriter(new FileWriter(outFile)); 159 ) { 160 boolean isMiInURLPath = true; 161 mongodbAccess.aggregateContainsMRIForOverseas(writer, MongoDBAccess.CONTAINS_MRI, isMiInURLPath); 162 filename = outFile.getCanonicalPath(); 163 } catch(Exception e) { 164 logger.error("Unable to write to file " + outFile.getAbsolutePath()); 165 logger.error(e.getMessage(), e); 166 } 167 168 System.err.println("*** Wrote file: " + filename); 169 return filename; 170 } 154 171 155 172 public static void printUsage() { … … 170 187 MongoDBAccess mongodb = new MongoDBAccess(); 171 188 ) { 172 File domainsFile = new File(args[0]);173 if(!domainsFile.exists()) {174 System.err.println("File " + domainsFile + " does not exist");175 System.exit(-1);176 }177 178 //int genNumURLs = Integer.parseInt(args[1]);179 189 180 190 mongodb.connectToDB(); 181 182 WebPageURLsListing listing = new WebPageURLsListing(mongodb, domainsFile); 183 //String isMRIFile = listing.produceURLsForPagesInMRI(); 184 //String containsMRIFile = listing.produceURLsForPagesContainingMRI(); 185 String filename = listing.writeTentativeNonAutotranslatedSites(); 186 System.err.println("Check file: " + filename); 191 192 // output files will be stored in mongodb-data-auto 193 File outFolder = new File("../mongodb-data-auto/").getAbsoluteFile(); 194 WebPageURLsListing listing = new WebPageURLsListing(mongodb, outFolder); 195 196 197 if(args.length >= 1) { 198 File domainsFile = new File(args[0]); 199 if(!domainsFile.exists()) { 200 System.err.println("File " + domainsFile + " does not exist"); 201 System.exit(-1); 202 } 203 204 205 //int genNumURLs = Integer.parseInt(args[1]); 206 //String isMRIFile = listing.produceURLsForPagesInMRI(domainsFile); 207 //String containsMRIFile = listing.produceURLsForPagesContainingMRI(domainsFile); 208 209 //listing.writeWebPagesOfAllNZSitesAndDomainListing(); 210 211 } else { 212 213 String filename = listing.writeTentativeNonAutotranslatedSites(); 214 filename = listing.writeOverseasSitesWithMiInURLPath(); 215 } 187 216 188 217
Note:
See TracChangeset
for help on using the changeset viewer.