source: other-projects/maori-lang-detection/src/org/greenstone/atea/WebPageURLsListing.java@ 33882

Last change on this file since 33882 was 33882, checked in by ak19, 4 years ago

Code now writes both a listing of all non-autotranslated websites and a listing of overseas autotranslated sites.

File size: 6.8 KB
Line 
1package org.greenstone.atea;
2
3import java.util.*;
4import java.io.*;
5
6import org.apache.log4j.Logger;
7
8/**
9* TO COMPILE OR RUN, FIRST DO:
10 * cd maori-lang-detection/apache-opennlp-1.9.1
11 * export OPENNLP_HOME=`pwd`
12 * cd maori-lang-detection/src
13 *
14 * TO COMPILE:
15 * maori-lang-detection/src$
16 * javac -cp ".:../conf:../lib/*" org/greenstone/atea/WebPageURLsListing.java
17 *
18 * TO RUN:
19 * maori-lang-detection/src$
20 * java -cp ".:../conf:../lib/*" org/greenstone/atea/WebPageURLsListing ../mongodb-data/domainsNZ_IsMRI.txt 255
21 *
22*/
23public class WebPageURLsListing {
24 static Logger logger = Logger.getLogger(org.greenstone.atea.WebPageURLsListing.class.getName());
25
26 private final MongoDBAccess mongodbAccess;
27 private File outFolder;
28
29 public WebPageURLsListing(MongoDBAccess mongodbAccess, File outFolder)
30 {
31 this.mongodbAccess = mongodbAccess;
32 this.outFolder = outFolder;
33 }
34
35 public String produceURLsForPagesInMRI(File domainsFile) {
36 return writeFile(MongoDBAccess.IS_MRI, domainsFile);
37 }
38
39 public String produceURLsForPagesContainingMRI(File domainsFile) {
40 return writeFile(MongoDBAccess.CONTAINS_MRI, domainsFile);
41 }
42
43
44 public String writeFile(int filterType, File domainsFile/*, int numURLs*/) {
45
46 ArrayList<String> urlsList = new ArrayList<String>();
47
48 // 1. read each url from the domainsFile
49 // 1a. do the query
50 // 1b. add the arraylist result to urls
51
52 try (
53 BufferedReader reader = new BufferedReader(new FileReader(domainsFile));
54 ) {
55
56
57 String domain;
58
59 while((domain = reader.readLine()) != null) {
60 domain = domain.trim();
61 if(!domain.equals("")) {
62 ArrayList<String> moreURLs = mongodbAccess.queryAllMatchingURLsFilteredBy(domain, filterType);
63 urlsList.addAll(moreURLs);
64 }
65 }
66 } catch(Exception e) {
67 logger.error("Unable to read URLs from file " + domainsFile.getAbsolutePath());
68 logger.error(e.getMessage(), e);
69 }
70
71 // Shuffle the urlsList, then write out the first numURLs into a file.
72 // https://stackoverflow.com/questions/5505927/how-to-generate-a-random-permutation-in-java
73 File parentFolder = domainsFile.getParentFile();
74 //File outFile = new File(parentFolder, "random"+numURLs+"_"+domainsFile.getName());
75 String fileName = (filterType == MongoDBAccess.IS_MRI) ? "isMRI_" : "containsMRI_";
76 File outFile = new File(parentFolder, fileName+domainsFile.getName());
77
78 // write out ALL the URLs
79 try (
80 Writer writer = new BufferedWriter(new FileWriter(outFile));
81 ) {
82
83 for (int i=0; i < urlsList.size(); i++) {
84 String url = urlsList.get(i);
85 //System.out.println(list.get(i));
86 writer.write(url + "\n");
87 }
88 } catch(Exception e) {
89 logger.error("Unable to write to file " + outFile.getAbsolutePath());
90 logger.error(e.getMessage(), e);
91 }
92
93 /*
94 // shuffle list and take the first n - write to file
95 try (
96 Writer writer = new BufferedWriter(new FileWriter(outFile));
97 ) {
98 Collections.shuffle(urlsList);
99 for (int i=0; i<numURLs; i++) {
100 String url = urlsList.get(i);
101 //System.out.println(list.get(i));
102 writer.write(url + "\n");
103 }
104 } catch(Exception e) {
105 logger.error("Unable to write to file " + outFile.getAbsolutePath());
106 logger.error(e.getMessage(), e);
107 }
108 */
109
110 return outFile.getAbsolutePath();
111 }
112
113 /* ---------------------------------------- */
114
115 /**
116 * Create the file 5counts_tentativeNonAutotranslatedSites.json
117 * that contains the count and domains for NZ sites (NZ origin or nz TLD) that CONTAIN_MRI
118 * followed by counts and domain listing for overseas sites that are either from Australia
119 * or don't contain mi in their URL path.
120 * @return full path of file generated
121 */
122 public String writeTentativeNonAutotranslatedSites() {
123
124 File outFile = new File(outFolder, "5a_counts_tentativeNonAutotranslatedSites.json");
125
126 String filename = outFile.getAbsolutePath();
127
128 try (
129 Writer writer = new BufferedWriter(new FileWriter(outFile));
130 ) {
131 // first write out NZ sites and .nz TLD count and domains
132 mongodbAccess.aggregateContainsMRIForNZ(writer, MongoDBAccess.CONTAINS_MRI);
133 // next write out all overseas sites (not NZ origin or .nz TLD)
134 // that have no "mi" in the URL path as mi.* or */mi
135 boolean isMiInURLPath = false;
136 mongodbAccess.aggregateContainsMRIForOverseas(writer, MongoDBAccess.CONTAINS_MRI, isMiInURLPath);
137
138 filename = outFile.getCanonicalPath();
139 } catch(Exception e) {
140 logger.error("Unable to write to file " + outFile.getAbsolutePath());
141 logger.error(e.getMessage(), e);
142 }
143
144 System.err.println("*** Wrote file: " + filename);
145
146 return filename;
147 }
148
149 /**
150 * Listing of the remainder of overseas sites that CONTAIN_MRI not included by
151 * writeTentativeNonAutotranslatedSites(): those that have mi in their URL path
152 */
153 public String writeOverseasSitesWithMiInURLPath() {
154 File outFile = new File(outFolder, "5b_counts_overseasSitesWithMiInPath.json");
155
156 String filename = outFile.getAbsolutePath();
157 try (
158 Writer writer = new BufferedWriter(new FileWriter(outFile));
159 ) {
160 boolean isMiInURLPath = true;
161 mongodbAccess.aggregateContainsMRIForOverseas(writer, MongoDBAccess.CONTAINS_MRI, isMiInURLPath);
162 filename = outFile.getCanonicalPath();
163 } catch(Exception e) {
164 logger.error("Unable to write to file " + outFile.getAbsolutePath());
165 logger.error(e.getMessage(), e);
166 }
167
168 System.err.println("*** Wrote file: " + filename);
169 return filename;
170 }
171
172 public static void printUsage() {
173 System.err.println("WebPageURLsListing [domains.txt]");
174 }
175
176 // Depending on args, generates isMRI and containsMRI file listings for:
177 // 1. UNFILTERED: all (NZ + overseas) AND takes manually curated domain list file for overseas and adds all NZ
178
179
180 public static void main(String args[]) {
181 if(args.length >= 2) {
182 printUsage();
183 System.exit(-1);
184 }
185
186 try (
187 MongoDBAccess mongodb = new MongoDBAccess();
188 ) {
189
190 mongodb.connectToDB();
191
192 // output files will be stored in mongodb-data-auto
193 File outFolder = new File("../mongodb-data-auto/").getAbsoluteFile();
194 WebPageURLsListing listing = new WebPageURLsListing(mongodb, outFolder);
195
196
197 if(args.length >= 1) {
198 File domainsFile = new File(args[0]);
199 if(!domainsFile.exists()) {
200 System.err.println("File " + domainsFile + " does not exist");
201 System.exit(-1);
202 }
203
204
205 //int genNumURLs = Integer.parseInt(args[1]);
206 //String isMRIFile = listing.produceURLsForPagesInMRI(domainsFile);
207 //String containsMRIFile = listing.produceURLsForPagesContainingMRI(domainsFile);
208
209 //listing.writeWebPagesOfAllNZSitesAndDomainListing();
210
211 } else {
212
213 String filename = listing.writeTentativeNonAutotranslatedSites();
214 filename = listing.writeOverseasSitesWithMiInURLPath();
215 }
216
217
218 } catch(Exception e) {
219 logger.error(e.getMessage(), e);
220 }
221 }
222}
Note: See TracBrowser for help on using the repository browser.