source: other-projects/maori-lang-detection/src/org/greenstone/atea/WebPageURLsListing.java@ 33880

Last change on this file since 33880 was 33880, checked in by ak19, 4 years ago

Write out the 5counts_tentativeNonAutotranslatedSites.json file with proper pretty printing: needed gson's capabilities to convert mongodb's document result into json string output, as this allows newlines between array elements whereas mongodb's JsonWriter didn't.

File size: 5.4 KB
Line 
1package org.greenstone.atea;
2
3import java.util.*;
4import java.io.*;
5
6import org.apache.log4j.Logger;
7
/**
 * TO COMPILE OR RUN, FIRST DO:
 *    cd maori-lang-detection/apache-opennlp-1.9.1
 *    export OPENNLP_HOME=`pwd`
 *    cd maori-lang-detection/src
 *
 * TO COMPILE:
 *    maori-lang-detection/src$
 *       javac -cp ".:../conf:../lib/*" org/greenstone/atea/WebPageURLsListing.java
 *
 * TO RUN:
 *    maori-lang-detection/src$
 *       java -cp ".:../conf:../lib/*" org/greenstone/atea/WebPageURLsListing ../mongodb-data/domainsNZ_IsMRI.txt
 *
 * NOTE: main() accepts only the domains file as its argument. Earlier
 * instructions passed a trailing URL count (e.g. 255); with that second
 * argument main() prints the usage message and exits.
 */
23public class WebPageURLsListing {
24 static Logger logger = Logger.getLogger(org.greenstone.atea.WebPageURLsListing.class.getName());
25
26 private final MongoDBAccess mongodbAccess;
27 private int numURLs;
28 private File domainsFile;
29
30
31
32 public WebPageURLsListing(MongoDBAccess mongodbAccess,
33 File domainsFile)
34 {
35 this.mongodbAccess = mongodbAccess;
36 this.domainsFile = domainsFile;
37 }
38
39 public WebPageURLsListing(MongoDBAccess mongodbAccess,
40 File domainsFile,
41 int numURLs)
42 {
43 this(mongodbAccess, domainsFile);
44 this.numURLs = numURLs;
45 }
46
47 public String produceURLsForPagesInMRI() {
48 return writeFile(MongoDBAccess.IS_MRI);
49 }
50
51 public String produceURLsForPagesContainingMRI() {
52 return writeFile(MongoDBAccess.CONTAINS_MRI);
53 }
54
55
56 public String writeFile(int filterType) {
57
58 ArrayList<String> urlsList = new ArrayList<String>();
59
60 // 1. read each url from the domainsFile
61 // 1a. do the query
62 // 1b. add the arraylist result to urls
63
64 try (
65 BufferedReader reader = new BufferedReader(new FileReader(domainsFile));
66 ) {
67
68
69 String domain;
70
71 while((domain = reader.readLine()) != null) {
72 domain = domain.trim();
73 if(!domain.equals("")) {
74 ArrayList<String> moreURLs = mongodbAccess.queryAllMatchingURLsFilteredBy(domain, filterType);
75 urlsList.addAll(moreURLs);
76 }
77 }
78 } catch(Exception e) {
79 logger.error("Unable to read URLs from file " + domainsFile.getAbsolutePath());
80 logger.error(e.getMessage(), e);
81 }
82
83 // Shuffle the urlsList, then write out the first numURLs into a file.
84 // https://stackoverflow.com/questions/5505927/how-to-generate-a-random-permutation-in-java
85 File parentFolder = domainsFile.getParentFile();
86 //File outFile = new File(parentFolder, "random"+numURLs+"_"+domainsFile.getName());
87 String fileName = (filterType == MongoDBAccess.IS_MRI) ? "isMRI_" : "containsMRI_";
88 File outFile = new File(parentFolder, fileName+domainsFile.getName());
89
90 // write out ALL the URLs
91 try (
92 Writer writer = new BufferedWriter(new FileWriter(outFile));
93 ) {
94
95 for (int i=0; i < urlsList.size(); i++) {
96 String url = urlsList.get(i);
97 //System.out.println(list.get(i));
98 writer.write(url + "\n");
99 }
100 } catch(Exception e) {
101 logger.error("Unable to write to file " + outFile.getAbsolutePath());
102 logger.error(e.getMessage(), e);
103 }
104
105 /*
106 // shuffle list and take the first n - write to file
107 try (
108 Writer writer = new BufferedWriter(new FileWriter(outFile));
109 ) {
110 Collections.shuffle(urlsList);
111 for (int i=0; i<numURLs; i++) {
112 String url = urlsList.get(i);
113 //System.out.println(list.get(i));
114 writer.write(url + "\n");
115 }
116 } catch(Exception e) {
117 logger.error("Unable to write to file " + outFile.getAbsolutePath());
118 logger.error(e.getMessage(), e);
119 }
120 */
121
122 return outFile.getAbsolutePath();
123 }
124
125 /* ---------------------------------------- */
126
127 /**
128 * Create the file
129 * @return full path of file generated
130 */
131 public String writeTentativeNonAutotranslatedSites() {
132 File outFolder = new File("../mongodb-data/").getAbsoluteFile();
133 File outFile = new File(outFolder, "5counts_tentativeNonAutotranslatedSites.json");
134
135 String filename = outFile.getAbsolutePath();
136
137 try (
138 Writer writer = new BufferedWriter(new FileWriter(outFile));
139 ) {
140 // first write out NZ sites and .nz TLD count and domains
141 mongodbAccess.aggregateContainsMRIForNZ(writer);
142 // next write out all overseas sites and .nz TLD count and domains
143 mongodbAccess.aggregateContainsMRIForOverseas(writer);
144
145 filename = outFile.getCanonicalPath();
146 } catch(Exception e) {
147 logger.error("Unable to write to file " + outFile.getAbsolutePath());
148 logger.error(e.getMessage(), e);
149 }
150
151 return filename;
152 }
153
154
155 public static void printUsage() {
156 System.err.println("WebPageURLsListing [domains.txt]");
157 }
158
159 // Depending on args, generates isMRI and containsMRI file listings for:
160 // 1. UNFILTERED: all (NZ + overseas) AND takes manually curated domain list file for overseas and adds all NZ
161
162
163 public static void main(String args[]) {
164 if(args.length >= 2) {
165 printUsage();
166 System.exit(-1);
167 }
168
169 try (
170 MongoDBAccess mongodb = new MongoDBAccess();
171 ) {
172 File domainsFile = new File(args[0]);
173 if(!domainsFile.exists()) {
174 System.err.println("File " + domainsFile + " does not exist");
175 System.exit(-1);
176 }
177
178 //int genNumURLs = Integer.parseInt(args[1]);
179
180 mongodb.connectToDB();
181
182 WebPageURLsListing listing = new WebPageURLsListing(mongodb, domainsFile);
183 //String isMRIFile = listing.produceURLsForPagesInMRI();
184 //String containsMRIFile = listing.produceURLsForPagesContainingMRI();
185 String filename = listing.writeTentativeNonAutotranslatedSites();
186 System.err.println("Check file: " + filename);
187
188
189 } catch(Exception e) {
190 logger.error(e.getMessage(), e);
191 }
192 }
193}
Note: See TracBrowser for help on using the repository browser.