package org.greenstone.atea;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.concurrent.atomic.AtomicBoolean;

import org.apache.commons.csv.*;
import org.apache.log4j.Logger;

import org.greenstone.util.SafeProcess;

11 | /**
|
---|
12 | * Program to help going through the n number of random sample web page URLs stored in input
|
---|
13 | * csv file, to eyeball whether the full text (stored in mongodb for each) is indeed inMRI
|
---|
14 | * or not. User can enter Y|N|? and ctr-D or ctrl-C to continue working on this later.
|
---|
15 | * The output file is the input filename + .tmp suffix.
|
---|
16 | * When user continues later, the output file from last time must be used as input file.
|
---|
17 | * Any csv records not completed earlier or with ? entered will be presented for input
|
---|
18 | * on (re-)running this program.
|
---|
19 | *
|
---|
20 | * TO COMPILE OR RUN, FIRST DO:
|
---|
21 | * cd maori-lang-detection/apache-opennlp-1.9.1
|
---|
22 | * export OPENNLP_HOME=`pwd`
|
---|
23 | * cd maori-lang-detection/src
|
---|
24 | *
|
---|
25 | * TO COMPILE:
|
---|
26 | * maori-lang-detection/src$
|
---|
27 | * javac -cp ".:../conf:../lib/*" org/greenstone/atea/ManualURLInspection.java
|
---|
28 | *
|
---|
29 | * TO RUN:
|
---|
30 | * maori-lang-detection/src$
|
---|
31 | * java -cp ".:../conf:../lib/*" org/greenstone/atea/ManualURLInspection ../mongodb-data/random260_manualList_globalDomains_whereAPageContainsMRI.txt
|
---|
32 | *
|
---|
33 | */
|
---|
34 | public class ManualURLInspection {
|
---|
35 |
|
---|
36 | static Logger logger = Logger.getLogger(org.greenstone.atea.ManualURLInspection.class.getName());
|
---|
37 |
|
---|
38 | private final MongoDBQueryer mongodbQueryer;
|
---|
39 | private final File outFolder;
|
---|
40 | private final File webPageURLsCSVFile;
|
---|
41 | private final File tmpOutFile;
|
---|
42 |
|
---|
43 | /** csv column numbers */
|
---|
44 | public static final int URL_COLUMN = 0;
|
---|
45 | public static final int COUNTRY_CODE_COLUMN = 1;
|
---|
46 | public static final int IS_REALLY_IN_MRI_COLUMN = 2;
|
---|
47 |
|
---|
48 |
|
---|
49 | public ManualURLInspection(MongoDBQueryer mongodbQueryer, File csvFile)
|
---|
50 | {
|
---|
51 | this.mongodbQueryer = mongodbQueryer;
|
---|
52 | this.webPageURLsCSVFile = csvFile;
|
---|
53 | this.outFolder = csvFile.getParentFile();
|
---|
54 |
|
---|
55 | String tmpFilename = Utility.getFilePath(webPageURLsCSVFile);
|
---|
56 | this.tmpOutFile = new File(tmpFilename+".tmp");
|
---|
57 | }
|
---|
58 |
|
---|
59 | public String getCSVOutputFilename() {
|
---|
60 | return (tmpOutFile == null) ? "" : Utility.getFilePath(tmpOutFile);
|
---|
61 | }
|
---|
62 |
|
---|
63 | /**
|
---|
64 | * Read .csv input file one line at a time.
|
---|
65 | * For each line,
|
---|
66 | * - if empty line empty, skip it.
|
---|
67 | * - If the 3rd column of line is already filled in with Y|N, write out identical line
|
---|
68 | * into tmp output file.
|
---|
69 | * - If third column contains ? or if 3rd column is empty, run a MongoDBQuery to get
|
---|
70 | * the full text of the page and display it on screen.
|
---|
71 | * Wait for user input.
|
---|
72 | * - If Enter hit or Y input, write out Y in 3rd field of line into tmp file.
|
---|
73 | * - If N or ? entered, write out N/? as 3rd field.
|
---|
74 | *
|
---|
75 | * Loop through input csv until finished or until Ctrl-C or Ctrl-D pressed.
|
---|
76 | * Ctrl-D here means end of all user interaction, signalling user wants
|
---|
77 | * to stop entering data and continue later.
|
---|
78 | *
|
---|
79 | * When finished or Ctrl-D entered or Ctrl C pressed, all data entered must have been written
|
---|
80 | * out. So to avoid losing data on Ctrl-Ck, write out each processed csv record (whether
|
---|
81 | * already complete or whether user entry made it complete) and flush writer.
|
---|
82 | * When program terminates in any manner, print message that the file has been created.
|
---|
83 | */
|
---|
84 | public String processCSV() {
|
---|
85 |
|
---|
86 | final String USER_PROMPT = "Enter isMRI value of Y|N|? for (%d): %s - %s > ";
|
---|
87 | //"Enter isMRI value of Y|N|? for (" + count + "): " + url + " - " + countryCode + " > ";
|
---|
88 |
|
---|
89 | boolean terminate = false;
|
---|
90 | CSVParser parser = null;
|
---|
91 |
|
---|
92 | try {
|
---|
93 | parser = CSVParser.parse(webPageURLsCSVFile, java.nio.charset.Charset.forName("US-ASCII"), CSVFormat.RFC4180);
|
---|
94 | } catch(Exception e) {
|
---|
95 | logger.error("Failed to parse input CSV file " + Utility.getFilePath(webPageURLsCSVFile), e);
|
---|
96 | return "Failed";
|
---|
97 | }
|
---|
98 |
|
---|
99 | try (
|
---|
100 | //BufferedWriter writer = new BufferedWriter(new FileWriter(tmpOutFile));
|
---|
101 | CSVPrinter csvWriter = new CSVPrinter(new FileWriter(tmpOutFile), CSVFormat.DEFAULT.withQuoteMode(QuoteMode.MINIMAL));
|
---|
102 | ) {
|
---|
103 |
|
---|
104 | int recordCount = 0;
|
---|
105 | for (CSVRecord csvRecord : parser) {
|
---|
106 | //if(terminate) condition handled further below
|
---|
107 |
|
---|
108 | //logger.debug("Got record: " + csvRecord.toString());
|
---|
109 |
|
---|
110 | //int recordNo = csvRecord.RecordNumber(); // will count empty lines!
|
---|
111 |
|
---|
112 | //if(csvRecord.size() != 0) {
|
---|
113 | String url = csvRecord.get(URL_COLUMN);
|
---|
114 | if(url.equals("")) { // skip empty lines
|
---|
115 | continue;
|
---|
116 | }
|
---|
117 |
|
---|
118 | recordCount++;
|
---|
119 | String countryCode = csvRecord.get(COUNTRY_CODE_COLUMN);
|
---|
120 | String isReallyInMRI = "";
|
---|
121 | //String isReallyInMRI = csvRecord.get(IS_REALLY_IN_MRI_COLUMN);
|
---|
122 | //if(!isReallyInMRI.equals("")) {
|
---|
123 | if(csvRecord.isSet(IS_REALLY_IN_MRI_COLUMN)) {
|
---|
124 | isReallyInMRI = csvRecord.get(IS_REALLY_IN_MRI_COLUMN);
|
---|
125 | }
|
---|
126 |
|
---|
127 | if(terminate || (!isReallyInMRI.equals("") && !isReallyInMRI.equals("?"))) {
|
---|
128 | // if(terminate) on Ctrl-D, don't stop processing csv records
|
---|
129 | // Instead, copy remaining records of input csv file into output csv file
|
---|
130 | isReallyInMRI = isReallyInMRI.toUpperCase();
|
---|
131 | csvWriter.printRecord(url, countryCode, isReallyInMRI);
|
---|
132 | csvWriter.flush();
|
---|
133 | logger.info("Got record " + recordCount + ": " + url + " - " + countryCode
|
---|
134 | + " - " + isReallyInMRI);
|
---|
135 | }
|
---|
136 | else {
|
---|
137 |
|
---|
138 | // First, display full text for web page record with matching url
|
---|
139 | // so the user can look at it to decide whether it is indeed overall in MRI or not.
|
---|
140 | String fulltext = mongodbQueryer.displayFullTextOfPage(url);
|
---|
141 | System.err.println(String.format("FULL-TEXT for record %d:\n%s\n", recordCount, fulltext));
|
---|
142 |
|
---|
143 | //logger.info("Got record " + recordCount + ": " + url + " - " + countryCode);
|
---|
144 |
|
---|
145 | // Read Input until Ctrl-D: read System.In as bufferedReader
|
---|
146 | // https://stackoverflow.com/questions/5837823/read-input-until-controld
|
---|
147 | // Ctrl-C is already taken care if, see
|
---|
148 | // https://coderanch.com/t/279136/java/terminated-program-Control-close-open
|
---|
149 | // "Whenever a process is terminated/killed(CTRL-C), the file descriptors are released. You really do not need to close the stream in such cases."
|
---|
150 | // So I just need to flush the csv print writer after every record is written
|
---|
151 | // and Ctrl-C won't lose any of the data thus far entered by the user.
|
---|
152 |
|
---|
153 | BufferedReader systemIn = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
|
---|
154 |
|
---|
155 | boolean done = false;
|
---|
156 |
|
---|
157 | System.out.println(String.format(USER_PROMPT, recordCount, url, countryCode));
|
---|
158 | boolean previouslyQuestionMark = false;
|
---|
159 | if(isReallyInMRI.equals("?")) {
|
---|
160 | previouslyQuestionMark = true;
|
---|
161 | System.err.println("\t? entered last time");
|
---|
162 | }
|
---|
163 | while(!done && ((isReallyInMRI = systemIn.readLine()) != null)) {
|
---|
164 |
|
---|
165 | isReallyInMRI = isReallyInMRI.toUpperCase();
|
---|
166 |
|
---|
167 | //logger.debug("@@ Got: |" + isReallyInMRI + "|");
|
---|
168 |
|
---|
169 | // if user hit enter, it means they accepted
|
---|
170 | // - that the full text displayed is really in MRI: Y
|
---|
171 | // - the previous value entered if it was a ?
|
---|
172 | if(isReallyInMRI.equals("")) {
|
---|
173 | if(previouslyQuestionMark) {
|
---|
174 | isReallyInMRI = "?";
|
---|
175 | } else {
|
---|
176 | isReallyInMRI = "Y";
|
---|
177 | }
|
---|
178 | }
|
---|
179 |
|
---|
180 |
|
---|
181 | if(isReallyInMRI.equals("Y") || isReallyInMRI.equals("N") || isReallyInMRI.equals("?")) {
|
---|
182 | done = true;
|
---|
183 | //break;
|
---|
184 | } else {
|
---|
185 | System.out.println("@@ UNRECOGNISED. "
|
---|
186 | + String.format(USER_PROMPT, recordCount, url, countryCode));
|
---|
187 | }
|
---|
188 | }
|
---|
189 |
|
---|
190 | // save the record
|
---|
191 | csvWriter.printRecord(url, countryCode, isReallyInMRI);
|
---|
192 | csvWriter.flush();
|
---|
193 |
|
---|
194 | if(isReallyInMRI == null) {
|
---|
195 | terminate = true;
|
---|
196 | System.out.println("User entered Ctrl-D (Lin)/Ctrl-Z (Win) - terminating.");
|
---|
197 | } else {
|
---|
198 | System.out.println("User entered: " + isReallyInMRI);
|
---|
199 | }
|
---|
200 |
|
---|
201 | }
|
---|
202 | }
|
---|
203 | //}
|
---|
204 |
|
---|
205 | } catch(Exception e) {
|
---|
206 | e.printStackTrace();
|
---|
207 | logger.error("Exception occurred when processing CSV file or writing out file:\n"
|
---|
208 | + Utility.getFilePath(tmpOutFile));
|
---|
209 | logger.error(e.getMessage(), e);
|
---|
210 | }
|
---|
211 |
|
---|
212 | //return urlsList;
|
---|
213 | return Utility.getFilePath(tmpOutFile);
|
---|
214 | }
|
---|
215 |
|
---|
216 |
|
---|
217 |
|
---|
218 | public static void printUsage() {
|
---|
219 | System.err.println("Usage: ManualURLInspection webPageURLs.txt");
|
---|
220 | }
|
---|
221 |
|
---|
222 | /**
|
---|
223 | * If no args are passed in, generates complete containsMRI file listings for NZ and overseas web SITES (domains),
|
---|
224 | * with overseas web sites that have mi (mi.* or *\/mi) in the URL path listed separately.
|
---|
225 | * You can then manually inspect the domains in this listing to shortlist which of these sites are not automatically
|
---|
226 | * translated and really contain at least one webpage containing at least one sentence in MRI.
|
---|
227 | * If a file is passed in containing a list of domains, then this first generates a full listing of all webpages
|
---|
228 | * matching isMRI for each site in the domain list. It then generates a smaller set of random webpages matching
|
---|
229 | * isMRI for the pooled sites in the domain list where the sample size of URLs produced is sufficient for giving
|
---|
230 | * 90% confidence with 5% margin of error for testing binary outcomes, see
|
---|
231 | * https://stats.stackexchange.com/questions/207584/sample-size-choice-with-binary-outcome
|
---|
232 | */
|
---|
233 | public static void main(String args[]) {
|
---|
234 | SafeProcess.DEBUG = 1;
|
---|
235 |
|
---|
236 | if(args.length != 1) {
|
---|
237 | printUsage();
|
---|
238 | System.exit(-1);
|
---|
239 | }
|
---|
240 |
|
---|
241 |
|
---|
242 | try (
|
---|
243 | MongoDBQueryer mongodb = new MongoDBQueryer();
|
---|
244 | ) {
|
---|
245 |
|
---|
246 | mongodb.connectToDB();
|
---|
247 |
|
---|
248 | // output files will be stored in mongodb-data-auto
|
---|
249 | File outFolder = new File("../mongodb-data-auto/").getAbsoluteFile();
|
---|
250 |
|
---|
251 |
|
---|
252 | logger.info("*************************************");
|
---|
253 |
|
---|
254 |
|
---|
255 |
|
---|
256 | final File inputFile = new File(args[0]);
|
---|
257 | if(!inputFile.exists()) {
|
---|
258 | logger.info("File " + inputFile + " does not exist");
|
---|
259 | System.exit(-1);
|
---|
260 | }
|
---|
261 |
|
---|
262 | final ManualURLInspection inspector = new ManualURLInspection(mongodb, inputFile);
|
---|
263 |
|
---|
264 |
|
---|
265 | Runtime.getRuntime().addShutdownHook(new Thread(new Runnable() {
|
---|
266 | public void run() {
|
---|
267 | logger.info("@@@@@@@@@@@@@@@@@@@@@@@@");
|
---|
268 | logger.info("WARNING!!!");
|
---|
269 | logger.info("Got Ctrl-C. INCOMPLETE generated temp CSV file: " +
|
---|
270 | inspector.getCSVOutputFilename());
|
---|
271 | logger.info(String.format("Copy remaining records from input file %s into this file.",
|
---|
272 | Utility.getFilePath(inputFile)));
|
---|
273 | logger.info("@@@@@@@@@@@@@@@@@@@@@@@@");
|
---|
274 | }
|
---|
275 | }));
|
---|
276 |
|
---|
277 | String filename = inspector.processCSV();
|
---|
278 |
|
---|
279 | logger.info("Generated temp CSV file: " + filename);
|
---|
280 | logger.info("*************************************");
|
---|
281 | } catch(Exception e) {
|
---|
282 | logger.error(e.getMessage(), e);
|
---|
283 | }
|
---|
284 | }
|
---|
285 | }
|
---|