source: other-projects/maori-lang-detection/src/org/greenstone/atea/ManualURLInspection.java@ 33941

Last change on this file since 33941 was 33941, checked in by ak19, 4 years ago
  1. Uppercase 3rd field (Y/N/? field) read back in from file before being written out again, in case any manual editing of the file outside the program took place. 2. Uppercase 3rd field after reading from std input, saves having to test if condition of uppercased values and ensures that what's written out is uppercase.
File size: 10.4 KB
Line 
1package org.greenstone.atea;
2
3import java.util.*;
4import java.io.*;
5
6import org.apache.commons.csv.*;
7import org.apache.log4j.Logger;
8
9import org.greenstone.util.SafeProcess;
10
11/**
12 * Program to help going through the n number of random sample web page URLs stored in input
13 * csv file, to eyeball whether the full text (stored in mongodb for each) is indeed inMRI
14 * or not. User can enter Y|N|? and ctr-D or ctrl-C to continue working on this later.
15 * The output file is the input filename + .tmp suffix.
16 * When user continues later, the output file from last time must be used as input file.
17 * Any csv records not completed earlier or with ? entered will be presented for input
18 * on (re-)running this program.
19 *
20 * TO COMPILE OR RUN, FIRST DO:
21 * cd maori-lang-detection/apache-opennlp-1.9.1
22 * export OPENNLP_HOME=`pwd`
23 * cd maori-lang-detection/src
24 *
25 * TO COMPILE:
26 * maori-lang-detection/src$
27 * javac -cp ".:../conf:../lib/*" org/greenstone/atea/ManualURLInspection.java
28 *
29 * TO RUN:
30 * maori-lang-detection/src$
31 * java -cp ".:../conf:../lib/*" org/greenstone/atea/ManualURLInspection ../mongodb-data/random260_manualList_globalDomains_whereAPageContainsMRI.txt
32 *
33*/
34public class ManualURLInspection {
35
36 static Logger logger = Logger.getLogger(org.greenstone.atea.ManualURLInspection.class.getName());
37
38 private final MongoDBQueryer mongodbQueryer;
39 private final File outFolder;
40 private final File webPageURLsCSVFile;
41 private final File tmpOutFile;
42
43 /** csv column numbers */
44 public static final int URL_COLUMN = 0;
45 public static final int COUNTRY_CODE_COLUMN = 1;
46 public static final int IS_REALLY_IN_MRI_COLUMN = 2;
47
48
49 public ManualURLInspection(MongoDBQueryer mongodbQueryer, File csvFile)
50 {
51 this.mongodbQueryer = mongodbQueryer;
52 this.webPageURLsCSVFile = csvFile;
53 this.outFolder = csvFile.getParentFile();
54
55 String tmpFilename = Utility.getFilePath(webPageURLsCSVFile);
56 this.tmpOutFile = new File(tmpFilename+".tmp");
57 }
58
59 public String getCSVOutputFilename() {
60 return (tmpOutFile == null) ? "" : Utility.getFilePath(tmpOutFile);
61 }
62
63 /**
64 * Read .csv input file one line at a time.
65 * For each line,
66 * - if empty line empty, skip it.
67 * - If the 3rd column of line is already filled in with Y|N, write out identical line
68 * into tmp output file.
69 * - If third column contains ? or if 3rd column is empty, run a MongoDBQuery to get
70 * the full text of the page and display it on screen.
71 * Wait for user input.
72 * - If Enter hit or Y input, write out Y in 3rd field of line into tmp file.
73 * - If N or ? entered, write out N/? as 3rd field.
74 *
75 * Loop through input csv until finished or until Ctrl-C or Ctrl-D pressed.
76 * Ctrl-D here means end of all user interaction, signalling user wants
77 * to stop entering data and continue later.
78 *
79 * When finished or Ctrl-D entered or Ctrl C pressed, all data entered must have been written
80 * out. So to avoid losing data on Ctrl-Ck, write out each processed csv record (whether
81 * already complete or whether user entry made it complete) and flush writer.
82 * When program terminates in any manner, print message that the file has been created.
83 */
84 public String processCSV() {
85
86 final String USER_PROMPT = "Enter isMRI value of Y|N|? for (%d): %s - %s > ";
87 //"Enter isMRI value of Y|N|? for (" + count + "): " + url + " - " + countryCode + " > ";
88
89 boolean terminate = false;
90 CSVParser parser = null;
91
92 try {
93 parser = CSVParser.parse(webPageURLsCSVFile, java.nio.charset.Charset.forName("US-ASCII"), CSVFormat.RFC4180);
94 } catch(Exception e) {
95 logger.error("Failed to parse input CSV file " + Utility.getFilePath(webPageURLsCSVFile), e);
96 return "Failed";
97 }
98
99 try (
100 //BufferedWriter writer = new BufferedWriter(new FileWriter(tmpOutFile));
101 CSVPrinter csvWriter = new CSVPrinter(new FileWriter(tmpOutFile), CSVFormat.DEFAULT.withQuoteMode(QuoteMode.MINIMAL));
102 ) {
103
104 int recordCount = 0;
105 for (CSVRecord csvRecord : parser) {
106 //if(terminate) condition handled further below
107
108 //logger.debug("Got record: " + csvRecord.toString());
109
110 //int recordNo = csvRecord.RecordNumber(); // will count empty lines!
111
112 //if(csvRecord.size() != 0) {
113 String url = csvRecord.get(URL_COLUMN);
114 if(url.equals("")) { // skip empty lines
115 continue;
116 }
117
118 recordCount++;
119 String countryCode = csvRecord.get(COUNTRY_CODE_COLUMN);
120 String isReallyInMRI = "";
121 //String isReallyInMRI = csvRecord.get(IS_REALLY_IN_MRI_COLUMN);
122 //if(!isReallyInMRI.equals("")) {
123 if(csvRecord.isSet(IS_REALLY_IN_MRI_COLUMN)) {
124 isReallyInMRI = csvRecord.get(IS_REALLY_IN_MRI_COLUMN);
125 }
126
127 if(terminate || (!isReallyInMRI.equals("") && !isReallyInMRI.equals("?"))) {
128 // if(terminate) on Ctrl-D, don't stop processing csv records
129 // Instead, copy remaining records of input csv file into output csv file
130 isReallyInMRI = isReallyInMRI.toUpperCase();
131 csvWriter.printRecord(url, countryCode, isReallyInMRI);
132 csvWriter.flush();
133 logger.info("Got record " + recordCount + ": " + url + " - " + countryCode
134 + " - " + isReallyInMRI);
135 }
136 else {
137
138 // First, display full text for web page record with matching url
139 // so the user can look at it to decide whether it is indeed overall in MRI or not.
140 String fulltext = mongodbQueryer.displayFullTextOfPage(url);
141 System.err.println(String.format("FULL-TEXT for record %d:\n%s\n", recordCount, fulltext));
142
143 //logger.info("Got record " + recordCount + ": " + url + " - " + countryCode);
144
145 // Read Input until Ctrl-D: read System.In as bufferedReader
146 // https://stackoverflow.com/questions/5837823/read-input-until-controld
147 // Ctrl-C is already taken care if, see
148 // https://coderanch.com/t/279136/java/terminated-program-Control-close-open
149 // "Whenever a process is terminated/killed(CTRL-C), the file descriptors are released. You really do not need to close the stream in such cases."
150 // So I just need to flush the csv print writer after every record is written
151 // and Ctrl-C won't lose any of the data thus far entered by the user.
152
153 BufferedReader systemIn = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
154
155 boolean done = false;
156
157 System.out.println(String.format(USER_PROMPT, recordCount, url, countryCode));
158 boolean previouslyQuestionMark = false;
159 if(isReallyInMRI.equals("?")) {
160 previouslyQuestionMark = true;
161 System.err.println("\t? entered last time");
162 }
163 while(!done && ((isReallyInMRI = systemIn.readLine()) != null)) {
164
165 isReallyInMRI = isReallyInMRI.toUpperCase();
166
167 //logger.debug("@@ Got: |" + isReallyInMRI + "|");
168
169 // if user hit enter, it means they accepted
170 // - that the full text displayed is really in MRI: Y
171 // - the previous value entered if it was a ?
172 if(isReallyInMRI.equals("")) {
173 if(previouslyQuestionMark) {
174 isReallyInMRI = "?";
175 } else {
176 isReallyInMRI = "Y";
177 }
178 }
179
180
181 if(isReallyInMRI.equals("Y") || isReallyInMRI.equals("N") || isReallyInMRI.equals("?")) {
182 done = true;
183 //break;
184 } else {
185 System.out.println("@@ UNRECOGNISED. "
186 + String.format(USER_PROMPT, recordCount, url, countryCode));
187 }
188 }
189
190 // save the record
191 csvWriter.printRecord(url, countryCode, isReallyInMRI);
192 csvWriter.flush();
193
194 if(isReallyInMRI == null) {
195 terminate = true;
196 System.out.println("User entered Ctrl-D (Lin)/Ctrl-Z (Win) - terminating.");
197 } else {
198 System.out.println("User entered: " + isReallyInMRI);
199 }
200
201 }
202 }
203 //}
204
205 } catch(Exception e) {
206 e.printStackTrace();
207 logger.error("Exception occurred when processing CSV file or writing out file:\n"
208 + Utility.getFilePath(tmpOutFile));
209 logger.error(e.getMessage(), e);
210 }
211
212 //return urlsList;
213 return Utility.getFilePath(tmpOutFile);
214 }
215
216
217
218 public static void printUsage() {
219 System.err.println("Usage: ManualURLInspection webPageURLs.txt");
220 }
221
222 /**
223 * If no args are passed in, generates complete containsMRI file listings for NZ and overseas web SITES (domains),
224 * with overseas web sites that have mi (mi.* or *\/mi) in the URL path listed separately.
225 * You can then manually inspect the domains in this listing to shortlist which of these sites are not automatically
226 * translated and really contain at least one webpage containing at least one sentence in MRI.
227 * If a file is passed in containing a list of domains, then this first generates a full listing of all webpages
228 * matching isMRI for each site in the domain list. It then generates a smaller set of random webpages matching
229 * isMRI for the pooled sites in the domain list where the sample size of URLs produced is sufficient for giving
230 * 90% confidence with 5% margin of error for testing binary outcomes, see
231 * https://stats.stackexchange.com/questions/207584/sample-size-choice-with-binary-outcome
232 */
233 public static void main(String args[]) {
234 SafeProcess.DEBUG = 1;
235
236 if(args.length != 1) {
237 printUsage();
238 System.exit(-1);
239 }
240
241
242 try (
243 MongoDBQueryer mongodb = new MongoDBQueryer();
244 ) {
245
246 mongodb.connectToDB();
247
248 // output files will be stored in mongodb-data-auto
249 File outFolder = new File("../mongodb-data-auto/").getAbsoluteFile();
250
251
252 logger.info("*************************************");
253
254
255
256 final File inputFile = new File(args[0]);
257 if(!inputFile.exists()) {
258 logger.info("File " + inputFile + " does not exist");
259 System.exit(-1);
260 }
261
262 final ManualURLInspection inspector = new ManualURLInspection(mongodb, inputFile);
263
264
265 Runtime.getRuntime().addShutdownHook(new Thread(new Runnable() {
266 public void run() {
267 logger.info("@@@@@@@@@@@@@@@@@@@@@@@@");
268 logger.info("WARNING!!!");
269 logger.info("Got Ctrl-C. INCOMPLETE generated temp CSV file: " +
270 inspector.getCSVOutputFilename());
271 logger.info(String.format("Copy remaining records from input file %s into this file.",
272 Utility.getFilePath(inputFile)));
273 logger.info("@@@@@@@@@@@@@@@@@@@@@@@@");
274 }
275 }));
276
277 String filename = inspector.processCSV();
278
279 logger.info("Generated temp CSV file: " + filename);
280 logger.info("*************************************");
281 } catch(Exception e) {
282 logger.error(e.getMessage(), e);
283 }
284 }
285}
Note: See TracBrowser for help on using the repository browser.