source: other-projects/maori-lang-detection/src/org/greenstone/atea/ManualURLInspection.java@ 33940

Last change on this file since 33940 was 33940, checked in by ak19, 4 years ago
  1. In order to make it easier to do the manual work of inspecting 260 web pages, there's now a new class ManualURLInspection.java that will automate the part where I have to paste a URL into mongodb and press Find to locate the matching web page's record and then view the record. The new class takes the input CSV file and for each of the 260 web page URLs of the random sample, it gets the full text from mongodb and displays it on the console and lets me enter Y/N/? for whether I think the overall page is in MRI or not. Then the updated records are written into a tmp file, as well as any remaining ones I haven't gone through yet. If I'm satisfied, I'll replace the original sample csv file with the new one and can continue from where I left off later. Responds to Ctrl-D for end of input. Some handling of Ctrl-C: each processed line is written out, but user is instructed to manually append the unprocessed records to the end of the temp output file before replacing original csv input file with output file to continue working later. 2. Needed to update commons-csv jar file to have access to additional method. 3. Needed a new utility method in MongoDBQueryer. 4. Minor changes to SummaryTool.java. 5. Processed first 190 of 260 random sample web urls using my helpful new class ManualURLInspection.java. The first 15 URLs were done totally manually, the remainder with my new class. The new class made the work very much faster and far less tedious.
File size: 10.4 KB
Line 
1package org.greenstone.atea;
2
3import java.util.*;
4import java.io.*;
5
6import org.apache.commons.csv.*;
7import org.apache.log4j.Logger;
8
9import org.greenstone.util.SafeProcess;
10
11/**
12 * Program to help going through the n number of random sample web page URLs stored in input
13 * csv file, to eyeball whether the full text (stored in mongodb for each) is indeed inMRI
14 * or not. User can enter Y|N|? and ctr-D or ctrl-C to continue working on this later.
15 * The output file is the input filename + .tmp suffix.
16 * When user continues later, the output file from last time must be used as input file.
17 * Any csv records not completed earlier or with ? entered will be presented for input
18 * on (re-)running this program.
19 *
20 * TO COMPILE OR RUN, FIRST DO:
21 * cd maori-lang-detection/apache-opennlp-1.9.1
22 * export OPENNLP_HOME=`pwd`
23 * cd maori-lang-detection/src
24 *
25 * TO COMPILE:
26 * maori-lang-detection/src$
27 * javac -cp ".:../conf:../lib/*" org/greenstone/atea/ManualURLInspection.java
28 *
29 * TO RUN:
30 * maori-lang-detection/src$
31 * java -cp ".:../conf:../lib/*" org/greenstone/atea/ManualURLInspection ../mongodb-data/random260_manualList_globalDomains_whereAPageContainsMRI.txt
32 *
33*/
34public class ManualURLInspection {
35
36 static Logger logger = Logger.getLogger(org.greenstone.atea.ManualURLInspection.class.getName());
37
38 private final MongoDBQueryer mongodbQueryer;
39 private final File outFolder;
40 private final File webPageURLsCSVFile;
41 private final File tmpOutFile;
42
43 /** csv column numbers */
44 public static final int URL_COLUMN = 0;
45 public static final int COUNTRY_CODE_COLUMN = 1;
46 public static final int IS_REALLY_IN_MRI_COLUMN = 2;
47
48
49 public ManualURLInspection(MongoDBQueryer mongodbQueryer, File csvFile)
50 {
51 this.mongodbQueryer = mongodbQueryer;
52 this.webPageURLsCSVFile = csvFile;
53 this.outFolder = csvFile.getParentFile();
54
55 String tmpFilename = Utility.getFilePath(webPageURLsCSVFile);
56 this.tmpOutFile = new File(tmpFilename+".tmp");
57 }
58
59 public String getCSVOutputFilename() {
60 return (tmpOutFile == null) ? "" : Utility.getFilePath(tmpOutFile);
61 }
62
63 /**
64 * Read .csv input file one line at a time.
65 * For each line,
66 * - if empty line empty, skip it.
67 * - If the 3rd column of line is already filled in with Y|N, write out identical line
68 * into tmp output file.
69 * - If third column contains ? or if 3rd column is empty, run a MongoDBQuery to get
70 * the full text of the page and display it on screen.
71 * Wait for user input.
72 * - If Enter hit or Y input, write out Y in 3rd field of line into tmp file.
73 * - If N or ? entered, write out N/? as 3rd field.
74 *
75 * Loop through input csv until finished or until Ctrl-C or Ctrl-D pressed.
76 * Ctrl-D here means end of all user interaction, signalling user wants
77 * to stop entering data and continue later.
78 *
79 * When finished or Ctrl-D entered or Ctrl C pressed, all data entered must have been written
80 * out. So to avoid losing data on Ctrl-Ck, write out each processed csv record (whether
81 * already complete or whether user entry made it complete) and flush writer.
82 * When program terminates in any manner, print message that the file has been created.
83 */
84 public String processCSV() {
85
86 final String USER_PROMPT = "Enter isMRI value of Y|N|? for (%d): %s - %s > ";
87 //"Enter isMRI value of Y|N|? for (" + count + "): " + url + " - " + countryCode + " > ";
88
89 boolean terminate = false;
90 CSVParser parser = null;
91
92 try {
93 parser = CSVParser.parse(webPageURLsCSVFile, java.nio.charset.Charset.forName("US-ASCII"), CSVFormat.RFC4180);
94 } catch(Exception e) {
95 logger.error("Failed to parse input CSV file " + Utility.getFilePath(webPageURLsCSVFile), e);
96 return "Failed";
97 }
98
99 try (
100 //BufferedWriter writer = new BufferedWriter(new FileWriter(tmpOutFile));
101 CSVPrinter csvWriter = new CSVPrinter(new FileWriter(tmpOutFile), CSVFormat.DEFAULT.withQuoteMode(QuoteMode.MINIMAL));
102 ) {
103
104 int recordCount = 0;
105 for (CSVRecord csvRecord : parser) {
106 //if(terminate) condition handled further below
107
108 //logger.debug("Got record: " + csvRecord.toString());
109
110 //int recordNo = csvRecord.RecordNumber(); // will count empty lines!
111
112 //if(csvRecord.size() != 0) {
113 String url = csvRecord.get(URL_COLUMN);
114 if(url.equals("")) { // skip empty lines
115 continue;
116 }
117
118 recordCount++;
119 String countryCode = csvRecord.get(COUNTRY_CODE_COLUMN);
120 String isReallyInMRI = "";
121 //String isReallyInMRI = csvRecord.get(IS_REALLY_IN_MRI_COLUMN);
122 //if(!isReallyInMRI.equals("")) {
123 if(csvRecord.isSet(IS_REALLY_IN_MRI_COLUMN)) {
124 isReallyInMRI = csvRecord.get(IS_REALLY_IN_MRI_COLUMN);
125 }
126
127 if(terminate || (!isReallyInMRI.equals("") && !isReallyInMRI.equals("?"))) {
128 // if(terminate) on Ctrl-D, don't stop processing csv records
129 // Instead, copy remaining records of input csv file into output csv file
130
131 csvWriter.printRecord(url, countryCode, isReallyInMRI);
132 csvWriter.flush();
133 logger.info("Got record " + recordCount + ": " + url + " - " + countryCode
134 + " - " + isReallyInMRI);
135 }
136 else {
137
138 // First, display full text for web page record with matching url
139 // so the user can look at it to decide whether it is indeed overall in MRI or not.
140 String fulltext = mongodbQueryer.displayFullTextOfPage(url);
141 System.err.println(String.format("FULL-TEXT for record %d:\n%s\n", recordCount, fulltext));
142
143 //logger.info("Got record " + recordCount + ": " + url + " - " + countryCode);
144
145 // Read Input until Ctrl-D: read System.In as bufferedReader
146 // https://stackoverflow.com/questions/5837823/read-input-until-controld
147 // Ctrl-C is already taken care if, see
148 // https://coderanch.com/t/279136/java/terminated-program-Control-close-open
149 // "Whenever a process is terminated/killed(CTRL-C), the file descriptors are released. You really do not need to close the stream in such cases."
150 // So I just need to flush the csv print writer after every record is written
151 // and Ctrl-C won't lose any of the data thus far entered by the user.
152
153 BufferedReader systemIn = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
154
155 boolean done = false;
156
157 System.out.println(String.format(USER_PROMPT, recordCount, url, countryCode));
158 boolean previouslyQuestionMark = false;
159 if(isReallyInMRI.equals("?")) {
160 previouslyQuestionMark = true;
161 System.err.println("\t? entered last time");
162 }
163 while(!done && ((isReallyInMRI = systemIn.readLine()) != null)) {
164
165 //logger.debug("@@ Got: |" + isReallyInMRI + "|");
166
167 // if user hit enter, it means they accepted
168 // - that the full text displayed is really in MRI: Y
169 // - the previous value entered if it was a ?
170 if(isReallyInMRI.equals("")) {
171 if(previouslyQuestionMark) {
172 isReallyInMRI = "?";
173 } else {
174 isReallyInMRI = "Y";
175 }
176 }
177
178 if(isReallyInMRI.toUpperCase().equals("Y")
179 || isReallyInMRI.toUpperCase().equals("N")
180 || isReallyInMRI.equals("?")) {
181 done = true;
182 //break;
183 } else {
184 System.out.println("@@ UNRECOGNISED. "
185 + String.format(USER_PROMPT, recordCount, url, countryCode));
186 }
187 }
188
189 // save the record
190 csvWriter.printRecord(url, countryCode, isReallyInMRI);
191 csvWriter.flush();
192
193 if(isReallyInMRI == null) {
194 terminate = true;
195 System.out.println("User entered Ctrl-D (Lin)/Ctrl-Z (Win) - terminating.");
196 } else {
197 System.out.println("User entered: " + isReallyInMRI);
198 }
199
200 }
201 }
202 //}
203
204 } catch(Exception e) {
205 e.printStackTrace();
206 logger.error("Exception occurred when processing CSV file or writing out file:\n"
207 + Utility.getFilePath(tmpOutFile));
208 logger.error(e.getMessage(), e);
209 }
210
211 //return urlsList;
212 return Utility.getFilePath(tmpOutFile);
213 }
214
215
216
217 public static void printUsage() {
218 System.err.println("Usage: ManualURLInspection webPageURLs.txt");
219 }
220
221 /**
222 * If no args are passed in, generates complete containsMRI file listings for NZ and overseas web SITES (domains),
223 * with overseas web sites that have mi (mi.* or *\/mi) in the URL path listed separately.
224 * You can then manually inspect the domains in this listing to shortlist which of these sites are not automatically
225 * translated and really contain at least one webpage containing at least one sentence in MRI.
226 * If a file is passed in containing a list of domains, then this first generates a full listing of all webpages
227 * matching isMRI for each site in the domain list. It then generates a smaller set of random webpages matching
228 * isMRI for the pooled sites in the domain list where the sample size of URLs produced is sufficient for giving
229 * 90% confidence with 5% margin of error for testing binary outcomes, see
230 * https://stats.stackexchange.com/questions/207584/sample-size-choice-with-binary-outcome
231 */
232 public static void main(String args[]) {
233 SafeProcess.DEBUG = 1;
234
235 if(args.length != 1) {
236 printUsage();
237 System.exit(-1);
238 }
239
240
241 try (
242 MongoDBQueryer mongodb = new MongoDBQueryer();
243 ) {
244
245 mongodb.connectToDB();
246
247 // output files will be stored in mongodb-data-auto
248 File outFolder = new File("../mongodb-data-auto/").getAbsoluteFile();
249
250
251 logger.info("*************************************");
252
253
254
255 final File inputFile = new File(args[0]);
256 if(!inputFile.exists()) {
257 logger.info("File " + inputFile + " does not exist");
258 System.exit(-1);
259 }
260
261 final ManualURLInspection inspector = new ManualURLInspection(mongodb, inputFile);
262
263
264 Runtime.getRuntime().addShutdownHook(new Thread(new Runnable() {
265 public void run() {
266 logger.info("@@@@@@@@@@@@@@@@@@@@@@@@");
267 logger.info("WARNING!!!");
268 logger.info("Got Ctrl-C. INCOMPLETE generated temp CSV file: " +
269 inspector.getCSVOutputFilename());
270 logger.info(String.format("Copy remaining records from input file %s into this file.",
271 Utility.getFilePath(inputFile)));
272 logger.info("@@@@@@@@@@@@@@@@@@@@@@@@");
273 }
274 }));
275
276 String filename = inspector.processCSV();
277
278 logger.info("Generated temp CSV file: " + filename);
279 logger.info("*************************************");
280 } catch(Exception e) {
281 logger.error(e.getMessage(), e);
282 }
283 }
284}
Note: See TracBrowser for help on using the repository browser.