source: other-projects/maori-lang-detection/src/org/greenstone/atea/ManualURLInspection.java@ 33963

Last change on this file since 33963 was 33963, checked in by ak19, 4 years ago

Added a new helper method to MongoDBQueryer.java to add numPagesInMRI and totalPages totals columns into the csv file

File size: 29.8 KB
Line 
1package org.greenstone.atea;
2
3import java.util.*;
4import java.io.*;
5
6import org.apache.commons.csv.*;
7import org.apache.log4j.Logger;
8
9import org.greenstone.util.SafeProcess;
10
11/**
12 * Program to help go through the n random sample web page URLs stored in the input
13 * csv file, to eyeball whether the full text (stored in mongodb for each) is indeed in MRI
14 * or not. The user can enter Y|N|?, or press Ctrl-D or Ctrl-C to stop and continue later.
15 * The output file is the input filename + .tmp suffix.
16 * When user continues later, the output file from last time must be used as input file.
17 * Any csv records not completed earlier or with ? entered will be presented for input
18 * on (re-)running this program.
19 *
20 * TO COMPILE OR RUN, FIRST DO:
21 * cd maori-lang-detection/apache-opennlp-1.9.1
22 * export OPENNLP_HOME=`pwd`
23 * cd maori-lang-detection/src
24 *
25 * TO COMPILE:
26 * maori-lang-detection/src$
27 * javac -cp ".:../conf:../lib/*" org/greenstone/atea/ManualURLInspection.java
28 *
29 * TO RUN:
30 * maori-lang-detection/src$
31 * java -cp ".:../conf:../lib/*" org/greenstone/atea/ManualURLInspection ../mongodb-data/random260_manualList_globalDomains_whereAPageContainsMRI.txt
32 *
33*/
34public class ManualURLInspection {
35
36 static Logger logger = Logger.getLogger(org.greenstone.atea.ManualURLInspection.class.getName());
37
38 private final MongoDBQueryer mongodbQueryer;
39 private final File outFolder;
40 private final File webPageURLsCSVFile;
41 private final File tmpOutFile;
42
43 /** csv column numbers */
44 public static final int URL_COLUMN = 0;
45 public static final int COUNTRY_CODE_COLUMN = 1;
46 public static final int IS_REALLY_IN_MRI_COLUMN = 2;
47 public static final int QUALITY_LEVEL_COLUMN = 3;
48 public static final int COUNT_OF_PAGES_IN_MRI_COLUMN = 4; // count as detected by OpenNLP
49 public static final int TOTAL_PAGES_IN_SITE_COLUMN = 5;
50
51 /** Possible values for the Quality Level column of the csv file */
52 public static final String NAV = "NAV";
53 public static final String LITTLE_TEXT = "LITTLE_TEXT";
54 public static final String MIXED_TEXT = "MIXED_TEXT";
55 public static final String SIGNIFICANTLY_MAORI = "SIGNIFICANTLY_MAORI";
56 public static final String MAORI_PARAGRAPHS = "MAORI_PARAGRAPHS";
57 public static final String WORDS = "WORDS"; // words or titles, not full sentences
58 public static final String OTHER_LANGUAGES = "OTHER_LANGUAGES";
59 public static final String POEMS_OR_SONGS = "POEMS_OR_SONGS";
60 public static final String SINGLE_MRI_SENTENCE = "SINGLE_MRI_SENTENCE"; // TODO: REVIEW
61 public static final String LINK_TEXT = "LINK_TEXT"; // for office positions designations and link text
62
63
64 public ManualURLInspection(MongoDBQueryer mongodbQueryer, File csvFile)
65 {
66 this.mongodbQueryer = mongodbQueryer;
67 this.webPageURLsCSVFile = csvFile;
68 this.outFolder = csvFile.getParentFile();
69
70 String tmpFilename = Utility.getFilePath(webPageURLsCSVFile);
71 this.tmpOutFile = new File(tmpFilename+".tmp");
72 }
73
74 public String getCSVOutputFilename() {
75 return (tmpOutFile == null) ? "" : Utility.getFilePath(tmpOutFile);
76 }
77
78 /**
79 * Read .csv input file one line at a time.
80 * For each line,
81 * - if empty line empty, skip it.
82 * - If the 3rd column of line is already filled in with Y|N, write out identical line
83 * into tmp output file.
84 * - If third column contains ? or if 3rd column is empty, run a MongoDBQuery to get
85 * the full text of the page and display it on screen.
86 * Wait for user input.
87 * - If Enter hit or Y input, write out Y in 3rd field of line into tmp file.
88 * - If N or ? entered, write out N/? as 3rd field.
89 *
90 * Loop through input csv until finished or until Ctrl-C or Ctrl-D pressed.
91 * Ctrl-D here means end of all user interaction, signalling user wants
92 * to stop entering data and continue later.
93 *
94 * When finished or Ctrl-D entered or Ctrl C pressed, all data entered must have been written
95 * out. So to avoid losing data on Ctrl-Ck, write out each processed csv record (whether
96 * already complete or whether user entry made it complete) and flush writer.
97 * When program terminates in any manner, print message that the file has been created.
98 */
99 public String processCSV() {
100
101 final String USER_PROMPT = "Enter isMRI value of Y|N|? for (%d): %s - %s > ";
102 //"Enter isMRI value of Y|N|? for (" + count + "): " + url + " - " + countryCode + " > ";
103
104 boolean terminate = false;
105 CSVParser parser = null;
106
107 try {
108 parser = CSVParser.parse(webPageURLsCSVFile, java.nio.charset.Charset.forName("US-ASCII"), CSVFormat.RFC4180);
109 } catch(Exception e) {
110 logger.error("Failed to parse input CSV file " + Utility.getFilePath(webPageURLsCSVFile), e);
111 return "Failed";
112 }
113
114 try (
115 //BufferedWriter writer = new BufferedWriter(new FileWriter(tmpOutFile));
116 CSVPrinter csvWriter = new CSVPrinter(new FileWriter(tmpOutFile), CSVFormat.DEFAULT.withQuoteMode(QuoteMode.MINIMAL));
117 ) {
118
119 int recordCount = 0;
120 for (CSVRecord csvRecord : parser) {
121 //if(terminate) condition handled further below
122
123 //logger.debug("Got record: " + csvRecord.toString());
124
125 //int recordNo = csvRecord.RecordNumber(); // will count empty lines!
126
127 //if(csvRecord.size() != 0) {
128 String url = csvRecord.get(URL_COLUMN);
129 if(url.equals("")) { // skip empty lines
130 continue;
131 }
132
133 recordCount++;
134 String countryCode = csvRecord.get(COUNTRY_CODE_COLUMN);
135 String isReallyInMRI = "";
136 String qualityLevel = null;
137
138 //String isReallyInMRI = csvRecord.get(IS_REALLY_IN_MRI_COLUMN);
139 //if(!isReallyInMRI.equals("")) {
140 if(csvRecord.isSet(IS_REALLY_IN_MRI_COLUMN)) {
141 isReallyInMRI = csvRecord.get(IS_REALLY_IN_MRI_COLUMN);
142 }
143
144 if(csvRecord.isSet(QUALITY_LEVEL_COLUMN)) {
145 qualityLevel = csvRecord.get(QUALITY_LEVEL_COLUMN);
146 }
147
148 if(terminate || (!isReallyInMRI.equals("") && !isReallyInMRI.equals("?"))) {
149 // if(terminate) on Ctrl-D, don't stop processing csv records
150 // Instead, copy remaining records of input csv file into output csv file
151 isReallyInMRI = isReallyInMRI.toUpperCase();
152 if(qualityLevel == null) {
153 csvWriter.printRecord(url, countryCode, isReallyInMRI);
154 } else {
155 csvWriter.printRecord(url, countryCode, isReallyInMRI, qualityLevel);
156 }
157 csvWriter.flush();
158 logger.info("Got record " + recordCount + ": " + url + " - " + countryCode
159 + " - " + isReallyInMRI + " - " + qualityLevel);
160 }
161 else {
162
163 // First, display full text for web page record with matching url
164 // so the user can look at it to decide whether it is indeed overall in MRI or not.
165 String fulltext = mongodbQueryer.displayFullTextOfPage(url);
166 System.err.println(String.format("FULL-TEXT for record %d:\n%s\n", recordCount, fulltext));
167
168 //logger.info("Got record " + recordCount + ": " + url + " - " + countryCode + " - " + qualityLevel);
169
170 // Read Input until Ctrl-D: read System.In as bufferedReader
171 // https://stackoverflow.com/questions/5837823/read-input-until-controld
172 // Ctrl-C is already taken care if, see
173 // https://coderanch.com/t/279136/java/terminated-program-Control-close-open
174 // "Whenever a process is terminated/killed(CTRL-C), the file descriptors are released. You really do not need to close the stream in such cases."
175 // So I just need to flush the csv print writer after every record is written
176 // and Ctrl-C won't lose any of the data thus far entered by the user.
177
178 BufferedReader systemIn = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
179
180 boolean done = false;
181
182 System.out.println(String.format(USER_PROMPT, recordCount, url, countryCode));
183 boolean previouslyQuestionMark = false;
184 if(isReallyInMRI.equals("?")) {
185 previouslyQuestionMark = true;
186 System.err.println("\t? entered last time");
187 }
188 while(!done && ((isReallyInMRI = systemIn.readLine()) != null)) {
189
190 isReallyInMRI = isReallyInMRI.toUpperCase();
191
192 //logger.debug("@@ Got: |" + isReallyInMRI + "|");
193
194 // if user hit enter, it means they accepted
195 // - that the full text displayed is really in MRI: Y
196 // - the previous value entered if it was a ?
197 if(isReallyInMRI.equals("")) {
198 if(previouslyQuestionMark) {
199 isReallyInMRI = "?";
200 } else {
201 isReallyInMRI = "Y";
202 }
203 }
204
205
206 if(isReallyInMRI.equals("Y") || isReallyInMRI.equals("N") || isReallyInMRI.equals("?")) {
207 done = true;
208 //break;
209 } else {
210 System.out.println("@@ UNRECOGNISED. "
211 + String.format(USER_PROMPT, recordCount, url, countryCode));
212 }
213 }
214
215 // Save the CSV record - even if quality level is null
216 // Because we don't want to lose the line that used to exist in the file
217 if(qualityLevel == null) {
218 csvWriter.printRecord(url, countryCode, isReallyInMRI);
219 } else {
220 csvWriter.printRecord(url, countryCode, isReallyInMRI, qualityLevel);
221 }
222 csvWriter.flush();
223
224 if(isReallyInMRI == null) { // if sys.in readLine() was terminated with Ctrl-D
225 terminate = true;
226 System.out.println("User entered Ctrl-D (Lin)/Ctrl-Z (Win) - terminating.");
227 } else {
228 System.out.println("User entered: " + isReallyInMRI);
229 }
230
231 }
232 }
233 //}
234
235 } catch(Exception e) {
236 e.printStackTrace();
237 logger.error("Exception occurred when processing CSV file or writing out file:\n"
238 + Utility.getFilePath(tmpOutFile));
239 logger.error(e.getMessage(), e);
240 }
241
242 //return urlsList;
243 return Utility.getFilePath(tmpOutFile);
244 }
245
246 /**
247 * Similar to processCSV() above, but for entering the page quality level of each web page
248 * This goes into the QUALITY_LEVEL_COLUMN column of the csv file.
249 * Web pages from some web sites commonly recurring in the csv input file tend to be largely
250 * navigation menus, so preset to NAV. Others are known to be low quality for text resources
251 * as they only have nav menus and pictures despite these being largely in Māori,
252 * which can also go under NAV.
253 * Other web sites have little text overall whether Māori or mixed with English, nav included,
254 * (LITTLE_TEXT), or significantly mixed (MRI+ENG/...) text even if a decent amount of text
255 * (MIXED_TEXT). Some sites may largely have standalone words for learning (WORDS).
256 * Other than known websites that have regular content of one of the above types,
257 * the user can enter these values for rarer websites whose web pages may pop up:
258 * NAV, LITTLE_TEXT, MIXED_TEXT, WORDS, SIGNIFICANTLY_MAORI (for decent amounts of MRI text)
259 * MAORI_PARAGRAPHS (for largely continuous paras in MRI even if there are paras in other
260 * langs) and OTHER_LANGUAGES if text not in MRI but mostly in other language,
261 * POEMS_OR_SONGS for content that's largely songs or poetry.
262 */
263 public String processCSV_QualityLevelColumn() {
264
265 Map<String, String> predefinedDefaultsMap = new HashMap<String, String>();
266 predefinedDefaultsMap.put("tetaurawhiri.govt.nz", NAV);
267 predefinedDefaultsMap.put("tmoa.tki.org.nz", SIGNIFICANTLY_MAORI);
268 predefinedDefaultsMap.put("paekupu.co.nz", MIXED_TEXT); // html is mixed, but display is more MRI
269 predefinedDefaultsMap.put("m.biblepub.com", SIGNIFICANTLY_MAORI);
270 predefinedDefaultsMap.put("biblehub.com", SIGNIFICANTLY_MAORI);
271 predefinedDefaultsMap.put("pukoro.co.nz", WORDS);
272 predefinedDefaultsMap.put("mi.wikipedia.org", MIXED_TEXT);
273 predefinedDefaultsMap.put("mi.m.wikipedia.org", WORDS);
274 predefinedDefaultsMap.put("tkkmmokopuna.school.nz", NAV);
275 predefinedDefaultsMap.put("twtop.school.nz", NAV);
276 predefinedDefaultsMap.put("animations.tewhanake.maori.nz", MAORI_PARAGRAPHS);
277 predefinedDefaultsMap.put("csunplugged.org", SIGNIFICANTLY_MAORI);
278 predefinedDefaultsMap.put("waiata.maori.nz", POEMS_OR_SONGS);
279
280 final String USER_PROMPT = "Enter qualityLevel value of\n\t? | (N)AV | (L)ITTLE_TEXT | (M)IXED_TEXT | (S)IGNIFICANTLY_MAORI | MAORI_(P)ARAGRAPHS"
281 + "\n\t | LINK_(T)EXT | PO(E)MS_OR_SONGS | S(I)NGLE_MRI_SENTENCE | (W)ORDS | (O)THER_LANGUAGES\n\tfor (%d): %s - %s > ";
282 //"Enter isMRI value of Y|N|? for (" + count + "): " + url + " - " + countryCode + " > ";
283
284 boolean terminate = false;
285 CSVParser parser = null;
286
287 try {
288 parser = CSVParser.parse(webPageURLsCSVFile, java.nio.charset.Charset.forName("US-ASCII"), CSVFormat.RFC4180);
289 } catch(Exception e) {
290 logger.error("Failed to parse input CSV file " + Utility.getFilePath(webPageURLsCSVFile), e);
291 return "Failed";
292 }
293
294 try (
295 CSVPrinter csvWriter = new CSVPrinter(new FileWriter(tmpOutFile), CSVFormat.DEFAULT.withQuoteMode(QuoteMode.MINIMAL));
296 ) {
297
298 int recordCount = 0;
299 for (CSVRecord csvRecord : parser) {
300 //if(terminate) condition handled further below
301
302 //logger.debug("Got record: " + csvRecord.toString());
303
304 String url = csvRecord.get(URL_COLUMN);
305 if(url.equals("")) { // skip empty lines
306 continue;
307 }
308
309 recordCount++;
310 String countryCode = csvRecord.get(COUNTRY_CODE_COLUMN);
311 String isReallyInMRI = "";
312 String qualityLevel = "";
313
314 if(csvRecord.isSet(IS_REALLY_IN_MRI_COLUMN)) {
315 isReallyInMRI = csvRecord.get(IS_REALLY_IN_MRI_COLUMN);
316 }
317
318 if(csvRecord.isSet(QUALITY_LEVEL_COLUMN)) {
319 qualityLevel = csvRecord.get(QUALITY_LEVEL_COLUMN);
320
321 // Force valid values or ""
322 qualityLevel = getFullQualityLevelNameUppercased(qualityLevel);
323 }
324
325 if(terminate || (!qualityLevel.equals("") && !qualityLevel.equals("?"))) {
326 // if(terminate) on Ctrl-D, don't stop processing csv records
327 // Instead, copy remaining records of input csv file into output csv file
328
329 csvWriter.printRecord(url, countryCode, isReallyInMRI, qualityLevel);
330 csvWriter.flush();
331 logger.info("Got record " + recordCount + ": " + url + " - " + countryCode
332 + " - " + isReallyInMRI + " - " + qualityLevel);
333 }
334 else {
335
336 // First, display full text for web page record with matching url
337 // so the user can look at it to decide whether it is indeed overall in MRI or not.
338 String fulltext = mongodbQueryer.displayFullTextOfPage(url);
339 System.err.println(String.format("\nFULL-TEXT for record %d:\n%s\n", recordCount, fulltext));
340
341 //logger.info("Got record " + recordCount + ": " + url + " - " + countryCode + " - " + qualityLevel);
342
343 // Read Input until Ctrl-D: read System.In as bufferedReader
344 // https://stackoverflow.com/questions/5837823/read-input-until-controld
345 // Ctrl-C is already taken care if, see
346 // https://coderanch.com/t/279136/java/terminated-program-Control-close-open
347 // "Whenever a process is terminated/killed(CTRL-C), the file descriptors are released. You really do not need to close the stream in such cases."
348 // So I just need to flush the csv print writer after every record is written
349 // and Ctrl-C won't lose any of the data thus far entered by the user.
350
351 BufferedReader systemIn = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
352
353 boolean done = false;
354
355 // Work out default if basic URLs present in defaults map
356 // If it is, use its value as default for this URL
357 String basicURL = Utility.stripProtocolAndWWWFromURL(Utility.getDomainForURL(url, false));
358 String predefQualityLevel = predefinedDefaultsMap.get(basicURL);
359
360 System.out.println(String.format(USER_PROMPT, recordCount, url, countryCode));
361 if(predefQualityLevel != null) {
362 System.err.println("\tDefault for this domain: " + predefQualityLevel
363 + ". Press Enter to accept >");
364 }
365
366 boolean previouslyQuestionMark = false;
367 String oldQualityLevel = qualityLevel;
368
369 if(qualityLevel.equals("?")) {
370 previouslyQuestionMark = true;
371 System.err.println("\t? entered last time. Press Enter to keep >");
372 }
373 while(!done && ((qualityLevel = systemIn.readLine()) != null)) {
374 //logger.debug("@@ Got: |" + qualityLevel + "|");
375
376 // If the user hit enter, it means they accepted
377 // - the previous value entered, if it was a ?
378 // - or want the default for the URL if any displayed
379 // - or want SIGNIFICANTLY_MAORI if no default displayed
380 if(qualityLevel.equals("")) { // User just hit enter without other chars
381 if(previouslyQuestionMark) {
382 qualityLevel = "?";
383 } else {
384 qualityLevel = (predefQualityLevel == null) ? SIGNIFICANTLY_MAORI : predefQualityLevel;
385 }
386
387 oldQualityLevel = qualityLevel;
388 }
389 else {
390 // force valid values - will return "" if invalid value
391 qualityLevel = getFullQualityLevelNameUppercased(qualityLevel);
392 }
393
394 // only if qualityLevel entered was invalid, would it now
395 // have been changed to ""
396 if(!qualityLevel.equals("")) {
397 oldQualityLevel = qualityLevel;
398 done = true;
399 } else {
400 System.out.println("@@ UNRECOGNISED. "
401 + String.format(USER_PROMPT, recordCount, url, countryCode));
402 }
403 }
404
405 // Save the CSV record - even if quality level is null
406 // Because we don't want to lose the line that used to exist in the file
407 csvWriter.printRecord(url, countryCode, isReallyInMRI, qualityLevel);
408 csvWriter.flush();
409
410 if(qualityLevel == null) { // if sys.in readLine() was terminated with Ctrl-D
411 terminate = true;
412 System.out.println("--- Got Ctrl-D (Lin)/Ctrl-Z (Win). Terminating. ---");
413 } else {
414 System.out.println("User entered: " + oldQualityLevel);
415
416 }
417 }
418 }
419
420 if(terminate = true) {
421 System.out.println("User entered Ctrl-D (Lin)/Ctrl-Z (Win) - terminating.");
422 }
423
424 } catch(Exception e) {
425 e.printStackTrace();
426 logger.error("Exception occurred when processing CSV file or writing out file:\n"
427 + Utility.getFilePath(tmpOutFile));
428 logger.error(e.getMessage(), e);
429 }
430
431
432 return Utility.getFilePath(tmpOutFile);
433 }
434
435 public String getFullQualityLevelNameUppercased(String qualityLevel) {
436
437 qualityLevel = qualityLevel.toUpperCase();
438
439 if(qualityLevel.equals("N")) {
440 return NAV;
441 } else if(qualityLevel.equals("L")) {
442 return LITTLE_TEXT;
443 } else if(qualityLevel.equals("M")) {
444 return MIXED_TEXT;
445 } else if(qualityLevel.equals("S")) {
446 return SIGNIFICANTLY_MAORI;
447 } else if(qualityLevel.equals("P")) {
448 return MAORI_PARAGRAPHS;
449 } else if(qualityLevel.equals("W")) {
450 return WORDS;
451 } else if(qualityLevel.equals("O")) {
452 return OTHER_LANGUAGES;
453 } else if(qualityLevel.equals("E")) {
454 return POEMS_OR_SONGS;
455 } else if(qualityLevel.equals("I")) {
456 return SINGLE_MRI_SENTENCE;
457 } else if(qualityLevel.equals("T")) {
458 return LINK_TEXT;
459 } else if(qualityLevel.equals(NAV)
460 || qualityLevel.equals(LITTLE_TEXT)
461 || qualityLevel.equals(MIXED_TEXT)
462 || qualityLevel.equals(SIGNIFICANTLY_MAORI)
463 || qualityLevel.equals(MAORI_PARAGRAPHS)
464 || qualityLevel.equals(WORDS)
465 || qualityLevel.equals(OTHER_LANGUAGES)
466 || qualityLevel.equals(POEMS_OR_SONGS)
467 || qualityLevel.equals(SINGLE_MRI_SENTENCE)
468 || qualityLevel.equals(LINK_TEXT)) {
469 return qualityLevel;
470 }
471 return "";
472 }
473
474
475 public void reviewQualityLevelFieldFor(/*String basicDomain,*/ String fieldValue) {
476 final String USER_PROMPT = "Enter qualityLevel value of\n\t? | (N)AV | (L)ITTLE_TEXT | (M)IXED_TEXT | (S)IGNIFICANTLY_MAORI | MAORI_(P)ARAGRAPHS"
477 + "\n\t | LINK_(T)EXT | PO(E)MS_OR_SONGS | S(I)NGLE_MRI_SENTENCE | (W)ORDS | (O)THER_LANGUAGES\n\tfor (%d): %s - %s > ";
478 //"Enter isMRI value of Y|N|? for (" + count + "): " + url + " - " + countryCode + " > ";
479
480 boolean terminate = false;
481 CSVParser parser = null;
482
483 try {
484 parser = CSVParser.parse(webPageURLsCSVFile, java.nio.charset.Charset.forName("US-ASCII"), CSVFormat.RFC4180);
485 } catch(Exception e) {
486 logger.error("Failed to parse input CSV file " + Utility.getFilePath(webPageURLsCSVFile), e);
487 return;
488 }
489
490 try (
491 CSVPrinter csvWriter = new CSVPrinter(new FileWriter(tmpOutFile), CSVFormat.DEFAULT.withQuoteMode(QuoteMode.MINIMAL));
492 ) {
493
494 int recordCount = 0;
495 for (CSVRecord csvRecord : parser) {
496
497 String url = csvRecord.get(URL_COLUMN);
498 if(url.equals("")) { // skip empty lines
499 continue;
500 }
501
502 recordCount++;
503
504 String basicURL = Utility.stripProtocolAndWWWFromURL(Utility.getDomainForURL(url, false));
505 /*
506 if(!basicURL.equals(basicDomain)) {
507 continue; // skip URLs we're not interested in
508 }
509 */
510
511 // Work out default if basic URLs present in defaults map
512 // If it is, use its value as default for this URL
513 //String predefQualityLevel = predefinedDefaultsMap.get(basicURL);
514
515 String countryCode = csvRecord.get(COUNTRY_CODE_COLUMN);
516 String isReallyInMRI = "";
517 String qualityLevel = "";
518
519 if(csvRecord.isSet(IS_REALLY_IN_MRI_COLUMN)) {
520 isReallyInMRI = csvRecord.get(IS_REALLY_IN_MRI_COLUMN);
521 }
522
523 if(csvRecord.isSet(QUALITY_LEVEL_COLUMN)) {
524 qualityLevel = csvRecord.get(QUALITY_LEVEL_COLUMN);
525
526 // Force valid values or ""
527 qualityLevel = getFullQualityLevelNameUppercased(qualityLevel);
528 }
529
530
531 if(terminate || !qualityLevel.equals(fieldValue)
532 /* || basicURL.equals("paekupu.co.nz") // when reviewing MIXED_TEXT */
533 /*|| basicURL.equals("tetaurawhiri.govt.nz") // when reviewing NAV */
534 /*|| basicURL.equals("biblehub.com") || basicURL.equals("m.biblepub.com") // when reviewing SIGNIFICANTLY_MAORI */) {
535 // if(terminate) on Ctrl-D, don't stop processing csv records
536 // Instead, copy remaining records of input csv file into output csv file
537
538 // Similarly, if the qualityLevel field does not have the value we're interested in
539 // then just write it out as-is
540 csvWriter.printRecord(url, countryCode, isReallyInMRI, qualityLevel);
541 csvWriter.flush();
542 logger.info("Got record " + recordCount + ": " + url + " - " + countryCode
543 + " - " + isReallyInMRI + " - " + qualityLevel);
544 }
545 else {
546
547 // First, display full text for web page record with matching url
548 // so the user can look at it to decide whether it is indeed overall in MRI or not.
549 String fulltext = mongodbQueryer.displayFullTextOfPage(url);
550 System.err.println(String.format("\nFULL-TEXT for record %d:\n%s\n", recordCount, fulltext));
551
552 //logger.info("Got record " + recordCount + ": " + url + " - " + countryCode + " - " + qualityLevel);
553
554 // Read Input until Ctrl-D: read System.In as bufferedReader
555 // https://stackoverflow.com/questions/5837823/read-input-until-controld
556 // Ctrl-C is already taken care if, see
557 // https://coderanch.com/t/279136/java/terminated-program-Control-close-open
558 // "Whenever a process is terminated/killed(CTRL-C), the file descriptors are released. You really do not need to close the stream in such cases."
559 // So I just need to flush the csv print writer after every record is written
560 // and Ctrl-C won't lose any of the data thus far entered by the user.
561
562 BufferedReader systemIn = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
563
564 boolean done = false;
565
566 System.out.println(String.format(USER_PROMPT, recordCount, url, countryCode));
567 //if(predefQualityLevel != null) {
568 //System.err.println("\tDefault for this domain: " + predefQualityLevel
569 //+ ". Press Enter to accept >");
570 //}
571
572 if(qualityLevel.equals(fieldValue)) {
573 System.err.println("\t" + fieldValue + " entered last time. Press Enter to keep >");
574 }
575 while(!done && ((qualityLevel = systemIn.readLine()) != null)) {
576 //logger.debug("@@ Got: |" + qualityLevel + "|");
577
578 // If the user hit enter, it means they accepted the previous value entered
579 if(qualityLevel.equals("")) { // User just hit enter without other chars
580 qualityLevel = fieldValue;
581 }
582 else {
583 // force valid values - will return "" if invalid value
584 qualityLevel = getFullQualityLevelNameUppercased(qualityLevel);
585 }
586
587 // only if qualityLevel entered was invalid, would it now
588 // have been changed to ""
589 if(!qualityLevel.equals("")) {
590 done = true;
591 } else {
592 System.out.println("@@ UNRECOGNISED. "
593 + String.format(USER_PROMPT, recordCount, url, countryCode));
594 }
595 }
596
597 // Save the CSV record - even if quality level is null
598 // Because we don't want to lose the line that used to exist in the file
599 csvWriter.printRecord(url, countryCode, isReallyInMRI, qualityLevel);
600 csvWriter.flush();
601
602 if(qualityLevel == null) { // if sys.in readLine() was terminated with Ctrl-D
603 terminate = true;
604 System.out.println("--- Got Ctrl-D (Lin)/Ctrl-Z (Win). Terminating. ---");
605 } else {
606 System.out.println("User entered: " + qualityLevel);
607
608 }
609 }
610 }
611
612 if(terminate = true) {
613 System.out.println("User entered Ctrl-D (Lin)/Ctrl-Z (Win) - terminating.");
614 }
615
616 } catch(Exception e) {
617 e.printStackTrace();
618 logger.error("Exception occurred when processing CSV file or writing out file:\n"
619 + Utility.getFilePath(tmpOutFile));
620 logger.error(e.getMessage(), e);
621 }
622
623 }
624
625 /**
626 * Add 2 new columns to the csv file: num pages in site that are inMRI and total num pages in site.
627 */
628 public void insertTotalsIntoCSVRecords() {
629
630 boolean terminate = false;
631 CSVParser parser = null;
632
633 try {
634 parser = CSVParser.parse(webPageURLsCSVFile, java.nio.charset.Charset.forName("US-ASCII"), CSVFormat.RFC4180);
635 } catch(Exception e) {
636 logger.error("Failed to parse input CSV file " + Utility.getFilePath(webPageURLsCSVFile), e);
637 return;
638 }
639
640 try (
641 CSVPrinter csvWriter = new CSVPrinter(new FileWriter(tmpOutFile), CSVFormat.DEFAULT.withQuoteMode(QuoteMode.MINIMAL));
642 ) {
643
644 int recordCount = 0;
645 for (CSVRecord csvRecord : parser) {
646
647 String url = csvRecord.get(URL_COLUMN);
648 if(url.equals("")) { // skip empty lines
649 continue;
650 }
651
652 recordCount++;
653
654 String basicURL = Utility.stripProtocolAndWWWFromURL(Utility.getDomainForURL(url, false));
655
656 String countryCode = csvRecord.get(COUNTRY_CODE_COLUMN);
657 String isReallyInMRI = "";
658 String qualityLevel = "";
659
660 if(csvRecord.isSet(IS_REALLY_IN_MRI_COLUMN)) {
661 isReallyInMRI = csvRecord.get(IS_REALLY_IN_MRI_COLUMN);
662 }
663
664 if(csvRecord.isSet(QUALITY_LEVEL_COLUMN)) {
665 qualityLevel = csvRecord.get(QUALITY_LEVEL_COLUMN);
666 }
667
668 //COUNT_OF_PAGES_IN_MRI_COLUMN; TOTAL_PAGES_IN_SITE_COLUMN;
669 long countNumPagesInMRI = mongodbQueryer.getFieldTotalForDomainSuffix(
670 basicURL, MongoDBQueryer.FIELD_NUM_PAGES_IN_MRI);
671 long countTotalPages = mongodbQueryer.getFieldTotalForDomainSuffix(
672 basicURL, MongoDBQueryer.FIELD_TOTAL_PAGES);
673
674 logger.info("Got record " + recordCount + ": " + url + " - " + countryCode
675 + " - " + isReallyInMRI + " - " + qualityLevel
676 + " - " + countNumPagesInMRI + " - " + countTotalPages);
677
678 // Save the CSV record into the tmp file with the 2 counts columns
679 csvWriter.printRecord(url, countryCode, isReallyInMRI, qualityLevel,
680 countNumPagesInMRI, countTotalPages);
681 }
682
683 } catch(Exception e) {
684 e.printStackTrace();
685 logger.error("Exception occurred when processing CSV file or writing out file:\n"
686 + Utility.getFilePath(tmpOutFile));
687 logger.error(e.getMessage(), e);
688 }
689
690 }
691
692
693 public static void printUsage() {
694 System.err.println("Usage: ManualURLInspection webPageURLs.txt");
695 }
696
697 /**
698 * If no args are passed in, generates complete containsMRI file listings for NZ and overseas web SITES (domains),
699 * with overseas web sites that have mi (mi.* or *\/mi) in the URL path listed separately.
700 * You can then manually inspect the domains in this listing to shortlist which of these sites are not automatically
701 * translated and really contain at least one webpage containing at least one sentence in MRI.
702 * If a file is passed in containing a list of domains, then this first generates a full listing of all webpages
703 * matching isMRI for each site in the domain list. It then generates a smaller set of random webpages matching
704 * isMRI for the pooled sites in the domain list where the sample size of URLs produced is sufficient for giving
705 * 90% confidence with 5% margin of error for testing binary outcomes, see
706 * https://stats.stackexchange.com/questions/207584/sample-size-choice-with-binary-outcome
707 */
708 public static void main(String args[]) {
709 SafeProcess.DEBUG = 1;
710
711 if(args.length != 1) {
712 printUsage();
713 System.exit(-1);
714 }
715
716
717 try (
718 MongoDBQueryer mongodb = new MongoDBQueryer();
719 ) {
720
721 mongodb.connectToDB();
722
723 // output files will be stored in mongodb-data-auto
724 File outFolder = new File("../mongodb-data-auto/").getAbsoluteFile();
725
726
727 logger.info("*************************************");
728
729
730
731 final File inputFile = new File(args[0]);
732 if(!inputFile.exists()) {
733 logger.info("File " + inputFile + " does not exist");
734 System.exit(-1);
735 }
736
737 final ManualURLInspection inspector = new ManualURLInspection(mongodb, inputFile);
738
739 Runtime.getRuntime().addShutdownHook(new Thread(new Runnable() {
740 public void run() {
741 logger.info("@@@@@@@@@@@@@@@@@@@@@@@@");
742 logger.info("WARNING: If Ctrl-C was pressed, then");
743 logger.info("\tan INCOMPLETE temp CSV file would have been generated at: " +
744 inspector.getCSVOutputFilename());
745 logger.info(String.format("\tSo copy remaining records from input file %s into this file.",
746 Utility.getFilePath(inputFile)));
747 logger.info("@@@@@@@@@@@@@@@@@@@@@@@@");
748 }
749 }));
750
751 //String filename = inspector.processCSV();
752 //String filename = inspector.processCSV_QualityLevelColumn();
753
754
755 //inspector.reviewQualityLevelFieldFor("SINGLE_MRI_SENTENCE");
756
757 inspector.insertTotalsIntoCSVRecords();
758
759 //logger.info("Generated temp CSV file: " + filename);
760 logger.info("*************************************");
761 } catch(Exception e) {
762 logger.error(e.getMessage(), e);
763 }
764 }
765}
Note: See TracBrowser for help on using the repository browser.