Changeset 33946 for other-projects
- Timestamp:
- 2020-02-18T21:58:42+13:00 (4 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
other-projects/maori-lang-detection/src/org/greenstone/atea/ManualURLInspection.java
r33941 r33946 45 45 public static final int COUNTRY_CODE_COLUMN = 1; 46 46 public static final int IS_REALLY_IN_MRI_COLUMN = 2; 47 public static final int QUALITY_LEVEL_COLUMN = 3; 48 49 50 /** Possible values for the Quality Level column of the csv file */ 51 public static final String NAV = "NAV"; 52 public static final String LITTLE_TEXT = "LITTLE_TEXT"; 53 public static final String MIXED_TEXT = "MIXED_TEXT"; 54 public static final String SIGNIFICANTLY_MAORI = "SIGNIFICANTLY_MAORI"; 55 public static final String MAORI_PARAGRAPHS = "MAORI_PARAGRAPHS"; 56 public static final String WORDS = "WORDS"; // words or titles, not full sentences 57 public static final String OTHER_LANGUAGES = "OTHER_LANGUAGES"; 58 public static final String POEMS_OR_SONGS = "POEMS_OR_SONGS"; 59 public static final String SINGLE_MRI_SENTENCE = "SINGLE_MRI_SENTENCE"; 47 60 48 61 … … 119 132 String countryCode = csvRecord.get(COUNTRY_CODE_COLUMN); 120 133 String isReallyInMRI = ""; 134 String qualityLevel = null; 135 121 136 //String isReallyInMRI = csvRecord.get(IS_REALLY_IN_MRI_COLUMN); 122 137 //if(!isReallyInMRI.equals("")) { … … 125 140 } 126 141 142 if(csvRecord.isSet(QUALITY_LEVEL_COLUMN)) { 143 qualityLevel = csvRecord.get(QUALITY_LEVEL_COLUMN); 144 } 145 127 146 if(terminate || (!isReallyInMRI.equals("") && !isReallyInMRI.equals("?"))) { 128 147 // if(terminate) on Ctrl-D, don't stop processing csv records 129 148 // Instead, copy remaining records of input csv file into output csv file 130 149 isReallyInMRI = isReallyInMRI.toUpperCase(); 131 csvWriter.printRecord(url, countryCode, isReallyInMRI); 150 if(qualityLevel == null) { 151 csvWriter.printRecord(url, countryCode, isReallyInMRI); 152 } else { 153 csvWriter.printRecord(url, countryCode, isReallyInMRI, qualityLevel); 154 } 132 155 csvWriter.flush(); 133 156 logger.info("Got record " + recordCount + ": " + url + " - " + countryCode 134 + " - " + isReallyInMRI );157 + " - " + isReallyInMRI + " - " + qualityLevel); 135 158 } 136 
159 else { … … 141 164 System.err.println(String.format("FULL-TEXT for record %d:\n%s\n", recordCount, fulltext)); 142 165 143 //logger.info("Got record " + recordCount + ": " + url + " - " + countryCode );166 //logger.info("Got record " + recordCount + ": " + url + " - " + countryCode + " - " + qualityLevel); 144 167 145 168 // Read Input until Ctrl-D: read System.In as bufferedReader … … 188 211 } 189 212 190 // save the record 191 csvWriter.printRecord(url, countryCode, isReallyInMRI); 213 // Save the CSV record - even if quality level is null 214 // Because we don't want to lose the line that used to exist in the file 215 if(qualityLevel == null) { 216 csvWriter.printRecord(url, countryCode, isReallyInMRI); 217 } else { 218 csvWriter.printRecord(url, countryCode, isReallyInMRI, qualityLevel); 219 } 192 220 csvWriter.flush(); 193 221 194 if(isReallyInMRI == null) { 222 if(isReallyInMRI == null) { // if sys.in readLine() was terminated with Ctrl-D 195 223 terminate = true; 196 224 System.out.println("User entered Ctrl-D (Lin)/Ctrl-Z (Win) - terminating."); … … 214 242 } 215 243 216 244 /** 245 * Similar to processCSV() above, but for entering the page quality level of each web page 246 * This goes into the QUALITY_LEVEL_COLUMN column of the csv file. 247 * Web pages from some web sites commonly recurring in the csv input file tend to be largely 248 * navigation menus, so preset to NAV. Others are known to be low quality for text resources 249 * as they only have nav menus and pictures despite these being largely in Māori, 250 * which can also go under NAV. 251 * Other web sites have little text overall whether Māori or mixed with English, nav included, 252 * (LITTLE_TEXT), or significantly mixed (MRI+ENG/...) text even if a decent amount of text 253 * (MIXED_TEXT). Some sites may largely have standalone words for learning (WORDS). 
254 * Other than known websites that have regular content of one of the above types, 255 * the user can enter these values for rarer websites whose web pages may pop up: 256 * NAV, LITTLE_TEXT, MIXED_TEXT, WORDS, SIGNIFICANTLY_MAORI (for decent amounts of MRI text) 257 * MAORI_PARAGRAPHS (for largely continuous paras in MRI even if there are paras in other 258 * langs) and OTHER_LANGUAGES if text not in MRI but mostly in other language, 259 * POEMS_OR_SONGS for content that's largely songs or poetry. 260 */ 261 public String processCSV_QualityLevelColumn() { 262 263 Map<String, String> predefinedDefaultsMap = new HashMap<String, String>(); 264 predefinedDefaultsMap.put("tetaurawhiri.govt.nz", NAV); 265 predefinedDefaultsMap.put("tmoa.tki.org.nz", SIGNIFICANTLY_MAORI); 266 predefinedDefaultsMap.put("paekupu.co.nz", MIXED_TEXT); // html is mixed, but display is more MRI 267 predefinedDefaultsMap.put("m.biblepub.com", SIGNIFICANTLY_MAORI); 268 predefinedDefaultsMap.put("biblehub.com", SIGNIFICANTLY_MAORI); 269 predefinedDefaultsMap.put("pukoro.co.nz", WORDS); 270 predefinedDefaultsMap.put("mi.wikipedia.org", MIXED_TEXT); 271 predefinedDefaultsMap.put("mi.m.wikipedia.org", WORDS); 272 predefinedDefaultsMap.put("tkkmmokopuna.school.nz", NAV); 273 predefinedDefaultsMap.put("twtop.school.nz", NAV); 274 predefinedDefaultsMap.put("animations.tewhanake.maori.nz", MAORI_PARAGRAPHS); 275 predefinedDefaultsMap.put("csunplugged.org", SIGNIFICANTLY_MAORI); 276 predefinedDefaultsMap.put("waiata.maori.nz", POEMS_OR_SONGS); 277 278 final String USER_PROMPT = "Enter qualityLevel value of\n\t? | (N)AV | (L)ITTLE_TEXT | (M)IXED_TEXT | (S)IGNIFICANTLY_MAORI | MAORI_(P)ARAGRAPHS" 279 + "\n\t | PO(E)MS_OR_SONGS | S(I)NGLE_MRI_SENTENCE | (W)ORDS | (O)THER_LANGUAGES\n\tfor (%d): %s - %s > "; 280 //"Enter isMRI value of Y|N|? 
for (" + count + "): " + url + " - " + countryCode + " > "; 281 282 boolean terminate = false; 283 CSVParser parser = null; 284 285 try { 286 parser = CSVParser.parse(webPageURLsCSVFile, java.nio.charset.Charset.forName("US-ASCII"), CSVFormat.RFC4180); 287 } catch(Exception e) { 288 logger.error("Failed to parse input CSV file " + Utility.getFilePath(webPageURLsCSVFile), e); 289 return "Failed"; 290 } 291 292 try ( 293 CSVPrinter csvWriter = new CSVPrinter(new FileWriter(tmpOutFile), CSVFormat.DEFAULT.withQuoteMode(QuoteMode.MINIMAL)); 294 ) { 295 296 int recordCount = 0; 297 for (CSVRecord csvRecord : parser) { 298 //if(terminate) condition handled further below 299 300 //logger.debug("Got record: " + csvRecord.toString()); 301 302 String url = csvRecord.get(URL_COLUMN); 303 if(url.equals("")) { // skip empty lines 304 continue; 305 } 306 307 recordCount++; 308 String countryCode = csvRecord.get(COUNTRY_CODE_COLUMN); 309 String isReallyInMRI = ""; 310 String qualityLevel = ""; 311 312 if(csvRecord.isSet(IS_REALLY_IN_MRI_COLUMN)) { 313 isReallyInMRI = csvRecord.get(IS_REALLY_IN_MRI_COLUMN); 314 } 315 316 if(csvRecord.isSet(QUALITY_LEVEL_COLUMN)) { 317 qualityLevel = csvRecord.get(QUALITY_LEVEL_COLUMN); 318 319 /* 320 qualityLevel = qualityLevel.toUpperCase(); 321 322 if(qualityLevel.equals("N")) { 323 qualityLevel = NAV; 324 } else if(qualityLevel.equals("L")) { 325 qualityLevel = LITTLE_TEXT; 326 } else if(qualityLevel.equals("M")) { 327 qualityLevel = MIXED_TEXT; 328 } else if(qualityLevel.equals("P")) { 329 qualityLevel = MAORI_PARAGRAPHS; 330 } else if(qualityLevel.equals("S")) { 331 qualityLevel = SIGNIFICANTLY_MAORI; 332 } else if(qualityLevel.equals("W")) { 333 qualityLevel = WORDS; 334 } else if(qualityLevel.equals("O")) { 335 qualityLevel = OTHER_LANGUAGE; 336 } else if(qualityLevel.equals("E")) { 337 qualityLevel = POEMS_OR_SONGS; 338 } else if(qualityLevel.equals("I")) { 339 qualityLevel = SINGLE_MRI_SENTENCE; 340 } 341 // else remains at whatever was 
already in the file or 342 // else "" if no qualityLevel column for this record present in the file yet 343 */ 344 345 // Force valid values or "" 346 qualityLevel = getFullQualityLevelNameUppercased(qualityLevel); 347 } 348 349 if(terminate || (!qualityLevel.equals("") && !qualityLevel.equals("?"))) { 350 // if(terminate) on Ctrl-D, don't stop processing csv records 351 // Instead, copy remaining records of input csv file into output csv file 352 353 csvWriter.printRecord(url, countryCode, isReallyInMRI, qualityLevel); 354 csvWriter.flush(); 355 logger.info("Got record " + recordCount + ": " + url + " - " + countryCode 356 + " - " + isReallyInMRI + " - " + qualityLevel); 357 } 358 else { 359 360 // First, display full text for web page record with matching url 361 // so the user can look at it to decide whether it is indeed overall in MRI or not. 362 String fulltext = mongodbQueryer.displayFullTextOfPage(url); 363 System.err.println(String.format("\nFULL-TEXT for record %d:\n%s\n", recordCount, fulltext)); 364 365 //logger.info("Got record " + recordCount + ": " + url + " - " + countryCode + " - " + qualityLevel); 366 367 // Read Input until Ctrl-D: read System.In as bufferedReader 368 // https://stackoverflow.com/questions/5837823/read-input-until-controld 369 // Ctrl-C is already taken care of, see 370 // https://coderanch.com/t/279136/java/terminated-program-Control-close-open 371 // "Whenever a process is terminated/killed(CTRL-C), the file descriptors are released. You really do not need to close the stream in such cases." 372 // So I just need to flush the csv print writer after every record is written 373 // and Ctrl-C won't lose any of the data thus far entered by the user. 
374 375 BufferedReader systemIn = new BufferedReader(new InputStreamReader(System.in, "UTF-8")); 376 377 boolean done = false; 378 379 // Work out default if basic URLs present in defaults map 380 // If it is, use its value as default for this URL 381 String basicURL = Utility.stripProtocolAndWWWFromURL(Utility.getDomainForURL(url, false)); 382 String predefQualityLevel = predefinedDefaultsMap.get(basicURL); 383 384 System.out.println(String.format(USER_PROMPT, recordCount, url, countryCode)); 385 if(predefQualityLevel != null) { 386 System.err.println("\tDefault for this domain: " + predefQualityLevel 387 + ". Press Enter to accept >"); 388 } 389 390 boolean previouslyQuestionMark = false; 391 String oldQualityLevel = qualityLevel; 392 393 if(qualityLevel.equals("?")) { 394 previouslyQuestionMark = true; 395 System.err.println("\t? entered last time. Press Enter to keep >"); 396 } 397 while(!done && ((qualityLevel = systemIn.readLine()) != null)) { 398 //logger.debug("@@ Got: |" + qualityLevel + "|"); 399 400 // If the user hit enter, it means they accepted 401 // - the previous value entered, if it was a ? 402 // - or want the default for the URL if any displayed 403 // - or want SIGNIFICANTLY_MAORI if no default displayed 404 if(qualityLevel.equals("")) { // User just hit enter without other chars 405 if(previouslyQuestionMark) { 406 qualityLevel = "?"; 407 } else { 408 qualityLevel = (predefQualityLevel == null) ? SIGNIFICANTLY_MAORI : predefQualityLevel; 409 } 410 411 oldQualityLevel = qualityLevel; 412 } 413 else { 414 // force valid values - will return "" if invalid value 415 qualityLevel = getFullQualityLevelNameUppercased(qualityLevel); 416 } 417 418 // only if qualityLevel entered was invalid, would it now 419 // have been changed to "" 420 if(!qualityLevel.equals("")) { 421 oldQualityLevel = qualityLevel; 422 done = true; 423 } else { 424 System.out.println("@@ UNRECOGNISED. 
" 425 + String.format(USER_PROMPT, recordCount, url, countryCode)); 426 } 427 } 428 429 // Save the CSV record - even if quality level is null 430 // Because we don't want to lose the line that used to exist in the file 431 csvWriter.printRecord(url, countryCode, isReallyInMRI, qualityLevel); 432 csvWriter.flush(); 433 434 if(qualityLevel == null) { // if sys.in readLine() was terminated with Ctrl-D 435 terminate = true; 436 System.out.println("--- Got Ctrl-D (Lin)/Ctrl-Z (Win). Terminating. ---"); 437 } else { 438 System.out.println("User entered: " + oldQualityLevel); 439 440 } 441 } 442 } 443 444 if(terminate = true) { 445 System.out.println("User entered Ctrl-D (Lin)/Ctrl-Z (Win) - terminating."); 446 } 447 448 } catch(Exception e) { 449 e.printStackTrace(); 450 logger.error("Exception occurred when processing CSV file or writing out file:\n" 451 + Utility.getFilePath(tmpOutFile)); 452 logger.error(e.getMessage(), e); 453 } 454 455 456 return Utility.getFilePath(tmpOutFile); 457 } 458 459 public String getFullQualityLevelNameUppercased(String qualityLevel) { 460 461 qualityLevel = qualityLevel.toUpperCase(); 462 463 if(qualityLevel.equals("N")) { 464 return NAV; 465 } else if(qualityLevel.equals("L")) { 466 return LITTLE_TEXT; 467 } else if(qualityLevel.equals("M")) { 468 return MIXED_TEXT; 469 } else if(qualityLevel.equals("S")) { 470 return SIGNIFICANTLY_MAORI; 471 } else if(qualityLevel.equals("P")) { 472 return MAORI_PARAGRAPHS; 473 } else if(qualityLevel.equals("W")) { 474 return WORDS; 475 } else if(qualityLevel.equals("O")) { 476 return OTHER_LANGUAGES; 477 } else if(qualityLevel.equals("E")) { 478 return POEMS_OR_SONGS; 479 } else if(qualityLevel.equals("I")) { 480 return SINGLE_MRI_SENTENCE; 481 } else if(qualityLevel.equals(NAV) 482 || qualityLevel.equals(LITTLE_TEXT) 483 || qualityLevel.equals(MIXED_TEXT) 484 || qualityLevel.equals(SIGNIFICANTLY_MAORI) 485 || qualityLevel.equals(MAORI_PARAGRAPHS) 486 || qualityLevel.equals(WORDS) 487 || 
qualityLevel.equals(OTHER_LANGUAGES) 488 || qualityLevel.equals(POEMS_OR_SONGS) 489 || qualityLevel.equals(SINGLE_MRI_SENTENCE)) { 490 return qualityLevel; 491 } 492 return ""; 493 } 217 494 218 495 public static void printUsage() { … … 266 543 public void run() { 267 544 logger.info("@@@@@@@@@@@@@@@@@@@@@@@@"); 268 logger.info("WARNING !!!");269 logger.info(" Got Ctrl-C. INCOMPLETE generated temp CSV file: " +545 logger.info("WARNING: If Ctrl-C was pressed, then"); 546 logger.info("\tan INCOMPLETE temp CSV file would have been generated at: " + 270 547 inspector.getCSVOutputFilename()); 271 logger.info(String.format(" Copy remaining records from input file %s into this file.",548 logger.info(String.format("\tSo copy remaining records from input file %s into this file.", 272 549 Utility.getFilePath(inputFile))); 273 550 logger.info("@@@@@@@@@@@@@@@@@@@@@@@@"); 274 551 } 275 552 })); 276 277 String filename = inspector.processCSV(); 553 554 //String filename = inspector.processCSV(); 555 String filename = inspector.processCSV_QualityLevelColumn(); 556 278 557 279 558 logger.info("Generated temp CSV file: " + filename);
Note:
See TracChangeset
for help on using the changeset viewer.