package org.greenstone.atea; import java.util.*; import java.io.*; import org.apache.commons.csv.*; import org.apache.log4j.Logger; import org.greenstone.util.SafeProcess; /** * Program to help going through the n number of random sample web page URLs stored in input * csv file, to eyeball whether the full text (stored in mongodb for each) is indeed inMRI * or not. User can enter Y|N|? and ctr-D or ctrl-C to continue working on this later. * The output file is the input filename + .tmp suffix. * When user continues later, the output file from last time must be used as input file. * Any csv records not completed earlier or with ? entered will be presented for input * on (re-)running this program. * * TO COMPILE OR RUN, FIRST DO: * cd maori-lang-detection/apache-opennlp-1.9.1 * export OPENNLP_HOME=`pwd` * cd maori-lang-detection/src * * TO COMPILE: * maori-lang-detection/src$ * javac -cp ".:../conf:../lib/*" org/greenstone/atea/ManualURLInspection.java * * TO RUN: * maori-lang-detection/src$ * java -cp ".:../conf:../lib/*" org/greenstone/atea/ManualURLInspection ../mongodb-data/random260_manualList_globalDomains_whereAPageContainsMRI.txt * */ public class ManualURLInspection { static Logger logger = Logger.getLogger(org.greenstone.atea.ManualURLInspection.class.getName()); private final MongoDBQueryer mongodbQueryer; private final File outFolder; private final File webPageURLsCSVFile; private final File tmpOutFile; /** csv column numbers */ public static final int URL_COLUMN = 0; public static final int COUNTRY_CODE_COLUMN = 1; public static final int IS_REALLY_IN_MRI_COLUMN = 2; public static final int QUALITY_LEVEL_COLUMN = 3; public static final int COUNT_OF_PAGES_IN_MRI_COLUMN = 4; // count as detected by OpenNLP public static final int TOTAL_PAGES_IN_SITE_COLUMN = 5; /** Possible values for the Quality Level column of the csv file */ public static final String NAV = "NAV"; public static final String LITTLE_TEXT = "LITTLE_TEXT"; public static final String MIXED_TEXT = "MIXED_TEXT"; public static final String SIGNIFICANTLY_MAORI = "SIGNIFICANTLY_MAORI"; public static final String MAORI_PARAGRAPHS = "MAORI_PARAGRAPHS"; public static final String WORDS = "WORDS"; // words or titles, not full sentences public static final String OTHER_LANGUAGES = "OTHER_LANGUAGES"; public static final String POEMS_OR_SONGS = "POEMS_OR_SONGS"; public static final String SINGLE_MRI_SENTENCE = "SINGLE_MRI_SENTENCE"; // TODO: REVIEW public static final String LINK_TEXT = "LINK_TEXT"; // for office positions designations and link text public ManualURLInspection(MongoDBQueryer mongodbQueryer, File csvFile) { this.mongodbQueryer = mongodbQueryer; this.webPageURLsCSVFile = csvFile; this.outFolder = csvFile.getParentFile(); String tmpFilename = Utility.getFilePath(webPageURLsCSVFile); this.tmpOutFile = new File(tmpFilename+".tmp"); } public String getCSVOutputFilename() { return (tmpOutFile == null) ? "" : Utility.getFilePath(tmpOutFile); } /** * Read .csv input file one line at a time. * For each line, * - if empty line empty, skip it. * - If the 3rd column of line is already filled in with Y|N, write out identical line * into tmp output file. * - If third column contains ? or if 3rd column is empty, run a MongoDBQuery to get * the full text of the page and display it on screen. * Wait for user input. * - If Enter hit or Y input, write out Y in 3rd field of line into tmp file. * - If N or ? entered, write out N/? as 3rd field. * * Loop through input csv until finished or until Ctrl-C or Ctrl-D pressed. * Ctrl-D here means end of all user interaction, signalling user wants * to stop entering data and continue later. * * When finished or Ctrl-D entered or Ctrl C pressed, all data entered must have been written * out. So to avoid losing data on Ctrl-Ck, write out each processed csv record (whether * already complete or whether user entry made it complete) and flush writer. * When program terminates in any manner, print message that the file has been created. */ public String processCSV() { final String USER_PROMPT = "Enter isMRI value of Y|N|? for (%d): %s - %s > "; //"Enter isMRI value of Y|N|? for (" + count + "): " + url + " - " + countryCode + " > "; boolean terminate = false; CSVParser parser = null; try { parser = CSVParser.parse(webPageURLsCSVFile, java.nio.charset.Charset.forName("US-ASCII"), CSVFormat.RFC4180); } catch(Exception e) { logger.error("Failed to parse input CSV file " + Utility.getFilePath(webPageURLsCSVFile), e); return "Failed"; } try ( //BufferedWriter writer = new BufferedWriter(new FileWriter(tmpOutFile)); CSVPrinter csvWriter = new CSVPrinter(new FileWriter(tmpOutFile), CSVFormat.DEFAULT.withQuoteMode(QuoteMode.MINIMAL)); ) { int recordCount = 0; for (CSVRecord csvRecord : parser) { //if(terminate) condition handled further below //logger.debug("Got record: " + csvRecord.toString()); //int recordNo = csvRecord.RecordNumber(); // will count empty lines! //if(csvRecord.size() != 0) { String url = csvRecord.get(URL_COLUMN); if(url.equals("")) { // skip empty lines continue; } recordCount++; String countryCode = csvRecord.get(COUNTRY_CODE_COLUMN); String isReallyInMRI = ""; String qualityLevel = null; //String isReallyInMRI = csvRecord.get(IS_REALLY_IN_MRI_COLUMN); //if(!isReallyInMRI.equals("")) { if(csvRecord.isSet(IS_REALLY_IN_MRI_COLUMN)) { isReallyInMRI = csvRecord.get(IS_REALLY_IN_MRI_COLUMN); } if(csvRecord.isSet(QUALITY_LEVEL_COLUMN)) { qualityLevel = csvRecord.get(QUALITY_LEVEL_COLUMN); } if(terminate || (!isReallyInMRI.equals("") && !isReallyInMRI.equals("?"))) { // if(terminate) on Ctrl-D, don't stop processing csv records // Instead, copy remaining records of input csv file into output csv file isReallyInMRI = isReallyInMRI.toUpperCase(); if(qualityLevel == null) { csvWriter.printRecord(url, countryCode, isReallyInMRI); } else { csvWriter.printRecord(url, countryCode, isReallyInMRI, qualityLevel); } csvWriter.flush(); logger.info("Got record " + recordCount + ": " + url + " - " + countryCode + " - " + isReallyInMRI + " - " + qualityLevel); } else { // First, display full text for web page record with matching url // so the user can look at it to decide whether it is indeed overall in MRI or not. String fulltext = mongodbQueryer.displayFullTextOfPage(url); System.err.println(String.format("FULL-TEXT for record %d:\n%s\n", recordCount, fulltext)); //logger.info("Got record " + recordCount + ": " + url + " - " + countryCode + " - " + qualityLevel); // Read Input until Ctrl-D: read System.In as bufferedReader // https://stackoverflow.com/questions/5837823/read-input-until-controld // Ctrl-C is already taken care if, see // https://coderanch.com/t/279136/java/terminated-program-Control-close-open // "Whenever a process is terminated/killed(CTRL-C), the file descriptors are released. You really do not need to close the stream in such cases." // So I just need to flush the csv print writer after every record is written // and Ctrl-C won't lose any of the data thus far entered by the user. BufferedReader systemIn = new BufferedReader(new InputStreamReader(System.in, "UTF-8")); boolean done = false; System.out.println(String.format(USER_PROMPT, recordCount, url, countryCode)); boolean previouslyQuestionMark = false; if(isReallyInMRI.equals("?")) { previouslyQuestionMark = true; System.err.println("\t? entered last time"); } while(!done && ((isReallyInMRI = systemIn.readLine()) != null)) { isReallyInMRI = isReallyInMRI.toUpperCase(); //logger.debug("@@ Got: |" + isReallyInMRI + "|"); // if user hit enter, it means they accepted // - that the full text displayed is really in MRI: Y // - the previous value entered if it was a ? if(isReallyInMRI.equals("")) { if(previouslyQuestionMark) { isReallyInMRI = "?"; } else { isReallyInMRI = "Y"; } } if(isReallyInMRI.equals("Y") || isReallyInMRI.equals("N") || isReallyInMRI.equals("?")) { done = true; //break; } else { System.out.println("@@ UNRECOGNISED. " + String.format(USER_PROMPT, recordCount, url, countryCode)); } } // Save the CSV record - even if quality level is null // Because we don't want to lose the line that used to exist in the file if(qualityLevel == null) { csvWriter.printRecord(url, countryCode, isReallyInMRI); } else { csvWriter.printRecord(url, countryCode, isReallyInMRI, qualityLevel); } csvWriter.flush(); if(isReallyInMRI == null) { // if sys.in readLine() was terminated with Ctrl-D terminate = true; System.out.println("User entered Ctrl-D (Lin)/Ctrl-Z (Win) - terminating."); } else { System.out.println("User entered: " + isReallyInMRI); } } } //} } catch(Exception e) { e.printStackTrace(); logger.error("Exception occurred when processing CSV file or writing out file:\n" + Utility.getFilePath(tmpOutFile)); logger.error(e.getMessage(), e); } //return urlsList; return Utility.getFilePath(tmpOutFile); } /** * Similar to processCSV() above, but for entering the page quality level of each web page * This goes into the QUALITY_LEVEL_COLUMN column of the csv file. * Web pages from some web sites commonly recurring in the csv input file tend to be largely * navigation menus, so preset to NAV. Others are known to be low quality for text resources * as they only have nav menus and pictures despite these being largely in Māori, * which can also go under NAV. * Other web sites have little text overall whether Māori or mixed with English, nav included, * (LITTLE_TEXT), or significantly mixed (MRI+ENG/...) text even if a decent amount of text * (MIXED_TEXT). Some sites may largely have standalone words for learning (WORDS). * Other than known websites that have regular content of one of the above types, * the user can enter these values for rarer websites whose web pages may pop up: * NAV, LITTLE_TEXT, MIXED_TEXT, WORDS, SIGNIFICANTLY_MAORI (for decent amounts of MRI text) * MAORI_PARAGRAPHS (for largely continuous paras in MRI even if there are paras in other * langs) and OTHER_LANGUAGES if text not in MRI but mostly in other language, * POEMS_OR_SONGS for content that's largely songs or poetry. */ public String processCSV_QualityLevelColumn() { Map predefinedDefaultsMap = new HashMap(); predefinedDefaultsMap.put("tetaurawhiri.govt.nz", NAV); predefinedDefaultsMap.put("tmoa.tki.org.nz", SIGNIFICANTLY_MAORI); predefinedDefaultsMap.put("paekupu.co.nz", MIXED_TEXT); // html is mixed, but display is more MRI predefinedDefaultsMap.put("m.biblepub.com", SIGNIFICANTLY_MAORI); predefinedDefaultsMap.put("biblehub.com", SIGNIFICANTLY_MAORI); predefinedDefaultsMap.put("pukoro.co.nz", WORDS); predefinedDefaultsMap.put("mi.wikipedia.org", MIXED_TEXT); predefinedDefaultsMap.put("mi.m.wikipedia.org", WORDS); predefinedDefaultsMap.put("tkkmmokopuna.school.nz", NAV); predefinedDefaultsMap.put("twtop.school.nz", NAV); predefinedDefaultsMap.put("animations.tewhanake.maori.nz", MAORI_PARAGRAPHS); predefinedDefaultsMap.put("csunplugged.org", SIGNIFICANTLY_MAORI); predefinedDefaultsMap.put("waiata.maori.nz", POEMS_OR_SONGS); final String USER_PROMPT = "Enter qualityLevel value of\n\t? | (N)AV | (L)ITTLE_TEXT | (M)IXED_TEXT | (S)IGNIFICANTLY_MAORI | MAORI_(P)ARAGRAPHS" + "\n\t | LINK_(T)EXT | PO(E)MS_OR_SONGS | S(I)NGLE_MRI_SENTENCE | (W)ORDS | (O)THER_LANGUAGES\n\tfor (%d): %s - %s > "; //"Enter isMRI value of Y|N|? for (" + count + "): " + url + " - " + countryCode + " > "; boolean terminate = false; CSVParser parser = null; try { parser = CSVParser.parse(webPageURLsCSVFile, java.nio.charset.Charset.forName("US-ASCII"), CSVFormat.RFC4180); } catch(Exception e) { logger.error("Failed to parse input CSV file " + Utility.getFilePath(webPageURLsCSVFile), e); return "Failed"; } try ( CSVPrinter csvWriter = new CSVPrinter(new FileWriter(tmpOutFile), CSVFormat.DEFAULT.withQuoteMode(QuoteMode.MINIMAL)); ) { int recordCount = 0; for (CSVRecord csvRecord : parser) { //if(terminate) condition handled further below //logger.debug("Got record: " + csvRecord.toString()); String url = csvRecord.get(URL_COLUMN); if(url.equals("")) { // skip empty lines continue; } recordCount++; String countryCode = csvRecord.get(COUNTRY_CODE_COLUMN); String isReallyInMRI = ""; String qualityLevel = ""; if(csvRecord.isSet(IS_REALLY_IN_MRI_COLUMN)) { isReallyInMRI = csvRecord.get(IS_REALLY_IN_MRI_COLUMN); } if(csvRecord.isSet(QUALITY_LEVEL_COLUMN)) { qualityLevel = csvRecord.get(QUALITY_LEVEL_COLUMN); // Force valid values or "" qualityLevel = getFullQualityLevelNameUppercased(qualityLevel); } if(terminate || (!qualityLevel.equals("") && !qualityLevel.equals("?"))) { // if(terminate) on Ctrl-D, don't stop processing csv records // Instead, copy remaining records of input csv file into output csv file csvWriter.printRecord(url, countryCode, isReallyInMRI, qualityLevel); csvWriter.flush(); logger.info("Got record " + recordCount + ": " + url + " - " + countryCode + " - " + isReallyInMRI + " - " + qualityLevel); } else { // First, display full text for web page record with matching url // so the user can look at it to decide whether it is indeed overall in MRI or not. String fulltext = mongodbQueryer.displayFullTextOfPage(url); System.err.println(String.format("\nFULL-TEXT for record %d:\n%s\n", recordCount, fulltext)); //logger.info("Got record " + recordCount + ": " + url + " - " + countryCode + " - " + qualityLevel); // Read Input until Ctrl-D: read System.In as bufferedReader // https://stackoverflow.com/questions/5837823/read-input-until-controld // Ctrl-C is already taken care if, see // https://coderanch.com/t/279136/java/terminated-program-Control-close-open // "Whenever a process is terminated/killed(CTRL-C), the file descriptors are released. You really do not need to close the stream in such cases." // So I just need to flush the csv print writer after every record is written // and Ctrl-C won't lose any of the data thus far entered by the user. BufferedReader systemIn = new BufferedReader(new InputStreamReader(System.in, "UTF-8")); boolean done = false; // Work out default if basic URLs present in defaults map // If it is, use its value as default for this URL String basicURL = Utility.stripProtocolAndWWWFromURL(Utility.getDomainForURL(url, false)); String predefQualityLevel = predefinedDefaultsMap.get(basicURL); System.out.println(String.format(USER_PROMPT, recordCount, url, countryCode)); if(predefQualityLevel != null) { System.err.println("\tDefault for this domain: " + predefQualityLevel + ". Press Enter to accept >"); } boolean previouslyQuestionMark = false; String oldQualityLevel = qualityLevel; if(qualityLevel.equals("?")) { previouslyQuestionMark = true; System.err.println("\t? entered last time. Press Enter to keep >"); } while(!done && ((qualityLevel = systemIn.readLine()) != null)) { //logger.debug("@@ Got: |" + qualityLevel + "|"); // If the user hit enter, it means they accepted // - the previous value entered, if it was a ? // - or want the default for the URL if any displayed // - or want SIGNIFICANTLY_MAORI if no default displayed if(qualityLevel.equals("")) { // User just hit enter without other chars if(previouslyQuestionMark) { qualityLevel = "?"; } else { qualityLevel = (predefQualityLevel == null) ? SIGNIFICANTLY_MAORI : predefQualityLevel; } oldQualityLevel = qualityLevel; } else { // force valid values - will return "" if invalid value qualityLevel = getFullQualityLevelNameUppercased(qualityLevel); } // only if qualityLevel entered was invalid, would it now // have been changed to "" if(!qualityLevel.equals("")) { oldQualityLevel = qualityLevel; done = true; } else { System.out.println("@@ UNRECOGNISED. " + String.format(USER_PROMPT, recordCount, url, countryCode)); } } // Save the CSV record - even if quality level is null // Because we don't want to lose the line that used to exist in the file csvWriter.printRecord(url, countryCode, isReallyInMRI, qualityLevel); csvWriter.flush(); if(qualityLevel == null) { // if sys.in readLine() was terminated with Ctrl-D terminate = true; System.out.println("--- Got Ctrl-D (Lin)/Ctrl-Z (Win). Terminating. ---"); } else { System.out.println("User entered: " + oldQualityLevel); } } } if(terminate = true) { System.out.println("User entered Ctrl-D (Lin)/Ctrl-Z (Win) - terminating."); } } catch(Exception e) { e.printStackTrace(); logger.error("Exception occurred when processing CSV file or writing out file:\n" + Utility.getFilePath(tmpOutFile)); logger.error(e.getMessage(), e); } return Utility.getFilePath(tmpOutFile); } public String getFullQualityLevelNameUppercased(String qualityLevel) { qualityLevel = qualityLevel.toUpperCase(); if(qualityLevel.equals("N")) { return NAV; } else if(qualityLevel.equals("L")) { return LITTLE_TEXT; } else if(qualityLevel.equals("M")) { return MIXED_TEXT; } else if(qualityLevel.equals("S")) { return SIGNIFICANTLY_MAORI; } else if(qualityLevel.equals("P")) { return MAORI_PARAGRAPHS; } else if(qualityLevel.equals("W")) { return WORDS; } else if(qualityLevel.equals("O")) { return OTHER_LANGUAGES; } else if(qualityLevel.equals("E")) { return POEMS_OR_SONGS; } else if(qualityLevel.equals("I")) { return SINGLE_MRI_SENTENCE; } else if(qualityLevel.equals("T")) { return LINK_TEXT; } else if(qualityLevel.equals(NAV) || qualityLevel.equals(LITTLE_TEXT) || qualityLevel.equals(MIXED_TEXT) || qualityLevel.equals(SIGNIFICANTLY_MAORI) || qualityLevel.equals(MAORI_PARAGRAPHS) || qualityLevel.equals(WORDS) || qualityLevel.equals(OTHER_LANGUAGES) || qualityLevel.equals(POEMS_OR_SONGS) || qualityLevel.equals(SINGLE_MRI_SENTENCE) || qualityLevel.equals(LINK_TEXT)) { return qualityLevel; } return ""; } public void reviewQualityLevelFieldFor(/*String basicDomain,*/ String fieldValue) { final String USER_PROMPT = "Enter qualityLevel value of\n\t? | (N)AV | (L)ITTLE_TEXT | (M)IXED_TEXT | (S)IGNIFICANTLY_MAORI | MAORI_(P)ARAGRAPHS" + "\n\t | LINK_(T)EXT | PO(E)MS_OR_SONGS | S(I)NGLE_MRI_SENTENCE | (W)ORDS | (O)THER_LANGUAGES\n\tfor (%d): %s - %s > "; //"Enter isMRI value of Y|N|? for (" + count + "): " + url + " - " + countryCode + " > "; boolean terminate = false; CSVParser parser = null; try { parser = CSVParser.parse(webPageURLsCSVFile, java.nio.charset.Charset.forName("US-ASCII"), CSVFormat.RFC4180); } catch(Exception e) { logger.error("Failed to parse input CSV file " + Utility.getFilePath(webPageURLsCSVFile), e); return; } try ( CSVPrinter csvWriter = new CSVPrinter(new FileWriter(tmpOutFile), CSVFormat.DEFAULT.withQuoteMode(QuoteMode.MINIMAL)); ) { int recordCount = 0; for (CSVRecord csvRecord : parser) { String url = csvRecord.get(URL_COLUMN); if(url.equals("")) { // skip empty lines continue; } recordCount++; String basicURL = Utility.stripProtocolAndWWWFromURL(Utility.getDomainForURL(url, false)); /* if(!basicURL.equals(basicDomain)) { continue; // skip URLs we're not interested in } */ // Work out default if basic URLs present in defaults map // If it is, use its value as default for this URL //String predefQualityLevel = predefinedDefaultsMap.get(basicURL); String countryCode = csvRecord.get(COUNTRY_CODE_COLUMN); String isReallyInMRI = ""; String qualityLevel = ""; if(csvRecord.isSet(IS_REALLY_IN_MRI_COLUMN)) { isReallyInMRI = csvRecord.get(IS_REALLY_IN_MRI_COLUMN); } if(csvRecord.isSet(QUALITY_LEVEL_COLUMN)) { qualityLevel = csvRecord.get(QUALITY_LEVEL_COLUMN); // Force valid values or "" qualityLevel = getFullQualityLevelNameUppercased(qualityLevel); } if(terminate || !qualityLevel.equals(fieldValue) /* || basicURL.equals("paekupu.co.nz") // when reviewing MIXED_TEXT */ /*|| basicURL.equals("tetaurawhiri.govt.nz") // when reviewing NAV */ /*|| basicURL.equals("biblehub.com") || basicURL.equals("m.biblepub.com") // when reviewing SIGNIFICANTLY_MAORI */) { // if(terminate) on Ctrl-D, don't stop processing csv records // Instead, copy remaining records of input csv file into output csv file // Similarly, if the qualityLevel field does not have the value we're interested in // then just write it out as-is csvWriter.printRecord(url, countryCode, isReallyInMRI, qualityLevel); csvWriter.flush(); logger.info("Got record " + recordCount + ": " + url + " - " + countryCode + " - " + isReallyInMRI + " - " + qualityLevel); } else { // First, display full text for web page record with matching url // so the user can look at it to decide whether it is indeed overall in MRI or not. String fulltext = mongodbQueryer.displayFullTextOfPage(url); System.err.println(String.format("\nFULL-TEXT for record %d:\n%s\n", recordCount, fulltext)); //logger.info("Got record " + recordCount + ": " + url + " - " + countryCode + " - " + qualityLevel); // Read Input until Ctrl-D: read System.In as bufferedReader // https://stackoverflow.com/questions/5837823/read-input-until-controld // Ctrl-C is already taken care if, see // https://coderanch.com/t/279136/java/terminated-program-Control-close-open // "Whenever a process is terminated/killed(CTRL-C), the file descriptors are released. You really do not need to close the stream in such cases." // So I just need to flush the csv print writer after every record is written // and Ctrl-C won't lose any of the data thus far entered by the user. BufferedReader systemIn = new BufferedReader(new InputStreamReader(System.in, "UTF-8")); boolean done = false; System.out.println(String.format(USER_PROMPT, recordCount, url, countryCode)); //if(predefQualityLevel != null) { //System.err.println("\tDefault for this domain: " + predefQualityLevel //+ ". Press Enter to accept >"); //} if(qualityLevel.equals(fieldValue)) { System.err.println("\t" + fieldValue + " entered last time. Press Enter to keep >"); } while(!done && ((qualityLevel = systemIn.readLine()) != null)) { //logger.debug("@@ Got: |" + qualityLevel + "|"); // If the user hit enter, it means they accepted the previous value entered if(qualityLevel.equals("")) { // User just hit enter without other chars qualityLevel = fieldValue; } else { // force valid values - will return "" if invalid value qualityLevel = getFullQualityLevelNameUppercased(qualityLevel); } // only if qualityLevel entered was invalid, would it now // have been changed to "" if(!qualityLevel.equals("")) { done = true; } else { System.out.println("@@ UNRECOGNISED. " + String.format(USER_PROMPT, recordCount, url, countryCode)); } } // Save the CSV record - even if quality level is null // Because we don't want to lose the line that used to exist in the file csvWriter.printRecord(url, countryCode, isReallyInMRI, qualityLevel); csvWriter.flush(); if(qualityLevel == null) { // if sys.in readLine() was terminated with Ctrl-D terminate = true; System.out.println("--- Got Ctrl-D (Lin)/Ctrl-Z (Win). Terminating. ---"); } else { System.out.println("User entered: " + qualityLevel); } } } if(terminate = true) { System.out.println("User entered Ctrl-D (Lin)/Ctrl-Z (Win) - terminating."); } } catch(Exception e) { e.printStackTrace(); logger.error("Exception occurred when processing CSV file or writing out file:\n" + Utility.getFilePath(tmpOutFile)); logger.error(e.getMessage(), e); } } /** * Add 2 new columns to the csv file: num pages in site that are inMRI and total num pages in site. */ public void insertTotalsIntoCSVRecords() { boolean terminate = false; CSVParser parser = null; try { parser = CSVParser.parse(webPageURLsCSVFile, java.nio.charset.Charset.forName("US-ASCII"), CSVFormat.RFC4180); } catch(Exception e) { logger.error("Failed to parse input CSV file " + Utility.getFilePath(webPageURLsCSVFile), e); return; } try ( CSVPrinter csvWriter = new CSVPrinter(new FileWriter(tmpOutFile), CSVFormat.DEFAULT.withQuoteMode(QuoteMode.MINIMAL)); ) { int recordCount = 0; for (CSVRecord csvRecord : parser) { String url = csvRecord.get(URL_COLUMN); if(url.equals("")) { // skip empty lines continue; } recordCount++; String basicURL = Utility.stripProtocolAndWWWFromURL(Utility.getDomainForURL(url, false)); String countryCode = csvRecord.get(COUNTRY_CODE_COLUMN); String isReallyInMRI = ""; String qualityLevel = ""; if(csvRecord.isSet(IS_REALLY_IN_MRI_COLUMN)) { isReallyInMRI = csvRecord.get(IS_REALLY_IN_MRI_COLUMN); } if(csvRecord.isSet(QUALITY_LEVEL_COLUMN)) { qualityLevel = csvRecord.get(QUALITY_LEVEL_COLUMN); } //COUNT_OF_PAGES_IN_MRI_COLUMN; TOTAL_PAGES_IN_SITE_COLUMN; long countNumPagesInMRI = mongodbQueryer.getFieldTotalForDomainSuffix( basicURL, MongoDBQueryer.FIELD_NUM_PAGES_IN_MRI); long countTotalPages = mongodbQueryer.getFieldTotalForDomainSuffix( basicURL, MongoDBQueryer.FIELD_TOTAL_PAGES); logger.info("Got record " + recordCount + ": " + url + " - " + countryCode + " - " + isReallyInMRI + " - " + qualityLevel + " - " + countNumPagesInMRI + " - " + countTotalPages); // Save the CSV record into the tmp file with the 2 counts columns csvWriter.printRecord(url, countryCode, isReallyInMRI, qualityLevel, countNumPagesInMRI, countTotalPages); } } catch(Exception e) { e.printStackTrace(); logger.error("Exception occurred when processing CSV file or writing out file:\n" + Utility.getFilePath(tmpOutFile)); logger.error(e.getMessage(), e); } } public static void printUsage() { System.err.println("Usage: ManualURLInspection webPageURLs.txt"); } /** * If no args are passed in, generates complete containsMRI file listings for NZ and overseas web SITES (domains), * with overseas web sites that have mi (mi.* or *\/mi) in the URL path listed separately. * You can then manually inspect the domains in this listing to shortlist which of these sites are not automatically * translated and really contain at least one webpage containing at least one sentence in MRI. * If a file is passed in containing a list of domains, then this first generates a full listing of all webpages * matching isMRI for each site in the domain list. It then generates a smaller set of random webpages matching * isMRI for the pooled sites in the domain list where the sample size of URLs produced is sufficient for giving * 90% confidence with 5% margin of error for testing binary outcomes, see * https://stats.stackexchange.com/questions/207584/sample-size-choice-with-binary-outcome */ public static void main(String args[]) { SafeProcess.DEBUG = 1; if(args.length != 1) { printUsage(); System.exit(-1); } try ( MongoDBQueryer mongodb = new MongoDBQueryer(); ) { mongodb.connectToDB(); // output files will be stored in mongodb-data-auto File outFolder = new File("../mongodb-data-auto/").getAbsoluteFile(); logger.info("*************************************"); final File inputFile = new File(args[0]); if(!inputFile.exists()) { logger.info("File " + inputFile + " does not exist"); System.exit(-1); } final ManualURLInspection inspector = new ManualURLInspection(mongodb, inputFile); Runtime.getRuntime().addShutdownHook(new Thread(new Runnable() { public void run() { logger.info("@@@@@@@@@@@@@@@@@@@@@@@@"); logger.info("WARNING: If Ctrl-C was pressed, then"); logger.info("\tan INCOMPLETE temp CSV file would have been generated at: " + inspector.getCSVOutputFilename()); logger.info(String.format("\tSo copy remaining records from input file %s into this file.", Utility.getFilePath(inputFile))); logger.info("@@@@@@@@@@@@@@@@@@@@@@@@"); } })); //String filename = inspector.processCSV(); //String filename = inspector.processCSV_QualityLevelColumn(); //inspector.reviewQualityLevelFieldFor("SINGLE_MRI_SENTENCE"); inspector.insertTotalsIntoCSVRecords(); //logger.info("Generated temp CSV file: " + filename); logger.info("*************************************"); } catch(Exception e) { logger.error(e.getMessage(), e); } } }