Ignore:
Timestamp:
2020-02-18T21:58:42+13:00 (4 years ago)
Author:
ak19
Message:
  1. New function to handle user input for assigning values to the newly introduced 4th column of the random samples file. It is in many ways similar to the function handling the 3rd column, so some code refactoring may be useful. 2. Changed the function handling the 3rd column to leave any 4th column values untouched, by writing them back out exactly as they were found.
File:
1 edited

Legend:

Unmodified
Added
Removed
  • other-projects/maori-lang-detection/src/org/greenstone/atea/ManualURLInspection.java

    r33941 r33946  
    4545    public static final int COUNTRY_CODE_COLUMN = 1;
    4646    public static final int IS_REALLY_IN_MRI_COLUMN = 2;
     47    public static final int QUALITY_LEVEL_COLUMN = 3; // index passed to csvRecord.get(): the 4th csv column
     48
     49
     50    /** Possible values for the Quality Level column of the csv file */
     51    public static final String NAV = "NAV"; // largely navigation menus (possibly with pictures), even if in Māori
     52    public static final String LITTLE_TEXT = "LITTLE_TEXT"; // little text overall, whether Māori or mixed with English
     53    public static final String MIXED_TEXT = "MIXED_TEXT"; // significantly mixed (MRI+ENG/...) text, even if a decent amount
     54    public static final String SIGNIFICANTLY_MAORI = "SIGNIFICANTLY_MAORI"; // decent amounts of MRI text
     55    public static final String MAORI_PARAGRAPHS = "MAORI_PARAGRAPHS"; // largely continuous paragraphs in MRI
     56    public static final String WORDS = "WORDS"; // words or titles, not full sentences
     57    public static final String OTHER_LANGUAGES = "OTHER_LANGUAGES"; // text not in MRI but mostly in another language
     58    public static final String POEMS_OR_SONGS = "POEMS_OR_SONGS"; // content that is largely songs or poetry
     59    public static final String SINGLE_MRI_SENTENCE = "SINGLE_MRI_SENTENCE"; // NOTE(review): presumably a page with just one MRI sentence - confirm
    4760
    4861   
     
    119132        String countryCode = csvRecord.get(COUNTRY_CODE_COLUMN);
    120133        String isReallyInMRI = "";
     134        String qualityLevel = null;
     135       
    121136        //String isReallyInMRI = csvRecord.get(IS_REALLY_IN_MRI_COLUMN);
    122137        //if(!isReallyInMRI.equals("")) {
     
    125140        }
    126141       
     142        if(csvRecord.isSet(QUALITY_LEVEL_COLUMN)) {
     143            qualityLevel = csvRecord.get(QUALITY_LEVEL_COLUMN);
     144        }
     145       
    127146        if(terminate || (!isReallyInMRI.equals("") && !isReallyInMRI.equals("?"))) {
    128147            // if(terminate) on Ctrl-D, don't stop processing csv records
    129148            // Instead, copy remaining records of input csv file into output csv file
    130149            isReallyInMRI = isReallyInMRI.toUpperCase();
    131             csvWriter.printRecord(url, countryCode, isReallyInMRI);
     150            if(qualityLevel == null) {
     151            csvWriter.printRecord(url, countryCode, isReallyInMRI);
     152            } else {
     153            csvWriter.printRecord(url, countryCode, isReallyInMRI, qualityLevel);
     154            }
    132155            csvWriter.flush();
    133156            logger.info("Got record " + recordCount + ": " + url + " - " + countryCode
    134                 + " - " + isReallyInMRI);
     157                + " - " + isReallyInMRI + " - " + qualityLevel);
    135158        }
    136159        else {
     
    141164            System.err.println(String.format("FULL-TEXT for record %d:\n%s\n", recordCount, fulltext));
    142165           
    143             //logger.info("Got record " + recordCount + ": " + url + " - " + countryCode);
     166            //logger.info("Got record " + recordCount + ": " + url + " - " + countryCode + " - " + qualityLevel);
    144167           
    145168            // Read Input until Ctrl-D: read System.In as bufferedReader
     
    188211            }
    189212           
    190             // save the record
    191             csvWriter.printRecord(url, countryCode, isReallyInMRI);
     213            // Save the CSV record - even if quality level is null
     214            // Because we don't want to lose the line that used to exist in the file
     215            if(qualityLevel == null) {
     216            csvWriter.printRecord(url, countryCode, isReallyInMRI);
     217            } else {
     218            csvWriter.printRecord(url, countryCode, isReallyInMRI, qualityLevel);
     219            }
    192220            csvWriter.flush();
    193221           
    194             if(isReallyInMRI == null) {
     222            if(isReallyInMRI == null) { // if sys.in readLine() was terminated with Ctrl-D
    195223            terminate = true;
    196224            System.out.println("User entered Ctrl-D (Lin)/Ctrl-Z (Win) - terminating.");
     
    214242    }
    215243   
    216    
     244    /**
     245     * Similar to processCSV() above, but for entering the page quality level of each web page
     246     * This goes into the QUALITY_LEVEL_COLUMN column of the csv file.
     247     * Web pages from some web sites commonly recurring in the csv input file tend to be largely
     248     * navigation menus, so preset to NAV. Others are known to be low quality for text resources
     249     * as they only have nav menus and pictures despite these being largely in Māori,
     250     * which can also go under NAV.
     251     * Other web sites have little text overall whether Māori or mixed with English, nav included,
     252     * (LITTLE_TEXT), or significantly mixed (MRI+ENG/...) text even if a decent amount of text
     253     * (MIXED_TEXT). Some sites may largely have standalone words for learning (WORDS).
     254     * Other than known websites that have regular content of one of the above types,
     255     * the user can enter these values for rarer websites whose web pages may pop up:
     256     * NAV, LITTLE_TEXT, MIXED_TEXT, WORDS, SIGNIFICANTLY_MAORI (for decent amounts of MRI text)
     257     * MAORI_PARAGRAPHS (for largely continuous paras in MRI even if there are paras in other
     258     * langs) and OTHER_LANGUAGES if text not in MRI but mostly in other language,
     259     * POEMS_OR_SONGS for content that's largely songs or poetry.
     260    */   
     261    public String processCSV_QualityLevelColumn() {
     262
     263    Map<String, String> predefinedDefaultsMap = new HashMap<String, String>();
     264    predefinedDefaultsMap.put("tetaurawhiri.govt.nz", NAV);
     265    predefinedDefaultsMap.put("tmoa.tki.org.nz", SIGNIFICANTLY_MAORI);
     266    predefinedDefaultsMap.put("paekupu.co.nz", MIXED_TEXT); // html is mixed, but display is more MRI
     267    predefinedDefaultsMap.put("m.biblepub.com", SIGNIFICANTLY_MAORI);
     268    predefinedDefaultsMap.put("biblehub.com", SIGNIFICANTLY_MAORI);
     269    predefinedDefaultsMap.put("pukoro.co.nz", WORDS);
     270    predefinedDefaultsMap.put("mi.wikipedia.org", MIXED_TEXT);
     271    predefinedDefaultsMap.put("mi.m.wikipedia.org", WORDS);
     272    predefinedDefaultsMap.put("tkkmmokopuna.school.nz", NAV);
     273    predefinedDefaultsMap.put("twtop.school.nz", NAV);
     274    predefinedDefaultsMap.put("animations.tewhanake.maori.nz", MAORI_PARAGRAPHS);
     275    predefinedDefaultsMap.put("csunplugged.org", SIGNIFICANTLY_MAORI);
     276    predefinedDefaultsMap.put("waiata.maori.nz", POEMS_OR_SONGS);
     277   
     278    final String USER_PROMPT = "Enter qualityLevel value of\n\t? | (N)AV | (L)ITTLE_TEXT | (M)IXED_TEXT | (S)IGNIFICANTLY_MAORI | MAORI_(P)ARAGRAPHS"
     279        + "\n\t | PO(E)MS_OR_SONGS | S(I)NGLE_MRI_SENTENCE | (W)ORDS | (O)THER_LANGUAGES\n\tfor (%d): %s - %s > ";
     280        //"Enter isMRI value of Y|N|? for (" + count + "): " + url + " - " + countryCode + " > ";
     281   
     282    boolean terminate = false;
     283    CSVParser parser = null;
     284   
     285    try {
     286        parser = CSVParser.parse(webPageURLsCSVFile, java.nio.charset.Charset.forName("US-ASCII"), CSVFormat.RFC4180);
     287    } catch(Exception e) {
     288        logger.error("Failed to parse input CSV file " + Utility.getFilePath(webPageURLsCSVFile), e);
     289        return "Failed";
     290    }
     291   
     292    try (
     293         CSVPrinter csvWriter = new CSVPrinter(new FileWriter(tmpOutFile), CSVFormat.DEFAULT.withQuoteMode(QuoteMode.MINIMAL));
     294         ) {
     295
     296        int recordCount = 0;
     297        for (CSVRecord csvRecord : parser) {       
     298        //if(terminate) condition handled further below
     299       
     300        //logger.debug("Got record: " + csvRecord.toString());
     301       
     302        String url = csvRecord.get(URL_COLUMN);
     303        if(url.equals("")) { // skip empty lines
     304            continue;
     305        }
     306       
     307        recordCount++;
     308        String countryCode = csvRecord.get(COUNTRY_CODE_COLUMN);
     309        String isReallyInMRI = "";
     310        String qualityLevel = "";
     311       
     312        if(csvRecord.isSet(IS_REALLY_IN_MRI_COLUMN)) {
     313            isReallyInMRI = csvRecord.get(IS_REALLY_IN_MRI_COLUMN);
     314        }
     315       
     316        if(csvRecord.isSet(QUALITY_LEVEL_COLUMN)) {
     317            qualityLevel = csvRecord.get(QUALITY_LEVEL_COLUMN);
     318
     319            /*
     320            qualityLevel = qualityLevel.toUpperCase();
     321           
     322            if(qualityLevel.equals("N")) {
     323            qualityLevel = NAV;
     324            } else if(qualityLevel.equals("L")) {
     325            qualityLevel = LITTLE_TEXT;
     326            } else if(qualityLevel.equals("M")) {
     327            qualityLevel = MIXED_TEXT;
     328            } else if(qualityLevel.equals("P")) {
     329            qualityLevel = MAORI_PARAGRAPHS;
     330            } else if(qualityLevel.equals("S")) {
     331            qualityLevel = SIGNIFICANTLY_MAORI;
     332            } else if(qualityLevel.equals("W")) {
     333            qualityLevel = WORDS;
     334            } else if(qualityLevel.equals("O")) {
     335            qualityLevel = OTHER_LANGUAGE;
     336            } else if(qualityLevel.equals("E")) {
     337            qualityLevel = POEMS_OR_SONGS;
     338            } else if(qualityLevel.equals("I")) {
     339            qualityLevel = SINGLE_MRI_SENTENCE;
     340            }
     341            // else remains at whatever was already in the file or
     342            // else "" if no qualityLevel column for this record present in the file yet
     343            */
     344
     345            // Force valid values or ""
     346            qualityLevel = getFullQualityLevelNameUppercased(qualityLevel);
     347        }       
     348       
     349        if(terminate || (!qualityLevel.equals("") && !qualityLevel.equals("?"))) {
     350            // if(terminate) on Ctrl-D, don't stop processing csv records
     351            // Instead, copy remaining records of input csv file into output csv file     
     352           
     353            csvWriter.printRecord(url, countryCode, isReallyInMRI, qualityLevel);
     354            csvWriter.flush();
     355            logger.info("Got record " + recordCount + ": " + url + " - " + countryCode
     356                + " - " + isReallyInMRI + " - " + qualityLevel);           
     357        }
     358        else {
     359           
     360            // First, display full text for web page record with matching url
     361            // so the user can look at it to decide whether it is indeed overall in MRI or not.
     362            String fulltext = mongodbQueryer.displayFullTextOfPage(url);
     363            System.err.println(String.format("\nFULL-TEXT for record %d:\n%s\n", recordCount, fulltext));
     364           
     365            //logger.info("Got record " + recordCount + ": " + url + " - " + countryCode + " - " + qualityLevel);
     366           
     367            // Read Input until Ctrl-D: read System.In as bufferedReader
     368            // https://stackoverflow.com/questions/5837823/read-input-until-controld
     369            // Ctrl-C is already taken care if, see
     370            // https://coderanch.com/t/279136/java/terminated-program-Control-close-open
     371            // "Whenever a process is terminated/killed(CTRL-C), the file descriptors are released. You really do not need to close the stream in such cases."
     372            // So I just need to flush the csv print writer after every record is written
     373            // and Ctrl-C won't lose any of the data thus far entered by the user.
     374           
     375            BufferedReader systemIn = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));           
     376           
     377            boolean done = false;           
     378           
     379            // Work out default if basic URLs present in defaults map
     380            // If it is, use its value as default for this URL
     381            String basicURL = Utility.stripProtocolAndWWWFromURL(Utility.getDomainForURL(url, false));
     382            String predefQualityLevel = predefinedDefaultsMap.get(basicURL);
     383           
     384            System.out.println(String.format(USER_PROMPT, recordCount, url, countryCode));
     385            if(predefQualityLevel != null) {
     386            System.err.println("\tDefault for this domain: " + predefQualityLevel
     387                       + ". Press Enter to accept >");
     388            }           
     389           
     390            boolean previouslyQuestionMark = false;
     391            String oldQualityLevel = qualityLevel;
     392           
     393            if(qualityLevel.equals("?")) {
     394            previouslyQuestionMark = true;
     395            System.err.println("\t? entered last time. Press Enter to keep >");
     396            }
     397            while(!done && ((qualityLevel = systemIn.readLine()) != null)) {           
     398            //logger.debug("@@ Got: |" + qualityLevel + "|");
     399
     400            // If the user hit enter, it means they accepted
     401            // - the previous value entered, if it was a ?
     402            // - or want the default for the URL if any displayed
     403            // - or want SIGNIFICANTLY_MAORI if no default displayed
     404            if(qualityLevel.equals("")) { // User just hit enter without other chars
     405                if(previouslyQuestionMark) {
     406                qualityLevel = "?";
     407                } else {
     408                qualityLevel = (predefQualityLevel == null) ? SIGNIFICANTLY_MAORI : predefQualityLevel;
     409                }
     410
     411                oldQualityLevel = qualityLevel;
     412            }
     413            else {
     414                // force valid values - will return "" if invalid value
     415                qualityLevel = getFullQualityLevelNameUppercased(qualityLevel);
     416            }
     417
     418            // only if qualityLevel entered was invalid, would it now
     419            // have been changed to ""
     420            if(!qualityLevel.equals("")) {
     421                oldQualityLevel = qualityLevel;
     422                done = true;
     423            } else {
     424                System.out.println("@@ UNRECOGNISED. "
     425                   + String.format(USER_PROMPT, recordCount, url, countryCode));
     426            }
     427            }           
     428           
     429            // Save the CSV record - even if quality level is null
     430            // Because we don't want to lose the line that used to exist in the file
     431            csvWriter.printRecord(url, countryCode, isReallyInMRI, qualityLevel);
     432            csvWriter.flush();
     433           
     434            if(qualityLevel == null) { // if sys.in readLine() was terminated with Ctrl-D
     435            terminate = true;
     436            System.out.println("--- Got Ctrl-D (Lin)/Ctrl-Z (Win). Terminating. ---");
     437            } else {
     438            System.out.println("User entered: " + oldQualityLevel);
     439           
     440            }           
     441        }
     442        }
     443
     444        if(terminate = true) {
     445        System.out.println("User entered Ctrl-D (Lin)/Ctrl-Z (Win) - terminating.");
     446        }
     447       
     448    } catch(Exception e) {
     449        e.printStackTrace();
     450        logger.error("Exception occurred when processing CSV file or writing out file:\n"
     451             + Utility.getFilePath(tmpOutFile));
     452        logger.error(e.getMessage(), e);
     453    }
     454   
     455   
     456    return Utility.getFilePath(tmpOutFile);
     457    }
     458
     459    public String getFullQualityLevelNameUppercased(String qualityLevel) {
     460   
     461    qualityLevel = qualityLevel.toUpperCase();
     462
     463    if(qualityLevel.equals("N")) {
     464        return NAV;
     465    } else if(qualityLevel.equals("L")) {
     466        return LITTLE_TEXT;
     467    } else if(qualityLevel.equals("M")) {
     468        return MIXED_TEXT;
     469    } else if(qualityLevel.equals("S")) {
     470        return SIGNIFICANTLY_MAORI;
     471    } else if(qualityLevel.equals("P")) {
     472        return MAORI_PARAGRAPHS;
     473    } else if(qualityLevel.equals("W")) {
     474        return WORDS;
     475    } else if(qualityLevel.equals("O")) {
     476        return OTHER_LANGUAGES;
     477    } else if(qualityLevel.equals("E")) {
     478        return POEMS_OR_SONGS;
     479    } else if(qualityLevel.equals("I")) {       
     480        return SINGLE_MRI_SENTENCE;
     481    } else if(qualityLevel.equals(NAV)
     482          || qualityLevel.equals(LITTLE_TEXT)
     483          || qualityLevel.equals(MIXED_TEXT)
     484          || qualityLevel.equals(SIGNIFICANTLY_MAORI)
     485          || qualityLevel.equals(MAORI_PARAGRAPHS)
     486          || qualityLevel.equals(WORDS)
     487          || qualityLevel.equals(OTHER_LANGUAGES)
     488          || qualityLevel.equals(POEMS_OR_SONGS)
     489          || qualityLevel.equals(SINGLE_MRI_SENTENCE)) {
     490        return qualityLevel;
     491    }
     492    return "";
     493    }
    217494   
    218495    public static void printUsage() {
     
    266543        public void run() {
    267544            logger.info("@@@@@@@@@@@@@@@@@@@@@@@@");
    268             logger.info("WARNING!!!");
    269             logger.info("Got Ctrl-C. INCOMPLETE generated temp CSV file: " +
     545            logger.info("WARNING: If Ctrl-C was pressed, then");
     546            logger.info("\tan INCOMPLETE temp CSV file would have been generated at: " +
    270547                inspector.getCSVOutputFilename());
    271             logger.info(String.format("Copy remaining records from input file %s into this file.",
     548            logger.info(String.format("\tSo copy remaining records from input file %s into this file.",
    272549                          Utility.getFilePath(inputFile)));
    273550            logger.info("@@@@@@@@@@@@@@@@@@@@@@@@");
    274551        }
    275552        }));
    276        
    277         String filename = inspector.processCSV();
     553
     554        //String filename = inspector.processCSV();
     555        String filename = inspector.processCSV_QualityLevelColumn();
     556       
    278557       
    279558        logger.info("Generated temp CSV file: " + filename);
Note: See TracChangeset for help on using the changeset viewer.