Changeset 32745


Ignore:
Timestamp:
2019-02-05T23:03:16+13:00 (5 years ago)
Author:
ak19
Message:

More Western Wilson stuff. 1. Major changes to fix the handling of UTF-8 text in the db so that uniqueness actually works: finding (selecting) exact matches now works, and unique-constraint violations on insert are no longer triggered from code. Words are now lowercased before insertion, since only macrons matter and case doesn't. 2. The SQL db's MarkedWords table needs to specify the uniqueness of its UTF-8 marked_word column differently (using a binary utf8mb4 collation) so that words differing only in their macrons are treated as distinct.

Location:
other-projects/the-macronizer/trunk/src
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • other-projects/the-macronizer/trunk/src/java/util/MacroniserLogFileData.java

    r32742 r32745  
    3535        sb.append(time.format(DateTimeFormatter.ofPattern("HH:mm:ss")));
    3636        sb.append("] DirectInput.doPost() \n Input:");
    37         sb.append(inputText);
     37        if(inputText != null) sb.append(inputText);
    3838        sb.append("Output:");
    39         sb.append(outputText);
     39        if(outputText != null) sb.append(outputText);
    4040
    4141        return sb.toString();
  • other-projects/the-macronizer/trunk/src/java/util/MacroniserLogFileProcessor.java

    r32742 r32745  
    1515
    1616public class MacroniserLogFileProcessor {
     17    static boolean debug = false;
    1718    /** ARGUMENTS:
    1819     *              /home/wjkw1/RESEARCH_2018-19/bash_test/loggingtest.log
     
    2627
    2728    public static void main(String[] args) {
    28         //TODO: re enable this method and add text to command line interface
    29         //checkArgs(args.length);
     29        checkArgs(args.length);
    3030        String filename = args[0];
    3131        //extracts using yesterdays date
    3232        extractFromLogFile(filename);
    3333
    34         //TODO: REMOVE
    35         int count = 0;
    36         for (MacroniserLogFileData entry: extractedEntries
    37              ) {
    38             printMessage((++count)+": "+entry.toString());
    39         }
     34
     35        if(debug) {
     36            int count = 0;
     37            for (MacroniserLogFileData entry : extractedEntries) {
     38                printMessage((++count) + ": " + entry.toString());
     39            }
     40        }
     41
    4042        if(exportToDB()) {
    41             // TODO: move the log file to processed folder
    42 
     43            System.out.println("Success. Move the log file " + filename + " to processed folder");
     44            System.exit(0);
    4345        } else {
    44             // TODO: move the log file to reprocess folder
     46            System.err.println("FAILED. Move the log file " + filename + " to reprocess folder");
     47            System.exit(-1);
    4548        }
    4649    }
     
    5760
    5861            //loop through all entries
    59             for (MacroniserLogFileData entry : extractedEntries
    60                     ) {
     62            for (MacroniserLogFileData entry : extractedEntries) {
    6163                //get the marked words from first entry
    6264                ArrayList<String> markedWords = getMarkedWordsFromOutput(entry.getOutputText());
    63                 if (markedWords != null) {
     65                // all these markedWords share the same date and time
     66                LocalDate date = entry.getDate();
     67                LocalTime time = entry.getTime();
     68
     69                if (markedWords == null) { // no words, enter NULL into db
     70                    if(debug) {
     71                        printMessage("word=NULL");
     72                        printMessage(date.toString());
     73                        printMessage(time.format(DateTimeFormatter.ofPattern("HH:mm:ss")));
     74                        printMessage(""); // newline
     75                    }
     76
     77                    MySQLAccess.Tuple tuple = new MySQLAccess.Tuple(null,date,time);
     78                    // b. add to DB
     79                    sqlAccess.addNewEntry(tuple);
     80                } else { // process all the words
    6481                    for (String word : markedWords) {
    65                         // create the tuple
    66                         MySQLAccess.Tuple tuple = new MySQLAccess.Tuple(word,entry.getDate(),entry.getTime());
    67                         //TODO: insert all into database
    68                         //insert all into database
    69                         printMessage(word);
    70                         printMessage(entry.getDate().toString());
    71                         printMessage(entry.getTime().format(DateTimeFormatter.ofPattern("HH:mm:ss")));
    72                         printMessage("");
    73 
     82                        if(debug) {
     83                            printMessage(word);
     84                            printMessage(date.toString());
     85                            printMessage(time.format(DateTimeFormatter.ofPattern("HH:mm:ss")));
     86                            printMessage(""); // newline
     87                        }
     88
     89                        //insert all into database:
     90                        // a. create the tuple: always entering into db as lowercase so we don't consider tō different from Tō
     91                        // whether when inserting into DB or searching for the term
     92                        MySQLAccess.Tuple tuple = new MySQLAccess.Tuple(word.toLowerCase(),date,time);
     93                        // b. add to DB
    7494                        sqlAccess.addNewEntry(tuple);
    7595                    }
     96
    7697                }
    7798            }
    7899            success = true;
    79100
    80         }catch (Exception e) {
    81             e.printStackTrace();
     101        } catch (Exception e) {
     102            e.printStackTrace(); // goes to std.err, see https://stackoverflow.com/questions/12095378/difference-between-e-printstacktrace-and-system-out-printlne
    82103            success = false;
    83104
     
    91112    //returns an array list of all marked words, null if there are none
    92113    private static ArrayList<String> getMarkedWordsFromOutput(String outputText) {
     114        if(outputText == null) { return null; }
     115
    93116        final Pattern TAG_REGEXP = Pattern.compile("<mark>(.+?)</mark>", Pattern.DOTALL);
    94117        final Matcher matcher = TAG_REGEXP.matcher(outputText);
     
    104127    //reads the log file and creates a list of data
    105128    private static void extractFromLogFile(String filename) {
    106         //TODO: change the date that is used
    107 //        LocalDate yesterday = LocalDate.now().minusDays(1L);
    108         LocalDate specifiedDate = LocalDate.of(2018,11,06);
    109129
    110130        BufferedReader br = null;
     
    133153                        //perform operations if not null, else error
    134154                        if(entry != null) {
    135                             ////check if entry date is after specified date
    136                             if(entry.getDate().isAfter(specifiedDate)){
    137                                 //stop processing
    138                                 break;
    139                             } else if (entry.getDate().isEqual(specifiedDate)){
    140                                 extractedEntries.add(entry);
    141                             }
     155                            extractedEntries.add(entry);
    142156                            //remove old content and keep new tag
    143157                            extractedSB.setLength(0);
    144158                            extractedSB.append(line);
    145159                        } else{
    146                             printErrorMsg("Parsing of entry in log file found an error, continuing on next lines...");
     160                            //printErrorMsg("Parsing of entry in log file found an error, continuing on next lines...");
    147161                            extractedSB.setLength(0);
    148162                            extractedSB.append(line);
     
    179193        //Get the input output portion of the string
    180194        String input_output = "";
    181         if(extractedString.charAt(0)=='I'){
     195        if (extractedString.charAt(0) == 'I') {
    182196
    183197            //Get the date and time of entry
     
    191205            input_output = extractedString.replaceAll(directInputRegexp, "").trim();
    192206
    193             int INDEX_STARTOF_INPUT = 6, INDEX_ENDOF_INPUT = getEndofInputIndex(input_output),
    194                     INDEX_STARTOF_OUTPUT = INDEX_ENDOF_INPUT + 8, INDEX_ENDOF_OUTPUT = input_output.length();
    195             entryInput = input_output.substring(INDEX_STARTOF_INPUT, INDEX_ENDOF_INPUT);
    196             entryOutput = input_output.substring(INDEX_STARTOF_OUTPUT, INDEX_ENDOF_OUTPUT);
    197 
     207            int INDEX_STARTOF_INPUT = 6;
     208            int INDEX_ENDOF_INPUT = getEndofInputIndex(input_output);
     209            int INDEX_STARTOF_OUTPUT = INDEX_ENDOF_INPUT + 8;
     210            int INDEX_ENDOF_OUTPUT = input_output.length();
     211            if (INDEX_ENDOF_INPUT == -1) {
     212                entryInput = null;
     213                entryOutput = null;
     214
     215            } else {
     216                entryInput = input_output.substring(INDEX_STARTOF_INPUT, INDEX_ENDOF_INPUT);
     217                entryOutput = input_output.substring(INDEX_STARTOF_OUTPUT, INDEX_ENDOF_OUTPUT);
     218            }
    198219            entry = new MacroniserLogFileData(entryDate, entryTime, entryInput, entryOutput);
     220
    199221            return entry;
    200222
    201         } else if (extractedString.charAt(0)=='E') {
     223        } else if (extractedString.charAt(0) == 'E') {
    202224            input_output = extractedString.replaceAll(fileUploadRegexp, "");
    203225            return null;
     
    225247            return indexes.get(middle_index);
    226248        } else {
    227             printErrorMsg("No output tag could be found, error in log file.");
     249            printMessage("Warning: No output tag could be found. Probably NULL input.");
    228250            return -1;
    229251        }
  • other-projects/the-macronizer/trunk/src/java/util/MySQLAccess.java

    r32743 r32745  
    5656
    5757    private final String DB_NAME = "Macroniser";
    58     // TODO: from Properties file
     58    // obtained from Properties file:
    5959    private String USERNAME; //= "root"; by default
    6060    private String PASSWORD; //= "pinky";
     
    9191
    9292            }
    93             System.err.println("*** Found user: |" + USERNAME + "|");
    94             System.err.println("*** Found pwd: " + PASSWORD);
     93            //System.out.println("*** Found user: |" + USERNAME + "|");
     94            //System.out.println("*** Found pwd: " + PASSWORD);
    9595
    9696            // This will load the MySQL driver, each DB has its own driver Class.forName("com.mysql.jdbc.Driver");
     
    9999            statement = connect.createStatement();
    100100
    101             // TODO: can we use preparedStatement here instead of statement?
    102             int result = statement.executeUpdate("set names utf8mb4");
    103             System.err.println("Was set utf8 a success? " + result); // should return 0 for SQL stmts that return nothing
     101            int result = statement.executeUpdate("set names utf8mb4"); // should return 0 for SQL stmts that return nothing
     102
    104103            success = true;
    105104        } catch (SQLException e) {
     
    192191            insertOccurrence(word_id, time_id, date_id);
    193192
    194             System.err.println("The ID's are:(word, date, time) (" + word_id + "," + date_id + "," + time_id + ")");
     193            if(MacroniserLogFileProcessor.debug) {
     194                System.out.println("The IDs are:(word, date, time) (" + word_id + "," + date_id + "," + time_id + ")");
     195            }
    195196
    196197            connect.commit();
     
    234235    //gets the specified marked word's id
    235236    private int getMarkedWordID(String marked_word) throws SQLException {
     237        // in cases where the user didn't enter any input str, marked_word will be null. Still of interest, store to db as string "NULL"
     238        if(marked_word == null) {
     239            marked_word = "NULL";
     240        }
     241
    236242        // Warning: "select * from table WHERE str_field LIKE ?" does not work when we want exact matches (or matches featuring macrons)
    237243        // Use "WHERE BINARY str_field = ?" instead
     
    253259    private int insertMarkedWord(String word) throws SQLException {
    254260
     261        if(word == null) {
     262            word = "NULL";
     263        }
     264
    255265        String query = "INSERT INTO MarkedWords (marked_word) VALUES (?)";
    256266        preparedStatement = connect.prepareStatement(query, Statement.RETURN_GENERATED_KEYS);
    257267        preparedStatement.setString(1, word);
     268
    258269
    259270        int word_id = -1;
     
    293304        while (resultSet.next()) {
    294305            date_id = resultSet.getInt("date_id");
    295             System.err.println(date_id + " - " + date.toString());
     306            if(MacroniserLogFileProcessor.debug) System.out.println(date_id + " - " + date.toString());
    296307        }
    297308        return date_id;
     
    382393
    383394    public static void main(String[] args) {
    384         System.err.println("Hello pinky!");
    385395        MySQLAccess mysql = new MySQLAccess();
    386396
  • other-projects/the-macronizer/trunk/src/sql-scripts/MySQL_Code.sql

    r32742 r32745  
    66
    77
     8/*
     9Unique constraint on marked_word field needs to be specified specially to deal with utf8
     10e.g. to distinguish between āno and anō, rather than treating them both the same breaking uniqueness.
     11https://stackoverflow.com/questions/25318479/unique-constraint-violation-with-utf-8-values
     12*/
    813CREATE TABLE IF NOT EXISTS MarkedWords (
    914    word_id INT AUTO_INCREMENT,
    10     marked_word varchar (255) UNIQUE,
     15    marked_word varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin UNIQUE,
    1116    PRIMARY KEY (word_id)
    1217    );
Note: See TracChangeset for help on using the changeset viewer.