Changeset 32745

Show
Ignore:
Timestamp:
05.02.2019 23:03:16 (13 days ago)
Author:
ak19
Message:

More Western Wilson stuff. 1. Major changes to fix the handling of UTF-8 text in the DB so that uniqueness actually works: finding (selecting) exact matches now works, and unique-constraint violations on insert are no longer triggered from code. Words are now lowercased before being inserted, since only macrons matter and case doesn't. 2. The SQL DB's MarkedWords table needs to specify the uniqueness of its UTF-8 marked_word column differently (using a binary utf8mb4 collation) for the UTF-8 comparison to work.

Location:
other-projects/the-macronizer/trunk/src
Files:
4 modified

Legend:

Unmodified
Added
Removed
  • other-projects/the-macronizer/trunk/src/java/util/MacroniserLogFileData.java

    r32742 r32745  
    3535        sb.append(time.format(DateTimeFormatter.ofPattern("HH:mm:ss"))); 
    3636        sb.append("] DirectInput.doPost() \n Input:"); 
    37         sb.append(inputText); 
     37        if(inputText != null) sb.append(inputText); 
    3838        sb.append("Output:"); 
    39         sb.append(outputText); 
     39        if(outputText != null) sb.append(outputText); 
    4040 
    4141        return sb.toString(); 
  • other-projects/the-macronizer/trunk/src/java/util/MacroniserLogFileProcessor.java

    r32742 r32745  
    1515 
    1616public class MacroniserLogFileProcessor { 
     17    static boolean debug = false; 
    1718    /** ARGUMENTS: 
    1819     *              /home/wjkw1/RESEARCH_2018-19/bash_test/loggingtest.log 
     
    2627 
    2728    public static void main(String[] args) { 
    28         //TODO: re enable this method and add text to command line interface 
    29         //checkArgs(args.length); 
     29        checkArgs(args.length); 
    3030        String filename = args[0]; 
    3131        //extracts using yesterdays date 
    3232        extractFromLogFile(filename); 
    3333 
    34         //TODO: REMOVE 
    35         int count = 0; 
    36         for (MacroniserLogFileData entry: extractedEntries 
    37              ) { 
    38             printMessage((++count)+": "+entry.toString()); 
    39         } 
     34 
     35        if(debug) { 
     36            int count = 0; 
     37            for (MacroniserLogFileData entry : extractedEntries) { 
     38                printMessage((++count) + ": " + entry.toString()); 
     39            } 
     40        } 
     41 
    4042        if(exportToDB()) { 
    41             // TODO: move the log file to processed folder 
    42  
     43            System.out.println("Success. Move the log file " + filename + " to processed folder"); 
     44            System.exit(0); 
    4345        } else { 
    44             // TODO: move the log file to reprocess folder 
     46            System.err.println("FAILED. Move the log file " + filename + " to reprocess folder"); 
     47            System.exit(-1); 
    4548        } 
    4649    } 
     
    5760 
    5861            //loop through all entries 
    59             for (MacroniserLogFileData entry : extractedEntries 
    60                     ) { 
     62            for (MacroniserLogFileData entry : extractedEntries) { 
    6163                //get the marked words from first entry 
    6264                ArrayList<String> markedWords = getMarkedWordsFromOutput(entry.getOutputText()); 
    63                 if (markedWords != null) { 
     65                // all these markedWords share the same date and time 
     66                LocalDate date = entry.getDate(); 
     67                LocalTime time = entry.getTime(); 
     68 
     69                if (markedWords == null) { // no words, enter NULL into db 
     70                    if(debug) { 
     71                        printMessage("word=NULL"); 
     72                        printMessage(date.toString()); 
     73                        printMessage(time.format(DateTimeFormatter.ofPattern("HH:mm:ss"))); 
     74                        printMessage(""); // newline 
     75                    } 
     76 
     77                    MySQLAccess.Tuple tuple = new MySQLAccess.Tuple(null,date,time); 
     78                    // b. add to DB 
     79                    sqlAccess.addNewEntry(tuple); 
     80                } else { // process all the words 
    6481                    for (String word : markedWords) { 
    65                         // create the tuple 
    66                         MySQLAccess.Tuple tuple = new MySQLAccess.Tuple(word,entry.getDate(),entry.getTime()); 
    67                         //TODO: insert all into database 
    68                         //insert all into database 
    69                         printMessage(word); 
    70                         printMessage(entry.getDate().toString()); 
    71                         printMessage(entry.getTime().format(DateTimeFormatter.ofPattern("HH:mm:ss"))); 
    72                         printMessage(""); 
    73  
     82                        if(debug) { 
     83                            printMessage(word); 
     84                            printMessage(date.toString()); 
     85                            printMessage(time.format(DateTimeFormatter.ofPattern("HH:mm:ss"))); 
     86                            printMessage(""); // newline 
     87                        } 
     88 
     89                        //insert all into database: 
     90                        // a. create the tuple: always entering into db as lowercase so we don't consider tō different from Tō 
     91                        // whether when inserting into DB or searching for the term 
     92                        MySQLAccess.Tuple tuple = new MySQLAccess.Tuple(word.toLowerCase(),date,time); 
     93                        // b. add to DB 
    7494                        sqlAccess.addNewEntry(tuple); 
    7595                    } 
     96 
    7697                } 
    7798            } 
    7899            success = true; 
    79100 
    80         }catch (Exception e) { 
    81             e.printStackTrace(); 
     101        } catch (Exception e) { 
     102            e.printStackTrace(); // goes to std.err, see https://stackoverflow.com/questions/12095378/difference-between-e-printstacktrace-and-system-out-printlne 
    82103            success = false; 
    83104 
     
    91112    //returns an array list of all marked words, null if there are none 
    92113    private static ArrayList<String> getMarkedWordsFromOutput(String outputText) { 
     114        if(outputText == null) { return null; } 
     115 
    93116        final Pattern TAG_REGEXP = Pattern.compile("<mark>(.+?)</mark>", Pattern.DOTALL); 
    94117        final Matcher matcher = TAG_REGEXP.matcher(outputText); 
     
    104127    //reads the log file and creates a list of data 
    105128    private static void extractFromLogFile(String filename) { 
    106         //TODO: change the date that is used 
    107 //        LocalDate yesterday = LocalDate.now().minusDays(1L); 
    108         LocalDate specifiedDate = LocalDate.of(2018,11,06); 
    109129 
    110130        BufferedReader br = null; 
     
    133153                        //perform operations if not null, else error 
    134154                        if(entry != null) { 
    135                             ////check if entry date is after specified date 
    136                             if(entry.getDate().isAfter(specifiedDate)){ 
    137                                 //stop processing 
    138                                 break; 
    139                             } else if (entry.getDate().isEqual(specifiedDate)){ 
    140                                 extractedEntries.add(entry); 
    141                             } 
     155                            extractedEntries.add(entry); 
    142156                            //remove old content and keep new tag 
    143157                            extractedSB.setLength(0); 
    144158                            extractedSB.append(line); 
    145159                        } else{ 
    146                             printErrorMsg("Parsing of entry in log file found an error, continuing on next lines..."); 
     160                            //printErrorMsg("Parsing of entry in log file found an error, continuing on next lines..."); 
    147161                            extractedSB.setLength(0); 
    148162                            extractedSB.append(line); 
     
    179193        //Get the input output portion of the string 
    180194        String input_output = ""; 
    181         if(extractedString.charAt(0)=='I'){ 
     195        if (extractedString.charAt(0) == 'I') { 
    182196 
    183197            //Get the date and time of entry 
     
    191205            input_output = extractedString.replaceAll(directInputRegexp, "").trim(); 
    192206 
    193             int INDEX_STARTOF_INPUT = 6, INDEX_ENDOF_INPUT = getEndofInputIndex(input_output), 
    194                     INDEX_STARTOF_OUTPUT = INDEX_ENDOF_INPUT + 8, INDEX_ENDOF_OUTPUT = input_output.length(); 
    195             entryInput = input_output.substring(INDEX_STARTOF_INPUT, INDEX_ENDOF_INPUT); 
    196             entryOutput = input_output.substring(INDEX_STARTOF_OUTPUT, INDEX_ENDOF_OUTPUT); 
    197  
     207            int INDEX_STARTOF_INPUT = 6; 
     208            int INDEX_ENDOF_INPUT = getEndofInputIndex(input_output); 
     209            int INDEX_STARTOF_OUTPUT = INDEX_ENDOF_INPUT + 8; 
     210            int INDEX_ENDOF_OUTPUT = input_output.length(); 
     211            if (INDEX_ENDOF_INPUT == -1) { 
     212                entryInput = null; 
     213                entryOutput = null; 
     214 
     215            } else { 
     216                entryInput = input_output.substring(INDEX_STARTOF_INPUT, INDEX_ENDOF_INPUT); 
     217                entryOutput = input_output.substring(INDEX_STARTOF_OUTPUT, INDEX_ENDOF_OUTPUT); 
     218            } 
    198219            entry = new MacroniserLogFileData(entryDate, entryTime, entryInput, entryOutput); 
     220 
    199221            return entry; 
    200222 
    201         } else if (extractedString.charAt(0)=='E') { 
     223        } else if (extractedString.charAt(0) == 'E') { 
    202224            input_output = extractedString.replaceAll(fileUploadRegexp, ""); 
    203225            return null; 
     
    225247            return indexes.get(middle_index); 
    226248        } else { 
    227             printErrorMsg("No output tag could be found, error in log file."); 
     249            printMessage("Warning: No output tag could be found. Probably NULL input."); 
    228250            return -1; 
    229251        } 
  • other-projects/the-macronizer/trunk/src/java/util/MySQLAccess.java

    r32743 r32745  
    5656 
    5757    private final String DB_NAME = "Macroniser"; 
    58     // TODO: from Properties file 
     58    // obtained from Properties file: 
    5959    private String USERNAME; //= "root"; by default 
    6060    private String PASSWORD; //= "pinky"; 
     
    9191 
    9292            } 
    93             System.err.println("*** Found user: |" + USERNAME + "|"); 
    94             System.err.println("*** Found pwd: " + PASSWORD); 
     93            //System.out.println("*** Found user: |" + USERNAME + "|"); 
     94            //System.out.println("*** Found pwd: " + PASSWORD); 
    9595 
    9696            // This will load the MySQL driver, each DB has its own driver Class.forName("com.mysql.jdbc.Driver"); 
     
    9999            statement = connect.createStatement(); 
    100100 
    101             // TODO: can we use preparedStatement here instead of statement? 
    102             int result = statement.executeUpdate("set names utf8mb4"); 
    103             System.err.println("Was set utf8 a success? " + result); // should return 0 for SQL stmts that return nothing 
     101            int result = statement.executeUpdate("set names utf8mb4"); // should return 0 for SQL stmts that return nothing 
     102 
    104103            success = true; 
    105104        } catch (SQLException e) { 
     
    192191            insertOccurrence(word_id, time_id, date_id); 
    193192 
    194             System.err.println("The ID's are:(word, date, time) (" + word_id + "," + date_id + "," + time_id + ")"); 
     193            if(MacroniserLogFileProcessor.debug) { 
     194                System.out.println("The IDs are:(word, date, time) (" + word_id + "," + date_id + "," + time_id + ")"); 
     195            } 
    195196 
    196197            connect.commit(); 
     
    234235    //gets the specified marked word's id 
    235236    private int getMarkedWordID(String marked_word) throws SQLException { 
     237        // in cases where the user didn't enter any input str, marked_word will be null. Still of interest, store to db as string "NULL" 
     238        if(marked_word == null) { 
     239            marked_word = "NULL"; 
     240        } 
     241 
    236242        // Warning: "select * from table WHERE str_field LIKE ?" does not work when we want exact matches (or matches featuring macrons) 
    237243        // Use "WHERE BINARY str_filed = ?" instead 
     
    253259    private int insertMarkedWord(String word) throws SQLException { 
    254260 
     261        if(word == null) { 
     262            word = "NULL"; 
     263        } 
     264 
    255265        String query = "INSERT INTO MarkedWords (marked_word) VALUES (?)"; 
    256266        preparedStatement = connect.prepareStatement(query, Statement.RETURN_GENERATED_KEYS); 
    257267        preparedStatement.setString(1, word); 
     268 
    258269 
    259270        int word_id = -1; 
     
    293304        while (resultSet.next()) { 
    294305            date_id = resultSet.getInt("date_id"); 
    295             System.err.println(date_id + " - " + date.toString()); 
     306            if(MacroniserLogFileProcessor.debug) System.out.println(date_id + " - " + date.toString()); 
    296307        } 
    297308        return date_id; 
     
    382393 
    383394    public static void main(String[] args) { 
    384         System.err.println("Hello pinky!"); 
    385395        MySQLAccess mysql = new MySQLAccess(); 
    386396 
  • other-projects/the-macronizer/trunk/src/sql-scripts/MySQL_Code.sql

    r32742 r32745  
    66 
    77 
     8/* 
     9Unique constraint on marked_word field needs to be specified specially to deal with utf8 
     10e.g. to distinguish between āno and anō, rather than treating them both the same breaking uniqueness. 
     11https://stackoverflow.com/questions/25318479/unique-constraint-violation-with-utf-8-values 
     12*/ 
    813CREATE TABLE IF NOT EXISTS MarkedWords ( 
    914    word_id INT AUTO_INCREMENT, 
    10     marked_word varchar (255) UNIQUE, 
     15    marked_word varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin UNIQUE, 
    1116    PRIMARY KEY (word_id) 
    1217    );