Changeset 33728


Ignore:
Timestamp:
2019-11-29T21:46:20+13:00 (4 years ago)
Author:
ak19
Message:

Introducing a method, which I've tested separately, that decodes a string containing HTML hex entities back to Unicode characters. I don't know if it's the optimal solution, because it's my own code. I didn't know how to google for existing Java solutions, which surely must be out there.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/gli/src/org/greenstone/gatherer/metadata/FilenameEncoding.java

    r33727 r33728  
    3434import org.greenstone.gatherer.DebugStream;
    3535
     36import java.util.regex.Matcher;
     37import java.util.regex.Pattern;
     38
     39
     40
    3641/** Static access class that contains many of the methods used to work with filename encodings.
    3742* Works closely with classes FileNode, CollectionTreeNode, MetadataXMLFile, MetadataXMLFileManager
     
    291296
    292297//************************* GETTING THE URL ENCODING OF FILENAMES *********************************
     298
     299    /**
     300     * Given a String containing hexentities, will convert back into the unicode version of the String.
     301     * e.g. A string like "02 Tēnā Koutou\.mp3" will be returned as "02 Tena Koutou\.mp3" with macrons on e and a
     302     * I've tested this in a separate file that imports java.util.regex.Matcher and java.util.regex.Pattern
     303     * and contains a copy of Utility.debugUnicodeString(String) with the following main function:
     304        public static void main(String args[]) {
     305            String str = "02 Tēnā Koutou\\.mp3"; // or more basic case: String str = "mmmmānnnnēpppp\\.txt";       
     306            System.err.println("About to decode hex string: " + str);
     307            String result = decodeStringContainingHexEntities(str);
     308            System.err.println("Decoded hex string: " + result + " - debug unicode form: " + debugUnicodeString(result));
     309        }
     310    */
     311    public static String decodeStringContainingHexEntities(String str) {
     312        String result = "";
     313        boolean done = false;       
     314        Pattern hexPattern = Pattern.compile("(&#x[0-9a-zA-Z]{1,4}+;)");
     315        Matcher matcher = hexPattern.matcher(str);
     316       
     317        int searchFromIndex = 0;
     318        int endMatchIndex = -1;
     319       
     320        while(matcher.find(searchFromIndex)) {
     321            String hexPart = matcher.group();
     322            //System.err.println("Found hexpart match: " + hexPart);
     323           
     324            int startMatchIndex = matcher.start();
     325            endMatchIndex = matcher.end();
     326            result += str.substring(searchFromIndex, startMatchIndex);         
     327           
     328            String hexNumberStr = hexPart.substring(3, hexPart.length()-1); // lose the "&#x" prefix and the ";" suffix to get just the hex number portion of the match
     329            // https://stackoverflow.com/questions/16625865/java-unicode-to-hex-string
     330            // https://stackoverflow.com/questions/11194513/convert-hex-string-to-int
     331           
     332            //System.err.println("hexNumberStr so far: " + hexNumberStr);           
     333            int tmpDigit = Integer.parseInt(hexNumberStr);
     334            //System.err.println("As digit: " + tmpDigit);
     335            hexNumberStr = String.format("%04d", tmpDigit);
     336            //System.err.println("2 hexNumberStr so far: " + hexNumberStr);
     337            hexNumberStr = "0x" + hexNumberStr; // e.g "0xDDDD"
     338            //int hexNumber = Integer.parseInt(hexNumberStr);
     339            int hexNumber = Integer.decode(hexNumberStr);
     340            String hexNumberAsChar = Character.toString((char) hexNumber);
     341            result += hexNumberAsChar;
     342             
     343            searchFromIndex = endMatchIndex;
     344           
     345        }
     346       
     347        if(endMatchIndex != -1) {
     348            result += str.substring(endMatchIndex);
     349            //System.err.println("suffix: " + str.substring(endMatchIndex));
     350        }
     351       
     352        return result;
     353    }   
     354
     355    /** Attempting to produce the equivalent method fileToURLEncoding() above, but taking a String as input parameter */
     356     public static String fileNameToHex(String filename) {
     357        /*String filename_url_encoded = "";
     358        try {
     359            URI filename_uri = new URI(filename);
     360            String filename_ascii = filename_uri.toASCIIString();
     361            String filename_raw_bytes = URLDecoder.decode(filename_ascii,"ISO-8859-1");
     362            filename_url_encoded = iso_8859_1_filename_to_url_encoded(filename_raw_bytes);
     363            return filename_url_encoded;
     364        } catch (Exception e) {
     365            e.printStackTrace();
     366            // Give up trying to convert
     367            filename_url_encoded = filename;
     368        }
     369        return filename_url_encoded;
     370        */
     371       
     372        String hexFilename = "";
     373        for(int i = 0; i < filename.length(); i++) {
     374            int charCode = filename.codePointAt(i); // unicode codepoint / ASCII code
     375           
     376            // ASCII table: https://cdn.sparkfun.com/assets/home_page_posts/2/1/2/1/ascii_table_black.png
     377            // If the unicode character code pt is less than the ASCII code for space and greater than for tilda, let's display the char in hex (x0000 format)
     378            if((charCode >= 20 && charCode <= 126) || charCode == 9 || charCode == 10 || charCode == 13) { // space, tilda, TAB, LF, CR are printable, leave them in for XML element printing
     379                hexFilename += filename.charAt(i);
     380            } else {
     381                hexFilename += "&#x" + String.format("%x", charCode).toUpperCase() + ";"; // looks like: "&#x[up-to-4-hexdigits-in-UPPERCASE];"
     382            }
     383        }
     384         
     385        return hexFilename;
     386     }
     387
    293388    // Dr Bainbridge's methods
    294389    /* On Linux machines that are set to using an ISO-8859 (Latin) type encoding,
     
    358453        return filename_url_encoded;
    359454    }
    360 
    361     /** Attempting to produce the equivalent method fileToURLEncoding() above, but taking a String as input parameter */
    362      public static String fileNameToHex(String filename) {
    363         /*String filename_url_encoded = "";
    364         try {
    365             URI filename_uri = new URI(filename);
    366             String filename_ascii = filename_uri.toASCIIString();
    367             String filename_raw_bytes = URLDecoder.decode(filename_ascii,"ISO-8859-1");
    368             filename_url_encoded = iso_8859_1_filename_to_url_encoded(filename_raw_bytes);
    369             return filename_url_encoded;
    370         } catch (Exception e) {
    371             e.printStackTrace();
    372             // Give up trying to convert
    373             filename_url_encoded = filename;
    374         }
    375         return filename_url_encoded;
    376         */
    377        
    378         String hexFilename = "";
    379         for(int i = 0; i < filename.length(); i++) {
    380             int charCode = filename.codePointAt(i); // unicode codepoint / ASCII code
    381            
    382             // ASCII table: https://cdn.sparkfun.com/assets/home_page_posts/2/1/2/1/ascii_table_black.png
    383             // If the unicode character code pt is less than the ASCII code for space and greater than for tilda, let's display the char in hex (x0000 format)
    384             if((charCode >= 20 && charCode <= 126) || charCode == 9 || charCode == 10 || charCode == 13) { // space, tilda, TAB, LF, CR are printable, leave them in for XML element printing
    385                 hexFilename += filename.charAt(i);
    386             } else {
    387                 hexFilename += "&#x" + String.format("%x", charCode).toUpperCase() + ";"; // looks like: "&#x[up-to-4-hexdigits-in-UPPERCASE];"
    388             }
    389         }
    390          
    391         return hexFilename;
    392      }
    393    
    394455   
    395456    // For unicode codepoints see:
Note: See TracChangeset for help on using the changeset viewer.