Changeset 33728

Show
Ignore:
Timestamp:
29.11.2019 21:46:20 (7 days ago)
Author:
ak19
Message:

Introducing a method, which I've tested separately, that decodes a string containing HTML hex entities back into Unicode characters. I don't know whether it's the optimal solution, because it's my own code; I didn't know how to search for existing Java solutions, which surely must be out there.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/gli/src/org/greenstone/gatherer/metadata/FilenameEncoding.java

    r33727 r33728  
    3434import org.greenstone.gatherer.DebugStream; 
    3535 
     36import java.util.regex.Matcher; 
     37import java.util.regex.Pattern; 
     38 
     39 
     40 
    3641/** Static access class that contains many of the methods used to work with filename encodings.  
    3742* Works closely with classes FileNode, CollectionTreeNode, MetadataXMLFile, MetadataXMLFileManager 
     
    291296 
    292297//************************* GETTING THE URL ENCODING OF FILENAMES ********************************* 
     298 
     299    /**  
     300     * Given a String containing hexentities, will convert back into the unicode version of the String. 
     301     * e.g. A string like "02 Tēnā Koutou\.mp3" will be returned as "02 Tena Koutou\.mp3" with macrons on e and a 
     302     * I've tested this in a separate file that imports java.util.regex.Matcher and java.util.regex.Pattern 
     303     * and contains a copy of Utility.debugUnicodeString(String) with the following main function: 
     304        public static void main(String args[]) { 
     305            String str = "02 Tēnā Koutou\\.mp3"; // or more basic case: String str = "mmmmānnnnēpppp\\.txt";         
     306            System.err.println("About to decode hex string: " + str); 
     307            String result = decodeStringContainingHexEntities(str); 
     308            System.err.println("Decoded hex string: " + result + " - debug unicode form: " + debugUnicodeString(result)); 
     309        } 
     310    */ 
     311    public static String decodeStringContainingHexEntities(String str) { 
     312        String result = ""; 
     313        boolean done = false;        
     314        Pattern hexPattern = Pattern.compile("(&#x[0-9a-zA-Z]{1,4}+;)"); 
     315        Matcher matcher = hexPattern.matcher(str); 
     316         
     317        int searchFromIndex = 0; 
     318        int endMatchIndex = -1; 
     319         
     320        while(matcher.find(searchFromIndex)) { 
     321            String hexPart = matcher.group(); 
     322            //System.err.println("Found hexpart match: " + hexPart); 
     323             
     324            int startMatchIndex = matcher.start(); 
     325            endMatchIndex = matcher.end(); 
     326            result += str.substring(searchFromIndex, startMatchIndex);           
     327             
     328            String hexNumberStr = hexPart.substring(3, hexPart.length()-1); // lose the "&#x" prefix and the ";" suffix to get just the hex number portion of the match 
     329            // https://stackoverflow.com/questions/16625865/java-unicode-to-hex-string 
     330            // https://stackoverflow.com/questions/11194513/convert-hex-string-to-int 
     331             
     332            //System.err.println("hexNumberStr so far: " + hexNumberStr);            
     333            int tmpDigit = Integer.parseInt(hexNumberStr); 
     334            //System.err.println("As digit: " + tmpDigit); 
     335            hexNumberStr = String.format("%04d", tmpDigit);  
     336            //System.err.println("2 hexNumberStr so far: " + hexNumberStr); 
     337            hexNumberStr = "0x" + hexNumberStr; // e.g "0xDDDD" 
     338            //int hexNumber = Integer.parseInt(hexNumberStr); 
     339            int hexNumber = Integer.decode(hexNumberStr); 
     340            String hexNumberAsChar = Character.toString((char) hexNumber); 
     341            result += hexNumberAsChar; 
     342              
     343            searchFromIndex = endMatchIndex; 
     344             
     345        } 
     346         
     347        if(endMatchIndex != -1) { 
     348            result += str.substring(endMatchIndex); 
     349            //System.err.println("suffix: " + str.substring(endMatchIndex)); 
     350        } 
     351         
     352        return result; 
     353    }    
     354 
     355    /** Attempting to produce the equivalent method fileToURLEncoding() above, but taking a String as input parameter */ 
     356     public static String fileNameToHex(String filename) { 
     357        /*String filename_url_encoded = ""; 
     358        try { 
     359            URI filename_uri = new URI(filename); 
     360            String filename_ascii = filename_uri.toASCIIString(); 
     361            String filename_raw_bytes = URLDecoder.decode(filename_ascii,"ISO-8859-1"); 
     362            filename_url_encoded = iso_8859_1_filename_to_url_encoded(filename_raw_bytes);  
     363            return filename_url_encoded; 
     364        } catch (Exception e) { 
     365            e.printStackTrace(); 
     366            // Give up trying to convert 
     367            filename_url_encoded = filename;  
     368        } 
     369        return filename_url_encoded; 
     370        */ 
     371         
     372        String hexFilename = ""; 
     373        for(int i = 0; i < filename.length(); i++) { 
     374            int charCode = filename.codePointAt(i); // unicode codepoint / ASCII code 
     375             
     376            // ASCII table: https://cdn.sparkfun.com/assets/home_page_posts/2/1/2/1/ascii_table_black.png 
     377            // If the unicode character code pt is less than the ASCII code for space and greater than for tilda, let's display the char in hex (x0000 format) 
     378            if((charCode >= 20 && charCode <= 126) || charCode == 9 || charCode == 10 || charCode == 13) { // space, tilda, TAB, LF, CR are printable, leave them in for XML element printing 
     379                hexFilename += filename.charAt(i); 
     380            } else { 
     381                hexFilename += "&#x" + String.format("%x", charCode).toUpperCase() + ";"; // looks like: "&#x[up-to-4-hexdigits-in-UPPERCASE];" 
     382            } 
     383        } 
     384           
     385        return hexFilename; 
     386     } 
     387 
    293388    // Dr Bainbridge's methods 
    294389    /* On Linux machines that are set to using an ISO-8859 (Latin) type encoding,  
     
    358453        return filename_url_encoded; 
    359454    } 
    360  
    361     /** Attempting to produce the equivalent method fileToURLEncoding() above, but taking a String as input parameter */ 
    362      public static String fileNameToHex(String filename) { 
    363         /*String filename_url_encoded = ""; 
    364         try { 
    365             URI filename_uri = new URI(filename); 
    366             String filename_ascii = filename_uri.toASCIIString(); 
    367             String filename_raw_bytes = URLDecoder.decode(filename_ascii,"ISO-8859-1"); 
    368             filename_url_encoded = iso_8859_1_filename_to_url_encoded(filename_raw_bytes);  
    369             return filename_url_encoded; 
    370         } catch (Exception e) { 
    371             e.printStackTrace(); 
    372             // Give up trying to convert 
    373             filename_url_encoded = filename;  
    374         } 
    375         return filename_url_encoded; 
    376         */ 
    377          
    378         String hexFilename = ""; 
    379         for(int i = 0; i < filename.length(); i++) { 
    380             int charCode = filename.codePointAt(i); // unicode codepoint / ASCII code 
    381              
    382             // ASCII table: https://cdn.sparkfun.com/assets/home_page_posts/2/1/2/1/ascii_table_black.png 
    383             // If the unicode character code pt is less than the ASCII code for space and greater than for tilda, let's display the char in hex (x0000 format) 
    384             if((charCode >= 20 && charCode <= 126) || charCode == 9 || charCode == 10 || charCode == 13) { // space, tilda, TAB, LF, CR are printable, leave them in for XML element printing 
    385                 hexFilename += filename.charAt(i); 
    386             } else { 
    387                 hexFilename += "&#x" + String.format("%x", charCode).toUpperCase() + ";"; // looks like: "&#x[up-to-4-hexdigits-in-UPPERCASE];" 
    388             } 
    389         } 
    390            
    391         return hexFilename; 
    392      } 
    393      
    394455     
    395456    // For unicode codepoints see: