Changeset 33746 for main/trunk


Ignore:
Timestamp:
2019-12-03T17:31:17+13:00 (4 years ago)
Author:
ak19
Message:
  1. Bugfix for dealing with + in filenames: file-level metadata now sticks and also ends up in doc.xml on build, as should happen. 2. Better (more optimal) bugfix for & in filenames, to get metadata to still stick after yesterday's first bugfix for this. Sadly, the improved code no longer needs the new function I introduced yesterday (escapeAllCharWithHexEntity). Leaving the function in, in case it ever comes in handy or as an idea. 3. Refactoring some code. 4. Removed some debugging statements. But some things are still largely commented out. Will remove hereafter.
Location:
main/trunk/gli/src/org/greenstone/gatherer/metadata
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/gli/src/org/greenstone/gatherer/metadata/FilenameEncoding.java

    r33745 r33746  
    7373     * a file, it means this still needs to be retrieved. */
    7474    public static Map map = new HashMap();
     75   
     76    /** Compiled pattern for hex entities of characters. These are of the form "&#x....;" with 1 to 4 digits */
     77    public static final Pattern HEX_PATTERN = Pattern.compile("(&#x[0-9a-zA-Z]{1,4}+;)");
     78   
    7579
    7680//*********************** BUSY REFRESHING / REQUIRING  REFRESH *********************
     
    310314    */
    311315    public static String decodeStringContainingHexEntities(String str) {
    312         String result = "";
    313         Pattern hexPattern = Pattern.compile("(&#x[0-9a-zA-Z]{1,4}+;)");
    314         Matcher matcher = hexPattern.matcher(str);
     316        String result = "";     
     317        Matcher matcher = HEX_PATTERN.matcher(str);
    315318       
    316319        int searchFromIndex = 0;
     
    327330            String hexNumberStr = hexPart.substring(3, hexPart.length()-1); // lose the "&#x" prefix and the ";" suffix to get just the hex number portion of the match
    328331            // https://stackoverflow.com/questions/16625865/java-unicode-to-hex-string
    329             // https://stackoverflow.com/questions/11194513/convert-hex-string-to-int
    330            
    331             //System.err.println("hexNumberStr so far: " + hexNumberStr);           
     332            // https://stackoverflow.com/questions/11194513/convert-hex-string-to-int           
     333           
     334            //System.err.println("hexNumberStr so far: " + hexNumberStr);
    332335            hexNumberStr = "0x" + hexNumberStr; // e.g "0xDDDD"
    333336            //int hexNumber = Integer.parseInt(hexNumberStr);
     
    388391    */
    389392    public static String escapeAllCharWithHexEntity(String str, char CHARACTER/*, String hexCodeString*/) {
     393       
     394        if(str.indexOf(CHARACTER) == -1) { // nothing to replace, we're done
     395            return str;         
     396        }
     397       
    390398        String char_as_string = Character.toString(CHARACTER);
    391399        String hexCodeString = hexEntityForChar(char_as_string);
     
    393401        //System.err.println("@@@ hexCodeString for: " + char_as_string + " is: " + hexCodeString);
    394402       
    395         Pattern hexPattern = Pattern.compile("(&#x[0-9a-zA-Z]{1,4}+;)");
    396         Matcher hexPatternMatch = hexPattern.matcher(str);     
    397        
    398         // want to replace all & with &#x26; (the hex for ampersand) IFF the & is not already a hexcode/doesn't already match hexPattern
     403        Matcher hexPatternMatch = HEX_PATTERN.matcher(str); // looks for a hex entity, which has the pattern "&#x....;"
     404       
     405        // want to replace all & with &#x26; (the hex for ampersand) IFF the & is not already a hexcode/doesn't already match HEX_PATTERN
    399406        int searchIndex = 0;
    400407       
     
    423430                   
    424431                    // String has been modified, so have to update Matcher
    425                     hexPatternMatch = hexPattern.matcher(str);
     432                    hexPatternMatch = HEX_PATTERN.matcher(str);
    426433                   
    427434                    if(searchIndex >= str.length()) {
     
    493500            String filename_ascii = filename_uri.toASCIIString();
    494501           
     502            // The URI.toASCIIString() call above only encodes values > 127.
     503            // But we also need to protect + and & signs in filenames
     504            filename_ascii = filename_ascii.replace("+", "%252B"); // +'s ASCII code is 43 decimal, 2b in hex, 2B when uppercased
     505            filename_ascii = filename_ascii.replace("&", "%2526"); // &'s ASCII code is 38 in decimal, and 26 in hex
     506           
    495507            // Before proceeding, protect & in the filename too.
    496508            // &'s ASCII code is 38 in decimal, and 26 in hex, so replace with &#x26;
     
    499511            //filename_url_encoded = filename_url_encoded.replace("&", "&x26;");// SO THIS IS BAD
    500512            //filename_url_encoded = filename_url_encoded.replace("&", hexEntityForChar("&"));// SAME, STILL BAD
    501             filename_ascii = escapeAllCharWithHexEntity(filename_ascii, '&'); // Good: CAREFULLY replaces & that are not part of hex entities
    502            
     513            ///filename_ascii = escapeAllCharWithHexEntity(filename_ascii, '&'); // Good: CAREFULLY replaces & that are not part of hex entities           
    503514           
    504515            String filename_raw_bytes = URLDecoder.decode(filename_ascii,"ISO-8859-1");         
    505516            filename_url_encoded = iso_8859_1_filename_to_url_encoded(filename_raw_bytes);
    506517           
     518            //filename_url_encoded = filename_url_encoded.replace("%2B", "+"); // Don't do this, won't get regex escaped when converted back to a + by caller
     519            filename_url_encoded = filename_url_encoded.replace("%2B", "+"); // + signs are special, as they will need to be escaped since the caller wants the filename representing a regex
     520            filename_url_encoded = filename_url_encoded.replace("%26", "&");           
    507521        }
    508522        catch (Exception e) {
     
    605619       
    606620        File file = new File (filename);
    607         //return fileToURLEncoding(file);
    608621        String filename_url_encoded = fileToURLEncoding(file);
    609622       
     
    630643        String curr_directory_path = FilenameEncoding.fullFilepathToURLEncoding(".");
    631644        return filenameToURLEncodingWithPrefixRemoved(filename, curr_directory_path);
    632        
    633         /*
    634         String curr_directory_path = FilenameEncoding.fullFilepathToURLEncoding(".");       
    635         if (curr_directory_path.endsWith("."+FilenameEncoding.URL_FILE_SEPARATOR)) {
    636             curr_directory_path = curr_directory_path.substring(0, curr_directory_path.length()-2); // cut off /. at end           
    637         }
    638        
    639         File file = new File (filename);       
    640         String filename_url_encoded = fileToURLEncoding(file); // returns a full filepath
    641        
    642         // now lop off the current dir prefix that FilenameEncoding.filenameToURLEncoding(STRING) variant would have added
    643         filename_url_encoded = filename_url_encoded.substring(curr_directory_path.length());
    644         // remove any remaining slash prefix
    645         if (filename_url_encoded.startsWith(FilenameEncoding.URL_FILE_SEPARATOR)) {
    646             filename_url_encoded = filename_url_encoded.substring(FilenameEncoding.URL_FILE_SEPARATOR.length());
    647         }
    648        
    649         return filename_url_encoded;
    650         */
    651645    }
    652646   
     
    660654        }
    661655       
    662         /*if (removeFilePathPrefix.endsWith("."+FilenameEncoding.URL_FILE_SEPARATOR)) {
    663             removeFilePathPrefix = removeFilePathPrefix.substring(0, removeFilePathPrefix.length()-2); // cut off /. at end     
    664         }
    665         */
    666        
    667656        File file = new File (filename);       
    668657        String filename_url_encoded = fileToURLEncoding(file); // returns a full filepath
    669        
    670         System.err.println("@@@ full url encoded filename: " + filename_url_encoded);
    671658       
    672659        // now lop off the given removeFilePathPrefix that FilenameEncoding.filenameToURLEncoding(STRING) variant would have added
  • main/trunk/gli/src/org/greenstone/gatherer/metadata/MetadataXMLFile.java

    r33744 r33746  
    668668        }
    669669       
    670         String curr_directory_path = FilenameEncoding.fullFilepathToURLEncoding(".");
    671        
    672         //String curr_directory_path = FilenameEncoding.filenameToURLEncoding(".");
    673         //curr_directory_path = curr_directory_path.substring(0, curr_directory_path.length()-2); // cut off /. at end
    674         System.err.println("@@@ curr_directory_path: " + curr_directory_path);
     670        String curr_directory_path = FilenameEncoding.fullFilepathToURLEncoding("."); // returns the curr dir path after removing the /./ at end       
     671        //System.err.println("@@@ curr_directory_path: " + curr_directory_path);
    675672       
    676673        //System.err.println("PARSED loaded_file contains:\n" +  XMLTools.elementToString(doc.getDocumentElement(), true));
     
    689686                    // Reencode filename after parseXML() had the side-effect of decoding entities in filename elements
    690687                   
    691                     System.err.println("Filename before reencoding was: " + filename);                 
     688                    //System.err.println("Filename before reencoding was: " + filename);                   
    692689                   
    693690                    // Can't convert to URI with backslash-escaped chars (backslash used in regexed filename are illegal in URI object
    694691                    // created by filenameToURLEncoding). So replace backslashes in regex with url-encoded hex-value of backslash, %5C.
    695                     String encoded_filename = filename.replace("\\", "%5C");
    696                    
    697                     /*
    698                     encoded_filename = FilenameEncoding.filenameToURLEncoding(encoded_filename);
    699                     // now lop off the metadataxml dir prefix the FilenameEncoding.filenameToURLEncoding(STRING) variant would have added
    700                     encoded_filename = encoded_filename.substring(curr_directory_path.length());
    701                     if (encoded_filename.startsWith(FilenameEncoding.URL_FILE_SEPARATOR)) {
    702                         encoded_filename = encoded_filename.substring(FilenameEncoding.URL_FILE_SEPARATOR.length());
    703                     }
    704                     */
     692                    String encoded_filename = filename.replace("\\", "%5C");
    705693                   
    706694                    // get the URL encoded filename preserving special encodings, with any curr_directory_path prefix removed
     
    708696                   
    709697                    // Reintroduce the backslash characters in place of their %5C hex placeholders
    710                     encoded_filename = encoded_filename.replace("%5C", "\\");               
     698                    encoded_filename = encoded_filename.replace("%5C", "\\");
    711699                   
    712700                    // Update filename element in DOM
    713701                    XMLTools.setElementTextValue(filename_element, encoded_filename);
    714                     System.err.println("Filename after reencoding was: " + encoded_filename);
     702                    //System.err.println("Filename after reencoding was: " + encoded_filename);
    715703                }
    716704            }
Note: See TracChangeset for help on using the changeset viewer.