Changeset 33746

Show
Ignore:
Timestamp:
03.12.2019 17:31:17 (3 days ago)
Author:
ak19
Message:

1. Bugfix for dealing with + in filenames: file-level metadata now sticks and also ends up in doc.xml on build, as should happen. 2. Better (more optimal) bugfix for & in filenames, to get metadata to still stick after yesterday's first bugfix for this. Sadly, the improved code no longer needs the new function I introduced yesterday (escapeAllCharWithHexEntity). Leaving the function in, in case it ever comes in handy or as an idea. 3. Refactoring some code. 4. Removed some debugging statements. But some things are still largely commented out. Will remove hereafter.

Location:
main/trunk/gli/src/org/greenstone/gatherer/metadata
Files:
2 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/gli/src/org/greenstone/gatherer/metadata/FilenameEncoding.java

    r33745 r33746  
    7373     * a file, it means this still needs to be retrieved. */ 
    7474    public static Map map = new HashMap(); 
     75     
     76    /** Compiled pattern for hex entities of characters. These are of the form "&#x....;" with 1 to 4 digits */ 
     77    public static final Pattern HEX_PATTERN = Pattern.compile("(&#x[0-9a-zA-Z]{1,4}+;)"); 
     78     
    7579 
    7680//*********************** BUSY REFRESHING / REQUIRING  REFRESH ********************* 
     
    310314    */ 
    311315    public static String decodeStringContainingHexEntities(String str) { 
    312         String result = ""; 
    313         Pattern hexPattern = Pattern.compile("(&#x[0-9a-zA-Z]{1,4}+;)"); 
    314         Matcher matcher = hexPattern.matcher(str); 
     316        String result = "";      
     317        Matcher matcher = HEX_PATTERN.matcher(str); 
    315318         
    316319        int searchFromIndex = 0; 
     
    327330            String hexNumberStr = hexPart.substring(3, hexPart.length()-1); // lose the "&#x" prefix and the ";" suffix to get just the hex number portion of the match 
    328331            // https://stackoverflow.com/questions/16625865/java-unicode-to-hex-string 
    329             // https://stackoverflow.com/questions/11194513/convert-hex-string-to-int 
    330              
    331             //System.err.println("hexNumberStr so far: " + hexNumberStr);            
     332            // https://stackoverflow.com/questions/11194513/convert-hex-string-to-int            
     333             
     334            //System.err.println("hexNumberStr so far: " + hexNumberStr); 
    332335            hexNumberStr = "0x" + hexNumberStr; // e.g "0xDDDD" 
    333336            //int hexNumber = Integer.parseInt(hexNumberStr); 
     
    388391    */ 
    389392    public static String escapeAllCharWithHexEntity(String str, char CHARACTER/*, String hexCodeString*/) { 
     393         
     394        if(str.indexOf(CHARACTER) == -1) { // nothing to replace, we're done 
     395            return str;          
     396        } 
     397         
    390398        String char_as_string = Character.toString(CHARACTER); 
    391399        String hexCodeString = hexEntityForChar(char_as_string); 
     
    393401        //System.err.println("@@@ hexCodeString for: " + char_as_string + " is: " + hexCodeString); 
    394402         
    395         Pattern hexPattern = Pattern.compile("(&#x[0-9a-zA-Z]{1,4}+;)"); 
    396         Matcher hexPatternMatch = hexPattern.matcher(str);       
    397          
    398         // want to replace all & with &x26; (the hex for ampsersand) IFF the & is not already a hexcode/doesn't already match hexPattern 
     403        Matcher hexPatternMatch = HEX_PATTERN.matcher(str); // looks for a hex entity, which has the pattern "&#x....;" 
     404         
     405        // want to replace all & with &#x26; (the hex entity for ampersand) IFF the & is not already a hexcode/doesn't already match HEX_PATTERN 
    399406        int searchIndex = 0; 
    400407         
     
    423430                     
    424431                    // String has been modified, so have to update Matcher 
    425                     hexPatternMatch = hexPattern.matcher(str); 
     432                    hexPatternMatch = HEX_PATTERN.matcher(str); 
    426433                     
    427434                    if(searchIndex >= str.length()) { 
     
    493500            String filename_ascii = filename_uri.toASCIIString(); 
    494501             
     502            // The URI.toASCIIString() call above only encodes values > 127. 
     503            // But we also need to protect + and & signs in filenames 
     504            filename_ascii = filename_ascii.replace("+", "%252B"); // +'s ASCII code is 43 decimal, 2b in hex, 2B when uppercased 
     505            filename_ascii = filename_ascii.replace("&", "%2526"); // &'s ASCII code is 38 in decimal, and 26 in hex             
     506             
    495507            // Before proceeding, protect & in the filename too. 
    496508            // &'s ASCII code is 36 in decimal, and 26 in hex, so replace with & 
     
    499511            //filename_url_encoded = filename_url_encoded.replace("&", "&x26;");// SO THIS IS BAD 
    500512            //filename_url_encoded = filename_url_encoded.replace("&", hexEntityForChar("&"));// SAME, STILL BAD 
    501             filename_ascii = escapeAllCharWithHexEntity(filename_ascii, '&'); // Good: CAREFULLY replaces & that are not part of hex entities 
    502              
     513            ///filename_ascii = escapeAllCharWithHexEntity(filename_ascii, '&'); // Good: CAREFULLY replaces & that are not part of hex entities             
    503514             
    504515            String filename_raw_bytes = URLDecoder.decode(filename_ascii,"ISO-8859-1");          
    505516            filename_url_encoded = iso_8859_1_filename_to_url_encoded(filename_raw_bytes); 
    506517             
     518            //filename_url_encoded = filename_url_encoded.replace("%2B", "+"); // Don't do this, won't get regex escaped when converted back to a + by caller 
     519            filename_url_encoded = filename_url_encoded.replace("%2B", "+"); // + signs are special, as they will need to be escaped since the caller wants the filename representing a regex 
     520            filename_url_encoded = filename_url_encoded.replace("%26", "&");            
    507521        } 
    508522        catch (Exception e) { 
     
    605619         
    606620        File file = new File (filename); 
    607         //return fileToURLEncoding(file); 
    608621        String filename_url_encoded = fileToURLEncoding(file); 
    609622         
     
    630643        String curr_directory_path = FilenameEncoding.fullFilepathToURLEncoding("."); 
    631644        return filenameToURLEncodingWithPrefixRemoved(filename, curr_directory_path); 
    632          
    633         /* 
    634         String curr_directory_path = FilenameEncoding.fullFilepathToURLEncoding(".");        
    635         if (curr_directory_path.endsWith("."+FilenameEncoding.URL_FILE_SEPARATOR)) { 
    636             curr_directory_path = curr_directory_path.substring(0, curr_directory_path.length()-2); // cut off /. at end             
    637         } 
    638          
    639         File file = new File (filename);         
    640         String filename_url_encoded = fileToURLEncoding(file); // returns a full filepath 
    641          
    642         // now lop off the current dir prefix that FilenameEncoding.filenameToURLEncoding(STRING) variant would have added 
    643         filename_url_encoded = filename_url_encoded.substring(curr_directory_path.length()); 
    644         // remove any remaining slash prefix 
    645         if (filename_url_encoded.startsWith(FilenameEncoding.URL_FILE_SEPARATOR)) { 
    646             filename_url_encoded = filename_url_encoded.substring(FilenameEncoding.URL_FILE_SEPARATOR.length()); 
    647         } 
    648          
    649         return filename_url_encoded; 
    650         */ 
    651645    } 
    652646     
     
    660654        } 
    661655         
    662         /*if (removeFilePathPrefix.endsWith("."+FilenameEncoding.URL_FILE_SEPARATOR)) { 
    663             removeFilePathPrefix = removeFilePathPrefix.substring(0, removeFilePathPrefix.length()-2); // cut off /. at end      
    664         } 
    665         */ 
    666          
    667656        File file = new File (filename);         
    668657        String filename_url_encoded = fileToURLEncoding(file); // returns a full filepath 
    669          
    670         System.err.println("@@@ full url encoded filename: " + filename_url_encoded); 
    671658         
    672659        // now lop off the given removeFilePathPrefix that FilenameEncoding.filenameToURLEncoding(STRING) variant would have added 
  • main/trunk/gli/src/org/greenstone/gatherer/metadata/MetadataXMLFile.java

    r33744 r33746  
    668668        } 
    669669         
    670         String curr_directory_path = FilenameEncoding.fullFilepathToURLEncoding("."); 
    671          
    672         //String curr_directory_path = FilenameEncoding.filenameToURLEncoding("."); 
    673         //curr_directory_path = curr_directory_path.substring(0, curr_directory_path.length()-2); // cut off /. at end 
    674         System.err.println("@@@ curr_directory_path: " + curr_directory_path); 
     670        String curr_directory_path = FilenameEncoding.fullFilepathToURLEncoding("."); // returns the curr dir path after removing the /./ at end         
     671        //System.err.println("@@@ curr_directory_path: " + curr_directory_path); 
    675672         
    676673        //System.err.println("PARSED loaded_file contains:\n" +  XMLTools.elementToString(doc.getDocumentElement(), true)); 
     
    689686                    // Reencode filename after parseXML() had the side-effect of decoding entities in filename elements 
    690687                     
    691                     System.err.println("Filename before reencoding was: " + filename);                   
     688                    //System.err.println("Filename before reencoding was: " + filename);                     
    692689                     
    693690                    // Can't convert to URI with backslash-escaped chars (backslash used in regexed filename are illegal in URI object 
    694691                    // created by filenameToURLEncoding). So replace backslashes in regex with url-encoded hex-value of backslash, %5C. 
    695                     String encoded_filename = filename.replace("\\", "%5C");  
    696                      
    697                     /* 
    698                     encoded_filename = FilenameEncoding.filenameToURLEncoding(encoded_filename); 
    699                     // now lop off the metadataxml dir prefix the FilenameEncoding.filenameToURLEncoding(STRING) variant would have added 
    700                     encoded_filename = encoded_filename.substring(curr_directory_path.length()); 
    701                     if (encoded_filename.startsWith(FilenameEncoding.URL_FILE_SEPARATOR)) { 
    702                         encoded_filename = encoded_filename.substring(FilenameEncoding.URL_FILE_SEPARATOR.length()); 
    703                     } 
    704                     */ 
     692                    String encoded_filename = filename.replace("\\", "%5C"); 
    705693                     
    706694                    // get the URL encoded filename preserving special encodings, with any curr_directory_path prefix removed 
     
    708696                     
    709697                    // Reintroduce the backslash characters in place of their %5C hex placeholders 
    710                     encoded_filename = encoded_filename.replace("%5C", "\\");                
     698                    encoded_filename = encoded_filename.replace("%5C", "\\"); 
    711699                     
    712700                    // Update filename element in DOM 
    713701                    XMLTools.setElementTextValue(filename_element, encoded_filename); 
    714                     System.err.println("Filename after reencoding was: " + encoded_filename); 
     702                    //System.err.println("Filename after reencoding was: " + encoded_filename); 
    715703                } 
    716704            }