Changeset 33757 for main/trunk


Ignore:
Timestamp:
2019-12-07T01:40:13+13:00 (4 years ago)
Author:
ak19
Message:
  1. Windows bugfix for getting exMeta to be loaded into GLI where there are subdirs involved in the Gather pane, or there are non-ASCII filenames, or the file rename method is set to base64. 2. Bugfix for Linux and Windows: Using Base64 to rename files was still a problem despite the previous commit (which was supposed to have fixed all GLI exMeta loading issues on Linux) in the special case where a subfolder was pure ASCII. The perl code wouldn't base64 encode such subdirs. However, GLI won't know which part of a relative file path to decode based on the file rename method used and which parts are not to be decoded. So GLI uniformly decoded them, and ASCII named subfolders that were not base64 encoded (but contained files that were to be renamed with base64) got base64 decoded into garbage, so that exMeta still did not get attached. 3. This commit contains debug stmts.
Location:
main/trunk
Files:
5 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/gli/src/org/greenstone/gatherer/metadata/DocXMLFile.java

    r33756 r33757  
    3434import org.greenstone.gatherer.util.Utility;
    3535
    36 import org.apache.commons.codec.binary.Base64;
    37 
    38 //import org.greenstone.gatherer.feedback.Base64;
     36//import org.greenstone.gatherer.feedback.Base64; // decode() from Base64 didn't work
     37import org.apache.commons.codec.binary.Base64; // decoding from Base64 works
    3938
    4039/** This class represents one doc.xml file */
     
    5857    }
    5958
     59    /**
     60     * Checks if various versions of the file object's filename, denoted relatively by file_relative_path,
     61     * occur in the source_file_name_to_description_elements_mapping map, and returns the first match.
            * Candidate keys are tried in this order: file_relative_path itself; then, on Windows only, the
            * Windows short file name of the file's absolute path with everything up to and including the first
            * "import" (plus one following character) removed; then the short file name's dir path combined with
            * the regular tail name; and finally the regular dir path combined with the short file name's tail name.
            * @param file the file being looked up; its absolute path is used to obtain the Windows short file name
            * @param file_relative_path the regular form of the map key, tried first
            * @return the list stored in the map under the first matching candidate key, or null if none of the
            *         candidates is a key in the map or if obtaining the Windows short file name throws an exception
     62    */
     63    private ArrayList findSourceFileMapKeyMatch(File file, String file_relative_path) {
     64        ArrayList description_elements_list = null;
     65               
     66        System.err.println("Looking for key " + file_relative_path);
     67        description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
     68        if(description_elements_list != null) {
     69            System.err.println("   Found key matching REGULAR filepath: " + file_relative_path);
     70            return description_elements_list;           
     71        }
     72        else if(!Utility.isWindows()) { // not on Windows: there are no short file name variants to try, so we're done
     73            System.err.println("Unable to find meta for regular file path form " + file_relative_path);
     74            return null;
     75        }
     76       
     77        // Now we can try windows short filename as map key
     78       
     79        String win_short_file_relative_path = "";
     80        try{
     81            win_short_file_relative_path = Utility.getWindowsShortFileName(file.getAbsolutePath()); // NOTE(review): if this returns null, the indexOf() call below would throw a NullPointerException - confirm and guard
     82            //System.err.println("@@@ Searching for short file name: " + win_short_file_relative_path);
     83        } catch(Exception e) { // we're done trying to find a matching key
     84            System.err.println("Failed to convert to windows short file name: " + win_short_file_relative_path); // NOTE(review): the variable is still "" when the call above throws, so this message never shows the path that failed
     85            return null;
     86        }
     87       
     88        // Got a windows short file name, lop off import folder again
     89        int import_index = win_short_file_relative_path.indexOf("import"); // NOTE(review): this is the FIRST "import" anywhere in the absolute path, which is not necessarily the import folder - confirm
     90        if (import_index != -1) {
     91            win_short_file_relative_path = win_short_file_relative_path.substring(import_index + "import".length() + 1); // the + 1 also skips the one character following "import" (presumably the file separator)
     92        }
     93           
     94        System.err.println("### Looking for Windows short file name |" + win_short_file_relative_path +  "| in map of sourcefilenames to doc.xml's ex meta.");
     95        description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(win_short_file_relative_path);
     96        if (description_elements_list != null) {
     97            System.err.println("   Found key matching FULL win shortfile path: " + win_short_file_relative_path);
     98            return description_elements_list; // found
     99        }
     100       
     101        // else, check whether a map key is matched by any REMAINING combination of windows shortfile path and regular path:
     102        // - windows shortfilename's rel-dir-path with regular tailname
     103        // - and regular rel-dir-path with windows shortfilename's tailname
     104               
     105        String shortFileTailName = win_short_file_relative_path; // stays the whole path if it contains no file separator
     106        String shortFileRelDirPath = "";
     107        int lastSep = win_short_file_relative_path.lastIndexOf(File.separator);
     108        if(lastSep != -1) {         
     109            shortFileTailName = win_short_file_relative_path.substring(lastSep+1);
     110            shortFileRelDirPath = win_short_file_relative_path.substring(0, lastSep+1); // include the slash
     111        }
     112       
     113        String fileTailName = file_relative_path; // stays the whole path if it contains no file separator
     114        String fileRelDirPath = "";
     115        lastSep = file_relative_path.lastIndexOf(File.separator);
     116        if(lastSep != -1) {         
     117            fileTailName = file_relative_path.substring(lastSep+1);
     118            fileRelDirPath = file_relative_path.substring(0, lastSep+1); // include the slash
     119        }
     120       
     121        String path = shortFileRelDirPath + fileTailName; // first mix: short dir path + regular tail name
     122        System.err.println("### Looking for Windows short file name |" + path +  "| in map of sourcefilenames to doc.xml's ex meta.");
     123        description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(path);
     124       
     125        if(description_elements_list != null) {
     126            System.err.println("   Found key matching MIX of win shortfile path and regular path: " + path);
     127            return description_elements_list; // found
     128        }
     129
     130        // try the other combination
     131        path = fileRelDirPath + shortFileTailName; // second mix: regular dir path + short tail name
     132        System.err.println("### Looking for Windows short file name |" + path +  "| in map of sourcefilenames to doc.xml's ex meta.");
     133        description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(path);
     134       
     135        if(description_elements_list != null) {
     136            System.err.println("   Found key matching MIX of regular path and win shortfile path: " + path);
     137            return description_elements_list; // found
     138        }       
     139       
     140        return description_elements_list; // always null here: none of the candidate keys matched
     141    }
     142   
    60143
    61144    public ArrayList getMetadataExtractedFromFile(File file)
     
    70153    }
    71154
    72     ///for (Object relFilename : source_file_name_to_description_elements_mapping.keySet()) {
    73     ///    System.err.println("@@@ relFilename: " + relFilename);
    74     ///}
     155    for (Object relFilename : source_file_name_to_description_elements_mapping.keySet()) {
     156        System.err.println("\n@@@ relFilename: " + relFilename);
     157    }
    75158   
    76159    // Check whether this file (i.e. doc.xml or docmets.xml on inheritance) file contains extracted metadata for the specified file
    77     ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
     160    //ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
     161    ArrayList description_elements_list = findSourceFileMapKeyMatch(file, file_relative_path);
    78162    if (description_elements_list == null) {
    79         // ...it doesn't
    80         return metadata_values;
     163            // ...it doesn't
     164            System.err.println("Unable to find meta for (regular file path form) " + file_relative_path);
     165            if(Utility.isWindows()) {
     166                System.err.println("    Or for windows shortFile path form, or for combinations with regular file path form");
     167            }
     168            return metadata_values; // we're done
    81169    }
    82170
     
    275363            }
    276364           
    277             ///System.err.println("@@@@ Found gsdlsourcefilename: " + gsdlsourcefilename_value);
     365            System.err.println("@@@@ Found gsdlsourcefilename: " + gsdlsourcefilename_value);
    278366            // Remember this for quick access later
    279367            if (source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value) == null) {
     
    326414        gsdlsourcefilename_value = decodeSourceFilename(gsdlsourcefilename_value, fileRenameMethod, is_unix_path);
    327415        source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, value_list);
    328         }
     416        }       
    329417    }
    330418    catch (FileNotFoundException exception) {
     
    379467        // General info: https://stackoverflow.com/questions/43089541/difference-between-basic-and-url-base64-encoding-in-java-8
    380468        byte[] bytes = Base64.decodeBase64(importFilePathParts[i].getBytes());
    381         ///System.err.println("Got base64 string: " + importFilePathParts[i]);
    382         ///System.err.println("Decoded from base64 to bytes: " + bytes);
     469        System.err.println("Got base64 string: " + importFilePathParts[i]);
     470        System.err.println("Decoded from base64 to bytes: " + new String(bytes, System.getProperty("file.encoding")));
    383471        // Using system file.encoding to interpret the resulting bytestring as a String,
    384472        // just as we always did with URL decoding method
     
    397485    decoded_gsdlsourcefilename += file_ext;
    398486   
    399     ///System.err.println("@@@@ decoded_gsdlsourcefilename: " + Utility.debugUnicodeString(decoded_gsdlsourcefilename));
     487    System.err.println("@@@@ decoded_gsdlsourcefilename: " + Utility.debugUnicodeString(decoded_gsdlsourcefilename));
    400488
    401489    return decoded_gsdlsourcefilename;
  • main/trunk/gli/src/org/greenstone/gatherer/util/Utility.java

    r33729 r33757  
    114114     
    115115      return result;
     116    }
     117   
     118    /**
     119     * Returns the Windows short file name (8.3) for a file by delegating to getMSDOSName().
     120     * The earlier approach, left commented out in the method body, called WindowsNativeFunctions.getEightPointThree(), taken from http://dolf.trieschnigg.nl/eightpointthree/eightpointthree.html
     121     * which uses the non-JNI NativeCall jar file for which WindowsNativeFunctions imports com.eaio.nativecall.*
     122     *
     123     * When not running on Windows, the longFileName parameter is returned unchanged.
     124     *
     125     * @param longFileName - must be the full path to an actual existing file
     126     * @return a string with the short filename, or null if getMSDOSName() returns null;
     127     *         on non-Windows systems, longFileName itself.
             * @throws Exception declared broadly; getMSDOSName() itself declares IOException and InterruptedException
     128     */
     129    public static String getWindowsShortFileName(String longFileName) throws Exception {
     130        if(!Utility.isWindows()) {
     131            return longFileName;
     132        } else {
     133            //return WindowsNativeFunctions.getEightPointThree(longFileName);
     134            return getMSDOSName(longFileName);
     135        }
     136    }
     137   
     138    /** 
     139     * getMSDOSName() and its helper function getAbsolutePath(fileName)
     140     * are from https://stackoverflow.com/questions/18893284/how-to-get-short-filenames-in-windows-using-java
     141     * getMSDOSName() modified to use our SafeProcess class.
     142     *
     143     * @param fileName - the regular fileName to be converted. Must be the full path to an actual existing file
     144     * @return Windows shortfile name for the fileName parameter given, with CR-LF pairs removed from the process output;
             *         null if the process returns a non-zero value or its standard output is null.
     145     */
     146    public static String getMSDOSName(String fileName)
     147        throws IOException, InterruptedException {
     148
     149        /*
             Original stackoverflow version based on Runtime.exec(), kept for reference only:

     150        String path = getAbsolutePath(fileName);
     151       
     152        changed "+ fileName.toUpperCase() +" to "path"
     153        Process process =
     154            Runtime.getRuntime().exec(
     155                "cmd /c for %I in (\"" + path + "\") do @echo %~fsI");
     156       
     157        process.waitFor();
     158       
     159        byte[] data = new byte[65536];
     160        int size = process.getInputStream().read(data);
     161
     162        if (size <= 0) {
     163            return null;
     164        }
     165
     166        return new String(data, 0, size).replaceAll("\\r\\n", "");
     167        */
     168        String path = getAbsolutePath(fileName);
     169       
     170        SafeProcess process = new SafeProcess("cmd /c for %I in (\"" + path + "\") do @echo %~fsI"); // %~fsI: cmd's "for" modifiers f (fully qualified path) and s (short names only). NOTE(review): path is concatenated into the command line unescaped - confirm it cannot contain a double quote
     171        int returnVal = process.runProcess();
     172        if(returnVal != 0) { // a non-zero return value from runProcess() is treated as failure
     173            return null;
     174        }
     175
     176        String data = process.getStdOutput();
     177        if(data == null) {
     178            return null;           
     179        }
     180        else return data.replaceAll("\\r\\n", ""); // removes CR-LF pairs only; NOTE(review): a lone "\n" in the output would be left in place
     181    }
     182    public static String getAbsolutePath(String fileName) // helper for getMSDOSName(): returns the canonical path for fileName, ending in a file separator when it is a directory
     183        throws IOException {
     184        File file = new File(fileName);
     185        String path = file.getAbsolutePath();
     186
     187        if (file.exists() == false) // not found as given: continue with a File built from the absolute path instead
     188            file = new File(path);
     189
     190        path = file.getCanonicalPath(); // the declared IOException comes from this call
     191
     192        if (file.isDirectory() && (path.endsWith(File.separator) == false)) // make sure directory paths end with the file separator
     193            path += File.separator;
     194
     195        return path;
    116196    }
    117197   
  • main/trunk/greenstone2/perllib/doc.pm

    r33756 r33757  
    298298    my @dirs = ($sep eq "\\") ? split(/[\\\/]+/, $dirname) : split(/\//, $dirname);
    299299   
    300     my $dirpath = "";   
     300    my $dirpath = "";
     301   
     302    # Don't encode the first folder ("import" or "tmp"): GLI's DocXMLFile.java looks for the literal "import"
     303    # or "tmp" before it knows which file rename method is to be used to decode the rest of gsdlsourcefilename.
     304    if(scalar (@dirs) > 1) {
     305        $dirpath = shift(@dirs);
     306    }
    301307    foreach my $subdir (@dirs) {
    302     ##print STDERR "@@@@ Found subdir: $subdir\n";
    303    
    304     # The import folder can be called anything, including in non-ASCII encodings.
    305     # Don't need to avoid encoding default import folder called "import", as it gets
    306     # URL/base64 encoded to ITSELF.
    307     # But can't encode (URL/base64 encode) any $ENV{'GSDLIMPORTDIR'} though if it's set,
    308     # as BasePlugout::get_doc_dir() removes any $ENV{'GSDLIMPORTDIR'} prefix. Because if
    309     # the $ENV{'GSDLIMPORTDIR'} part of gsdlsourcefilename is encoded here, the prefix
    310     # won't match with $ENV{'GSDLIMPORTDIR'}
    311 
    312     unless ($ENV{'GSDLIMPORTDIR'} && $subdir eq $ENV{'GSDLIMPORTDIR'}) {
    313         $subdir = &util::rename_file($subdir, $rename_method);
    314     }
     308    print STDERR "@@@@ Found subdir: $subdir\n";
     309   
     310    $subdir = &util::rename_file($subdir, $rename_method);
     311    #print STDERR "@@@@ encoded subdir: $subdir\n";
     312   
    315313    $dirpath = &FileUtils::filenameConcatenate($dirpath, $subdir);
    316314    }
  • main/trunk/greenstone2/perllib/unicode.pm

    r33299 r33757  
    697697}
    698698
     699# Base64 encoding does not encode a pure ASCII string to itself. This is important to know.
     700# If the $force_encode parameter is true, then this method WILL base64 encode whatever
     701# string is passed in, including any plain ASCII string.
     702# That means this method could double encode an already encoded string.
     703# However, this method is necessary because on the GLI end, we can't detect whether a plain
     704# ASCII string has been encoded or not. And if gsdlsourcefilerenamemethod is set to base64,
     705# then gli will always attempt to decode either all parts of the relative path gsdlsourcefilename
     706# (except the "import" prefix, which is special) or none of the parts.
     707sub force_base64_encode {
     708    my ($text) = @_;
     709    my $force_encode = 1; # always true here, so base64_encode() skips its conforms_to_mod_base64() check
     710    return &base64_encode($text, $force_encode);
     711}
     712
    699713sub base64_encode {
    700     my ($text) = @_;
    701     if(!&conforms_to_mod_base64($text)) {
     714    my ($text, $force_encode) = @_;
     715    if($force_encode || !&conforms_to_mod_base64($text)) {
    702716    # return entity for underscore to underscore before encoding
    703717    $text =~ s/&\#095;/_/g;
  • main/trunk/greenstone2/perllib/util.pm

    r33013 r33757  
    12161216    }
    12171217    elsif ($rename_method eq "base64") {
    1218     $tailname = &unicode::base64_encode($tailname);
     1218    $tailname = &unicode::force_base64_encode($tailname);
    12191219    $tailname =~ s/\s*//sg;      # for some reason it adds spaces not just at end but also in middle
    12201220    }
Note: See TracChangeset for help on using the changeset viewer.