Changeset 33756 for main


Ignore:
Timestamp:
2019-12-05T21:58:02+13:00 (4 years ago)
Author:
ak19
Message:

Attempted bugfix for ex meta not always loading in GLI for docs that are in subdirs when filenames are base64 encoded. This commit has only been tested on linux, where it works for my basic tests with and without subdirs. 1. Perl now encodes all subdirs and the filename in gsdlsourcefilename (but, as before, not the file extension). It can't encode the entire relative path starting with import in one go, as other parts of the perl code do comparisons and remove GSDLIMPORTDIR prefixes from file paths. 2. Perl now also writes out the file rename method used, which can be none, url or base64, into doc.xml. 3. GLI now decodes each part of the gsdlsourcefilename relative path based on the file rename method. E.g. for import/subdir/filename.ext, the import, subdir and filename parts are each decoded to reconstitute the filename as it originally was, with the file extension stuck back on. This has allowed GLI to finally detect the ex meta associated with a gsdlsourcefilename in cases of subdirs in import or when dealing with base64 encoded filenames. Still need to test more complex cases on linux, and then on windows too.

Location:
main/trunk
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/gli/src/org/greenstone/gatherer/metadata/DocXMLFile.java

    r23763 r33756  
    3434import org.greenstone.gatherer.util.Utility;
    3535
     36import org.apache.commons.codec.binary.Base64;
     37
     38//import org.greenstone.gatherer.feedback.Base64;
    3639
    3740/** This class represents one doc.xml file */
     
    4346    protected final String MetadataWrap;
    4447    protected final String MetadataItem;
     48
     49    protected final String FILE_RENAME_METHOD_NONE = "none";
     50    protected final String FILE_RENAME_METHOD_URL = "url";
     51    protected final String FILE_RENAME_METHOD_BASE64 = "base64";
    4552
    4653    public DocXMLFile(String doc_xml_file_path, String metaWrap, String metaItem)
     
    6370    }
    6471
     72    ///for (Object relFilename : source_file_name_to_description_elements_mapping.keySet()) {
     73    ///    System.err.println("@@@ relFilename: " + relFilename);
     74    ///}
     75   
    6576    // Check whether this file (i.e. doc.xml or docmets.xml on inheritance) contains extracted metadata for the specified file
    6677    ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
     
    175186    public void skimFile()
    176187    {
     188    String fileRenameMethod = null;
     189    String gsdlsourcefilename_value = null;
     190    boolean is_unix_path = false;
     191       
    177192    MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
    178193
     
    227242        // Extracted metadata! May have ex. so make sure we remove that
    228243        String metadata_element_name = MetadataTools.getMetadataElementName(metadata_element_name_full);
     244        if(metadata_element_name.equals("gsdlsourcefilerenamemethod")) {
     245            // Extract the element value
     246            int value_index = line.indexOf(">", name_index) + ">".length();
     247            fileRenameMethod = line.substring(value_index, line.indexOf("<", value_index));         
     248        }
     249       
    229250        // Note which file this is for
    230         if (metadata_element_name.equals("gsdlsourcefilename")) {
     251        else if (metadata_element_name.equals("gsdlsourcefilename")) {
    231252            // Extract the gsdlsourcefilename element value
    232253            int value_index = line.indexOf(">", name_index) + ">".length();
    233             String gsdlsourcefilename_value = line.substring(value_index, line.indexOf("<", value_index));
     254            gsdlsourcefilename_value = line.substring(value_index, line.indexOf("<", value_index));
    234255
    235256            // We're only interested in the path relative to the import folder
     
    238259            gsdlsourcefilename_value = gsdlsourcefilename_value.substring(import_index + "import".length());
    239260
    240             boolean is_unix_path = gsdlsourcefilename_value.startsWith("/");
     261            is_unix_path = gsdlsourcefilename_value.startsWith("/");
    241262            gsdlsourcefilename_value = gsdlsourcefilename_value.substring(1);
    242263
    243             // URL decode gsdlsourcefilename. Need to set the decoder to use the default system encoding
    244             // This is stored in the System's file.encoding property.
    245             gsdlsourcefilename_value = URLDecoder.decode(gsdlsourcefilename_value, System.getProperty("file.encoding"));
     264            // (Will decode gsdlsourcefilename at end of this method, once we know
     265            // for certain the fileRenameMethod that was used to encode it.)
    246266
    247267            // Make sure the path matches the OS that is running
     
    254274                gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/");
    255275            }
    256 
     276           
     277            ///System.err.println("@@@@ Found gsdlsourcefilename: " + gsdlsourcefilename_value);
    257278            // Remember this for quick access later
    258279            if (source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value) == null) {
     
    289310
    290311        buffered_reader.close();
     312
     313        // Now that we're done skimming, we actually need to decode gsdlsourcefilename
     314        // based on whatever fileRenameMethod was used to encode it, so that we can
     315        // at last compare properly against filenames on the file system
     316        // in order to load the correct ex.meta for the file.
     317        // Now that we should have both gsdlsourcefilename AND fileRenameMethod set,
     318        // we can finally perform the decoding of gsdlsourcefilename.       
     319        if(fileRenameMethod == null) {
     320        fileRenameMethod = FILE_RENAME_METHOD_URL; // default for building
     321        }
     322        // If gsdlsourcefilename was encoded, we remove it from the map under its encoded
     323        // filename, decode it and add it back into map using its decoded filename.
     324        if(!fileRenameMethod.equals(FILE_RENAME_METHOD_NONE)) {     
     325        ArrayList value_list = (ArrayList) source_file_name_to_description_elements_mapping.remove(gsdlsourcefilename_value);       
     326        gsdlsourcefilename_value = decodeSourceFilename(gsdlsourcefilename_value, fileRenameMethod, is_unix_path);
     327        source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, value_list);
     328        }
    291329    }
    292330    catch (FileNotFoundException exception) {
     
    295333    catch (IOException exception) {
    296334        DebugStream.printStackTrace(exception);
    297     }
     335    } catch (Exception exception) { // e.g. exception decoding gsdlsourcefilename
     336        DebugStream.printStackTrace(exception);
     337    }
    298338    }
    299 
    300 
     339   
     340    protected String decodeSourceFilename(String relative_sourcefile_path,
     341                      String encodingMethod, boolean is_unix_path)
     342    throws Exception
     343    {   
     344
     345    ///System.err.println("*** relative_sourcefile_path: " + relative_sourcefile_path);
     346
     347    // First get the file extension. Both in Base64 and URL encoded strings,
     348    // the full-stop character (.) doesn't get encoded.
     349    // That means getting the file extension is straightforward.
     350   
     351    // Valid base64: "The 64 characters (hence the name Base64) are 10 digits,
     352    // 26 lowercase characters, 26 uppercase characters as well as the
     353    // Plus sign (+) and the Forward Slash (/).
     354    int fullstop = relative_sourcefile_path.indexOf(".");
     355    String file_ext = "";
     356    if(fullstop != -1) {
     357        file_ext = relative_sourcefile_path.substring(fullstop);
     358        relative_sourcefile_path = relative_sourcefile_path.substring(0, fullstop);
     359    }
     360   
     361    String[] importFilePathParts = DocXMLFile.getFilePathParts(relative_sourcefile_path, is_unix_path);
     362   
     363    String decoded_gsdlsourcefilename = "";
     364
     365    String separator = is_unix_path ? "/" : "\\";
     366    for(int i = 0; i < importFilePathParts.length; i++) {
     367        String decoded_filePathPart = "";
     368        if(encodingMethod.equals(FILE_RENAME_METHOD_URL)) {
     369        // URL decode each part of gsdlsourcefilename.
     370        // Need to set the decoder to use the default system encoding
     371        // This is stored in the System's file.encoding property.
     372        decoded_filePathPart = URLDecoder.decode(importFilePathParts[i], System.getProperty("file.encoding"));
     373        }
     374        else{ // if(encodingMethod.equals(FILE_RENAME_METHOD_BASE64)) {
     375        // Decoding with org.greenstone.gatherer.feedback.Base64 didn't work
     376        //byte[] bytes = org.greenstone.gatherer.feedback.Base64.decode(importFilePathParts[i]);
     377        // Using org.apache.commons.codec.binary.Base64 instead
     378        // https://commons.apache.org/proper/commons-codec/archives/1.7/apidocs/org/apache/commons/codec/binary/Base64.html
     379        // General info: https://stackoverflow.com/questions/43089541/difference-between-basic-and-url-base64-encoding-in-java-8
     380        byte[] bytes = Base64.decodeBase64(importFilePathParts[i].getBytes());
     381        ///System.err.println("Got base64 string: " + importFilePathParts[i]);
     382        ///System.err.println("Decoded from base64 to bytes: " + bytes);
     383        // Using system file.encoding to interpret the resulting bytestring as a String,
     384        // just as we always did with URL decoding method
     385        decoded_filePathPart = (bytes == null) ? importFilePathParts[i] : new String(bytes, System.getProperty("file.encoding"));
     386        }       
     387       
     388        if(i == 0) {
     389        decoded_gsdlsourcefilename = decoded_filePathPart;
     390        } else {
     391        decoded_gsdlsourcefilename = decoded_gsdlsourcefilename + separator + decoded_filePathPart;
     392        }
     393        ///System.err.println("Built up: " + decoded_gsdlsourcefilename);
     394    }
     395
     396    // add the file extension back in
     397    decoded_gsdlsourcefilename += file_ext;
     398   
     399    ///System.err.println("@@@@ decoded_gsdlsourcefilename: " + Utility.debugUnicodeString(decoded_gsdlsourcefilename));
     400
     401    return decoded_gsdlsourcefilename;
     402    }   
     403   
     404    /**
     405     * Given a filepath, returns the parts between each file separator as an array.
     406     * For example, "/Users/me/pinky.txt" should return {"Users", "me", "pinky.txt"};
     407     */
     408    private static String[] getFilePathParts(String filepath, boolean is_unix_path) {   
     409    StringTokenizer tok;
     410    if(is_unix_path) {
     411        tok = new StringTokenizer(filepath, "/");           
     412    } else {
     413        tok = new StringTokenizer(filepath, "\\");         
     414    }
     415    String[] parts;
     416    int count = tok.countTokens();     
     417    if(count <= 0) {
     418        parts = new String[]{filepath};         
     419    } else {
     420        int i = 0;
     421        parts = new String[count];
     422        while(tok.hasMoreTokens()) {
     423        parts[i] = tok.nextToken();
     424        //System.err.println("Next part: " + parts[i]);
     425        i++;
     426        }       
     427    }
     428    return parts;       
     429    }
     430   
    301431    /*
    302432    public ArrayList getMetadataExtractedFromFile(File file)
  • main/trunk/greenstone2/perllib/doc.pm

    r33416 r33756  
    261261   
    262262#    print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
     263##    print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename\n";
    263264
    264265    # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
     
    270271#    $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
    271272#    print STDERR "$source_filename\n";
    272    
     273
     274##    print STDERR "AFTER URL/base64 encoding gsdlsourcefilename: $source_filename\n";
     275   
    273276    $self->set_utf8_metadata_element ($self->get_top_section(),
    274277                 "gsdlsourcefilename",
    275278                 $source_filename);
     279
     280    # If we set the file renaming method, at least GLI can decode gsdlsourcefilename to get the
     281    # original filename (relative path to file) back for GLI to notice what ex Meta is associated with it
     282    $self->set_utf8_metadata_element($self->get_top_section(),
     283                     "gsdlsourcefilerenamemethod",
     284                     $rename_method);
    276285}
    277286
     
    284293#    print STDERR "-> $srcfilename -> ";
    285294    $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
     295
     296    # encode any subdirs of "import" also, but not import itself
     297    my $sep = ($ENV{'GSDLOS'} =~ /^windows$/i) ? "\\" : "/";   
     298    my @dirs = ($sep eq "\\") ? split(/[\\\/]+/, $dirname) : split(/\//, $dirname);
     299   
     300    my $dirpath = "";   
     301    foreach my $subdir (@dirs) {
     302    ##print STDERR "@@@@ Found subdir: $subdir\n";
     303   
     304    # The import folder can be called anything, including in non-ASCII encodings.
     305    # Don't need to avoid encoding default import folder called "import", as it gets
     306    # URL/base64 encoded to ITSELF.
     307    # But can't encode (URL/base64 encode) any $ENV{'GSDLIMPORTDIR'} though if it's set,
     308    # as BasePlugout::get_doc_dir() removes any $ENV{'GSDLIMPORTDIR'} prefix. Because if
     309    # the $ENV{'GSDLIMPORTDIR'} part of gsdlsourcefilename is encoded here, the prefix
     310    # won't match with $ENV{'GSDLIMPORTDIR'}
     311
     312    unless ($ENV{'GSDLIMPORTDIR'} && $subdir eq $ENV{'GSDLIMPORTDIR'}) {
     313        $subdir = &util::rename_file($subdir, $rename_method);
     314    }
     315    $dirpath = &FileUtils::filenameConcatenate($dirpath, $subdir);
     316    }
     317    if($dirpath ne "") {
     318    $dirname = $dirpath;
     319    }
     320   
    286321    $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
    287322
Note: See TracChangeset for help on using the changeset viewer.