Changeset 34394

Show
Ignore:
Timestamp:
15.09.2020 20:26:19 (8 days ago)
Author:
ak19
Message:

Bugfix 1 for GLI metadata slowdown: selecting multiple Gathered files in GLI became very slow. Kathy and Dr Bainbridge had tracked this down to code I had added to support non-basic-ASCII filenames in GLI, which was making an expensive Windows operating system function call for each selected file, launching a Java Process for each. The speed of selecting multiple files is now back to being almost as fast as in 3.09. Tested on Windows and Linux. Had to treat Windows as a special case because I can't get the code modifications to work on Linux: the perl code stores a hex-encoded string for the filename, which GLI now uses when the OS is Windows and compares against the hex-encoded name of a selected file. But on Linux the hex-encoded value generated by perl is not the same as that which Java generates and, after trying repeatedly, I've not been able to get it to work. So the code behaves as before for Linux.

Location:
main/trunk
Files:
4 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/gli/src/org/greenstone/gatherer/metadata/DocXMLFile.java

    r33758 r34394  
    4141public abstract class DocXMLFile extends File 
    4242{ 
     43    static boolean isWin = Utility.isWindows(); 
     44    // For Linux, we continue using gsdlsourcefilename as key to the metadata mapping 
     45    // For Windows, we use the hex encoded long file paths as key 
     46    static String GSDL_SOURCE_FILE_METANAME = isWin ? "gsdlfullsourcepath" : "gsdlsourcefilename"; 
     47     
    4348    protected HashMap source_file_name_to_description_elements_mapping = new HashMap(); 
    44  
     49     
    4550    protected final String MetadataWrap; 
    4651    protected final String MetadataItem; 
     
    5762    } 
    5863 
    59     /** 
    60      * Checks if various versions of the file object's filename, denoted relatively by file_relative_path, 
    61      * occur in the source_file_name_to_description_elements_mapping map 
    62     */ 
    63     private ArrayList findSourceFileMapKeyMatch(File file, String file_relative_path) { 
    64         ArrayList description_elements_list = null; 
    65                  
    66         ///System.err.println("Looking for key " + file_relative_path); 
    67         description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path); 
    68         if(description_elements_list != null) { 
    69             ///System.err.println("   Found key matching REGULAR filepath: " + file_relative_path); 
    70             return description_elements_list;            
    71         } 
    72         else if(!Utility.isWindows()) { // couldn't find a matching key, we're done 
    73             ///System.err.println("Unable to find ex.meta for regular file path form " + file_relative_path); 
    74             return null; 
    75         } 
    76          
    77         // Now we can try windows short filename as map key 
    78          
    79         String win_short_file_relative_path = ""; 
    80         try{ 
    81             win_short_file_relative_path = Utility.getWindowsShortFileName(file.getAbsolutePath());              
    82             //System.err.println("@@@ Searching for short file name: " + win_short_file_relative_path); 
    83         } catch(Exception e) { // we're done trying to find a matching key 
    84             System.err.println("Failed to convert to windows short file name: " + win_short_file_relative_path);             
    85             return null; 
    86         } 
    87          
    88         // Got a windows short file name, lop off import folder again 
    89         int import_index = win_short_file_relative_path.indexOf("import"); 
    90         if (import_index != -1) { 
    91             win_short_file_relative_path = win_short_file_relative_path.substring(import_index + "import".length() + 1); 
    92         } 
    93              
    94         ///System.err.println("### Looking for Windows short file name |" + win_short_file_relative_path +  "| in map of sourcefilenames to doc.xml's ex meta."); 
    95         description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(win_short_file_relative_path); 
    96         if (description_elements_list != null) { 
    97             ///System.err.println("   Found key matching FULL win shortfile path: " + win_short_file_relative_path); 
    98             return description_elements_list; // found 
    99         } 
    100          
    101         // else, check whether a map key is matched by any REMAINING combination of windows shortfile path and regular path: 
    102         // - windows shortfilename's rel-dir-path with regular tailname 
    103         // - and regular rel-dir-path with windows shortfilename's tailname 
    104                  
    105         String shortFileTailName = win_short_file_relative_path; 
    106         String shortFileRelDirPath = ""; 
    107         int lastSep = win_short_file_relative_path.lastIndexOf(File.separator); 
    108         if(lastSep != -1) {          
    109             shortFileTailName = win_short_file_relative_path.substring(lastSep+1); 
    110             shortFileRelDirPath = win_short_file_relative_path.substring(0, lastSep+1); // include the slash 
    111         } 
    112          
    113         String fileTailName = file_relative_path; 
    114         String fileRelDirPath = ""; 
    115         lastSep = file_relative_path.lastIndexOf(File.separator); 
    116         if(lastSep != -1) {          
    117             fileTailName = file_relative_path.substring(lastSep+1); 
    118             fileRelDirPath = file_relative_path.substring(0, lastSep+1); // include the slash 
    119         } 
    120          
    121         String path = shortFileRelDirPath + fileTailName; 
    122         ///System.err.println("### Looking for Windows short file name |" + path +  "| in map of sourcefilenames to doc.xml's ex meta."); 
    123         description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(path); 
    124          
    125         if(description_elements_list != null) { 
    126             ///System.err.println("   Found key matching MIX of win shortfile path and regular path: " + path); 
    127             return description_elements_list; // found 
    128         } 
    129  
    130         // try the other combination 
    131         path = fileRelDirPath + shortFileTailName; 
    132         ///System.err.println("### Looking for Windows short file name |" + path +  "| in map of sourcefilenames to doc.xml's ex meta."); 
    133         description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(path); 
    134          
    135         if(description_elements_list != null) { 
    136             ///System.err.println("   Found key matching MIX of regular path and win shortfile path: " + path); 
    137             return description_elements_list; // found 
    138         }        
    139          
    140         // could not find gsdlsourcefilename in map 
    141         ///System.err.println("Unable to find ex.meta for regular file path form " + file_relative_path); 
    142         ///System.err.println("    Or for windows shortFile path form, or for combinations with regular file path form");        
    143          
    144         return description_elements_list; // returns null at this point 
    145     } 
    146      
    147  
    148     public ArrayList getMetadataExtractedFromFile(File file) 
     64    /** On Windows, file_relative_path will be hex-encoded for codepts beyond ASCII. 
     65     * But keys into the source_file_name_to_description_elements_mapping will then also match on Windows */ 
     66    public ArrayList getMetadataExtractedFromFile(File file, String file_relative_path)                                              
    14967    { 
    15068    // Build up a list of metadata extracted from this file 
    15169    ArrayList metadata_values = new ArrayList(); 
    152  
    153     String file_relative_path = file.getAbsolutePath(); 
    154     int import_index = file_relative_path.indexOf("import"); 
    155     if (import_index != -1) { 
    156         file_relative_path = file_relative_path.substring(import_index + "import".length() + 1); 
    157     } 
    158  
     70     
    15971    ///for (Object relFilename : source_file_name_to_description_elements_mapping.keySet()) { 
    16072    ///    System.err.println("\n@@@ relFilename: " + relFilename); 
     
    16274     
    16375    // Check whether this file (i.e. doc.xml or docmets.xml on inheritance) file contains extracted metadata for the specified file 
    164     //ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path); 
    165     ArrayList description_elements_list = findSourceFileMapKeyMatch(file, file_relative_path); 
     76    ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path); 
    16677    if (description_elements_list == null) { 
    16778            // ...it doesn't 
     79            ///System.err.println("Unable to find meta for file path form " + file_relative_path); 
    16880            return metadata_values; // we're done 
    16981    } 
     
    337249         
    338250        // Note which file this is for 
    339         else if (metadata_element_name.equals("gsdlsourcefilename")) { 
     251        //else if (metadata_element_name.equals("gsdlsourcefilename")) {     
     252        else if (metadata_element_name.equals(GSDL_SOURCE_FILE_METANAME)) { 
     253            // On Unix, GSDL_SOURCE_FILE_METANAME is the gsdlsourcefilename metadata field 
     254            // which may be encoded by the encoding denoted in fileRenameMethod (and will need decoding) 
     255            // On Windows, GSDL_SOURCE_FILE_METANAME is a different metadata field that 
     256            // will be hex encoded for non-ASCII chars 
     257             
    340258            // Extract the gsdlsourcefilename element value 
    341259            int value_index = line.indexOf(">", name_index) + ">".length(); 
     
    354272 
    355273            // Make sure the path matches the OS that is running 
    356             if (is_unix_path && Utility.isWindows()) { 
     274            if (is_unix_path && isWin) { 
    357275                // Convert path from Unix to Windows 
    358276                gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\/", "\\\\"); 
    359277            } 
    360             else if (!is_unix_path && !Utility.isWindows()) { 
     278            else if (!is_unix_path && !isWin) { 
    361279                // Convert path from Windows to Unix 
    362280                gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/"); 
     
    369287            } 
    370288 
     289            // Would be better to store hex src file name decoded? But how do we know what encoding the filename is in 
     290            // https://stackoverflow.com/questions/13990941/how-to-convert-hex-string-to-java-string 
     291             
     292             
    371293            ((ArrayList) source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value)).add(new Integer(description_element_start)); 
    372294            } 
     
    383305            } 
    384306        } 
    385  
     307         
    386308        // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata) 
    387309        if (metadata_element_name.startsWith("gsdl")) { 
     
    399321        buffered_reader.close(); 
    400322 
    401         // Now that we're done skimming, we actually need to decode gsdlsourcefilename 
    402         // based on whatever fileRenameMethod was used to encode it, so that we can 
    403         // at last properly compare properly against filenames on the file system 
    404         // in order to load the correct ex.meta for the file. 
    405         // Now that we should have both gsdlsourcefilename AND fileRenameMethod set, 
    406         // we can finally perform the decoding of gsdlsourcefilename.        
    407         if(fileRenameMethod == null) { 
    408         fileRenameMethod = FILE_RENAME_METHOD_URL; // default for building 
     323        // ON WINDOWS, we're working with hex encoded full file path instead of with gsdlsourcefilename, 
     324        // so needn't bother decoding gsdlsourcefilename as it's unused. 
     325        // On UNIX, continue decoding gsdlsourcefilename as before 
     326        if(!isWin) { 
     327        // Now that we're done skimming, we actually need to decode gsdlsourcefilename 
     328        // based on whatever fileRenameMethod was used to encode it, so that we can 
     329        // at last properly compare properly against filenames on the file system 
     330        // in order to load the correct ex.meta for the file. 
     331        // Now that we should have both gsdlsourcefilename AND fileRenameMethod set, 
     332        // we can finally perform the decoding of gsdlsourcefilename.        
     333        if(fileRenameMethod == null) { 
     334            fileRenameMethod = FILE_RENAME_METHOD_URL; // default for building 
     335        } 
     336         
     337        // If gsdlsourcefilename was encoded, we remove it from the map under its encoded 
     338        // filename, decode it and add it back into map using its decoded filename. 
     339        if(!fileRenameMethod.equals(FILE_RENAME_METHOD_NONE)) { 
     340            ArrayList value_list = (ArrayList) source_file_name_to_description_elements_mapping.remove(gsdlsourcefilename_value); 
     341            gsdlsourcefilename_value = decodeSourceFilename(gsdlsourcefilename_value, fileRenameMethod, is_unix_path); 
     342            source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, value_list); 
     343        } 
    409344        } 
    410         // If gsdlsourcefilename was encoded, we remove it from the map under its encoded 
    411         // filename, decode it and add it back into map using its decoded filename. 
    412         if(!fileRenameMethod.equals(FILE_RENAME_METHOD_NONE)) {      
    413         ArrayList value_list = (ArrayList) source_file_name_to_description_elements_mapping.remove(gsdlsourcefilename_value);        
    414         gsdlsourcefilename_value = decodeSourceFilename(gsdlsourcefilename_value, fileRenameMethod, is_unix_path); 
    415         source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, value_list); 
    416         }        
     345         
    417346    } 
    418347    catch (FileNotFoundException exception) { 
     
    708637 
    709638            // Make sure the path matches the OS that is running 
    710             if (is_unix_path && Utility.isWindows()) { 
     639            if (is_unix_path && isWin) { 
    711640                // Convert path from Unix to Windows 
    712641                gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\/", "\\\\"); 
    713642            } 
    714             else if (!is_unix_path && !Utility.isWindows()) { 
     643            else if (!is_unix_path && !isWin) { 
    715644                // Convert path from Windows to Unix 
    716645                gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/"); 
  • main/trunk/gli/src/org/greenstone/gatherer/metadata/DocXMLFileManager.java

    r17101 r34394  
    3131import java.util.*; 
    3232import org.greenstone.gatherer.DebugStream; 
    33  
     33import org.greenstone.gatherer.util.Utility; 
    3434 
    3535/** This class is a static class that manages the doc.xml files */ 
     
    3737{ 
    3838    static private ArrayList doc_xml_files = new ArrayList(); 
    39  
    4039 
    4140    static public void clearDocXMLFiles() 
     
    4746    static public ArrayList getMetadataExtractedFromFile(File file) 
    4847    { 
     48    // Work out relative file path and its hex encoded value here, 
     49    // avoids making DocXMLFile.java recalculate these each time 
     50    String file_relative_path = file.getAbsolutePath(); 
     51    int import_index = file_relative_path.indexOf("import"); 
     52    if (import_index != -1) { 
     53        file_relative_path = file_relative_path.substring(import_index + "import".length() + 1); 
     54    } 
     55    String searchFileName = DocXMLFile.isWin ? Utility.stringToHex(file_relative_path) : file_relative_path; 
     56         
    4957    // Build up a list of metadata values extracted from this file 
    5058    ArrayList metadata_values = new ArrayList(); 
     
    5361    for (int i = 0; i < doc_xml_files.size(); i++) { 
    5462        DocXMLFile doc_xml_file = (DocXMLFile) doc_xml_files.get(i); 
    55         metadata_values.addAll(doc_xml_file.getMetadataExtractedFromFile(file)); 
     63        ///System.err.println("@@@@ Looking at doc.xml file: " + doc_xml_files.get(i)); 
     64        metadata_values.addAll(doc_xml_file.getMetadataExtractedFromFile(file, searchFileName)); 
    5665    } 
    5766 
  • main/trunk/gli/src/org/greenstone/gatherer/util/Utility.java

    r33777 r34394  
    9494    }    
    9595    } 
    96  
    97     // Copied from GS3 main java code at GSDL3SRCHOME\src\java\org\greenstone/util\Misc.java 
    98     // Debugging function to print a string's non-basic chars in hex, so stringToHex on all non-basic and non-printable ASCII 
    99     // Dr Bainbridge said that printing anything with charCode over 128 in hex is okay, but I'd already made extra allowances for non-printable ASCII 
    100     // Based on https://stackoverflow.com/questions/923863/converting-a-string-to-hexadecimal-in-java 
    101     public static String debugUnicodeString(String str) { 
    102       String result = ""; 
    103       for(int i = 0; i < str.length(); i++) { 
    104             int charCode = str.codePointAt(i); // unicode codepoint / ASCII code 
    105              
    106             // ASCII table: https://cdn.sparkfun.com/assets/home_page_posts/2/1/2/1/ascii_table_black.png 
    107             // If the unicode character code pt is less than the ASCII code for space and greater than for tilda, let's display the char in hex (x0000 format) 
    108             if((charCode >= 20 && charCode <= 126) || charCode == 9 || charCode == 10 || charCode == 13) { // space, tilda, TAB, LF, CR are printable, leave them in for XML element printing 
    109                 result += str.charAt(i); 
    110             } else { 
    111                 result += "x{" + String.format("%04x", charCode) + "}"; // looks like: x{4-char-codepoint} 
    112             } 
    113       } 
    114        
    115       return result; 
    116     } 
    117      
    118     /** 
     96     
     97    // Copied from GS3 main java code at GSDL3SRCHOME\src\java\org\greenstone/util\Misc.java 
     98    // Debugging function to print a string's non-basic chars in hex, so stringToHex on all non-basic and non-printable ASCII 
     99    // Dr Bainbridge said that printing anything with charCode over 128 in hex is okay, but I'd already made extra allowances for non-printable ASCII 
     100    // Based on https://stackoverflow.com/questions/923863/converting-a-string-to-hexadecimal-in-java 
     101    public static String debugUnicodeString(String str) { 
     102    String result = ""; 
     103    for(int i = 0; i < str.length(); i++) { 
     104        int charCode = str.codePointAt(i); // unicode codepoint / ASCII code 
     105         
     106        // ASCII table: https://cdn.sparkfun.com/assets/home_page_posts/2/1/2/1/ascii_table_black.png 
     107        // If the unicode character code pt is less than the ASCII code for space and greater than for tilda, let's display the char in hex (x0000 format) 
     108        if((charCode >= 20 && charCode <= 126) || charCode == 9 || charCode == 10 || charCode == 13) { // space, tilda, TAB, LF, CR are printable, leave them in for XML element printing 
     109        result += str.charAt(i); 
     110        } else { 
     111        result += "x{" + String.format("%04x", charCode) + "}"; // looks like: x{4-char-codepoint}               
     112        } 
     113    } 
     114     
     115    return result; 
     116    } 
     117     
     118    // Version of debugUnicodeString that, on Windows, mimics perl unicode::debug_unicode_string 
     119    // exactly by producing hex/unicode codepoints for ALL codepoints beyond ASCII 
     120    public static String stringToHex(String str) { 
     121    String result = ""; 
     122    for(int i = 0; i < str.length(); i++) { 
     123        int charCode = str.codePointAt(i); // unicode codepoint / ASCII code 
     124         
     125        if(charCode <=127) { // ASCII 
     126        result += str.charAt(i); 
     127        } else { // non-ASCII 
     128        result += "\\x{" + String.format("%04x", charCode) + "}"; // looks like: \x{4-char-codepoint} 
     129        } 
     130    }     
     131     
     132    return result; 
     133    } 
     134     
     135    /** 
    119136     * returns the short filename (8.3) for a file in Windows 
    120137     *  
  • main/trunk/greenstone2/perllib/doc.pm

    r34276 r34394  
    125125        # For Unix-based systems, there is no difference between the two 
    126126        $self->{'source_path'} = $source_filename; 
    127         } 
     127        }        
     128 
     129        # On Windows, the code above has ensured source_path is the Win long (full) path name. 
     130        # To help GLI associate metadata with an easily calculated and accurate representation of 
     131        # filenames, we now store the Win long path name, hex encoded. 
     132        # We're not using this field on Linux, as I can't get the hex encodings generated to match 
     133        # what GLI Java code generates. But for symmetry we store this field on Unix too, but we need 
     134        # to hex-encode source_path on Unix too, or it may not be UTF-8 and doc.xml will be invalid 
     135        my $hexencodedlongsourcepath = &unicode::debug_unicode_string($self->{'source_path'});       
     136        $self->set_utf8_metadata_element ($self->get_top_section(), "gsdlfullsourcepath", $hexencodedlongsourcepath); 
     137        
    128138    } 
    129139    else {