Changeset 34394 for main


Ignore:
Timestamp:
2020-09-15T20:26:19+12:00 (4 years ago)
Author:
ak19
Message:

Bugfix 1 for GLI metadata slowdown: selecting multiple Gathered files in GLI became very slow. Kathy and Dr Bainbridge had tracked this down to code I had added to support non-basic-ASCII filenames in GLI, which was making an expensive Windows operating system function call for each selected file, launching a Java Process for each. The speed of selecting multiple files is now back to being almost as fast as in 3.09. Tested on Windows and Linux. Had to treat Windows as a special case because I can't get the code modifications to work on Linux: the perl code stores a hex-encoded string for the filename, which GLI now uses when the OS is Windows and compares against the hex-encoded name of a selected file. But on Linux the hex-encoded value generated by perl is not the same as that which Java generates and, after trying repeatedly, I've not been able to get it to work. So the code behaves as before for Linux.

Location:
main/trunk
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/gli/src/org/greenstone/gatherer/metadata/DocXMLFile.java

    r33758 r34394  
    4141public abstract class DocXMLFile extends File
    4242{
     43    static boolean isWin = Utility.isWindows();
     44    // For Linux, we continue using gsdlsourcefilename as key to the metadata mapping
     45    // For Windows, we use the hex encoded long file paths as key
     46    static String GSDL_SOURCE_FILE_METANAME = isWin ? "gsdlfullsourcepath" : "gsdlsourcefilename";
     47   
    4348    protected HashMap source_file_name_to_description_elements_mapping = new HashMap();
    44 
     49   
    4550    protected final String MetadataWrap;
    4651    protected final String MetadataItem;
     
    5762    }
    5863
    59     /**
    60      * Checks if various versions of the file object's filename, denoted relatively by file_relative_path,
    61      * occur in the source_file_name_to_description_elements_mapping map
    62     */
    63     private ArrayList findSourceFileMapKeyMatch(File file, String file_relative_path) {
    64         ArrayList description_elements_list = null;
    65                
    66         ///System.err.println("Looking for key " + file_relative_path);
    67         description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
    68         if(description_elements_list != null) {
    69             ///System.err.println("   Found key matching REGULAR filepath: " + file_relative_path);
    70             return description_elements_list;           
    71         }
    72         else if(!Utility.isWindows()) { // couldn't find a matching key, we're done
    73             ///System.err.println("Unable to find ex.meta for regular file path form " + file_relative_path);
    74             return null;
    75         }
    76        
    77         // Now we can try windows short filename as map key
    78        
    79         String win_short_file_relative_path = "";
    80         try{
    81             win_short_file_relative_path = Utility.getWindowsShortFileName(file.getAbsolutePath());             
    82             //System.err.println("@@@ Searching for short file name: " + win_short_file_relative_path);
    83         } catch(Exception e) { // we're done trying to find a matching key
    84             System.err.println("Failed to convert to windows short file name: " + win_short_file_relative_path);           
    85             return null;
    86         }
    87        
    88         // Got a windows short file name, lop off import folder again
    89         int import_index = win_short_file_relative_path.indexOf("import");
    90         if (import_index != -1) {
    91             win_short_file_relative_path = win_short_file_relative_path.substring(import_index + "import".length() + 1);
    92         }
    93            
    94         ///System.err.println("### Looking for Windows short file name |" + win_short_file_relative_path +  "| in map of sourcefilenames to doc.xml's ex meta.");
    95         description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(win_short_file_relative_path);
    96         if (description_elements_list != null) {
    97             ///System.err.println("   Found key matching FULL win shortfile path: " + win_short_file_relative_path);
    98             return description_elements_list; // found
    99         }
    100        
    101         // else, check whether a map key is matched by any REMAINING combination of windows shortfile path and regular path:
    102         // - windows shortfilename's rel-dir-path with regular tailname
    103         // - and regular rel-dir-path with windows shortfilename's tailname
    104                
    105         String shortFileTailName = win_short_file_relative_path;
    106         String shortFileRelDirPath = "";
    107         int lastSep = win_short_file_relative_path.lastIndexOf(File.separator);
    108         if(lastSep != -1) {         
    109             shortFileTailName = win_short_file_relative_path.substring(lastSep+1);
    110             shortFileRelDirPath = win_short_file_relative_path.substring(0, lastSep+1); // include the slash
    111         }
    112        
    113         String fileTailName = file_relative_path;
    114         String fileRelDirPath = "";
    115         lastSep = file_relative_path.lastIndexOf(File.separator);
    116         if(lastSep != -1) {         
    117             fileTailName = file_relative_path.substring(lastSep+1);
    118             fileRelDirPath = file_relative_path.substring(0, lastSep+1); // include the slash
    119         }
    120        
    121         String path = shortFileRelDirPath + fileTailName;
    122         ///System.err.println("### Looking for Windows short file name |" + path +  "| in map of sourcefilenames to doc.xml's ex meta.");
    123         description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(path);
    124        
    125         if(description_elements_list != null) {
    126             ///System.err.println("   Found key matching MIX of win shortfile path and regular path: " + path);
    127             return description_elements_list; // found
    128         }
    129 
    130         // try the other combination
    131         path = fileRelDirPath + shortFileTailName;
    132         ///System.err.println("### Looking for Windows short file name |" + path +  "| in map of sourcefilenames to doc.xml's ex meta.");
    133         description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(path);
    134        
    135         if(description_elements_list != null) {
    136             ///System.err.println("   Found key matching MIX of regular path and win shortfile path: " + path);
    137             return description_elements_list; // found
    138         }       
    139        
    140         // could not find gsdlsourcefilename in map
    141         ///System.err.println("Unable to find ex.meta for regular file path form " + file_relative_path);
    142         ///System.err.println("    Or for windows shortFile path form, or for combinations with regular file path form");       
    143        
    144         return description_elements_list; // returns null at this point
    145     }
    146    
    147 
    148     public ArrayList getMetadataExtractedFromFile(File file)
     64    /** On Windows, file_relative_path will be hex-encoded for codepts beyond ASCII.
     65     * But keys into the source_file_name_to_description_elements_mapping will then also match on Windows */
     66    public ArrayList getMetadataExtractedFromFile(File file, String file_relative_path)                                             
    14967    {
    15068    // Build up a list of metadata extracted from this file
    15169    ArrayList metadata_values = new ArrayList();
    152 
    153     String file_relative_path = file.getAbsolutePath();
    154     int import_index = file_relative_path.indexOf("import");
    155     if (import_index != -1) {
    156         file_relative_path = file_relative_path.substring(import_index + "import".length() + 1);
    157     }
    158 
     70   
    15971    ///for (Object relFilename : source_file_name_to_description_elements_mapping.keySet()) {
    16072    ///    System.err.println("\n@@@ relFilename: " + relFilename);
     
    16274   
    16375    // Check whether this file (i.e. doc.xml or docmets.xml on inheritance) file contains extracted metadata for the specified file
    164     //ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
    165     ArrayList description_elements_list = findSourceFileMapKeyMatch(file, file_relative_path);
     76    ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path);
    16677    if (description_elements_list == null) {
    16778            // ...it doesn't
     79            ///System.err.println("Unable to find meta for file path form " + file_relative_path);
    16880            return metadata_values; // we're done
    16981    }
     
    337249       
    338250        // Note which file this is for
    339         else if (metadata_element_name.equals("gsdlsourcefilename")) {
     251        //else if (metadata_element_name.equals("gsdlsourcefilename")) {   
     252        else if (metadata_element_name.equals(GSDL_SOURCE_FILE_METANAME)) {
     253            // On Unix, GSDL_SOURCE_FILE_METANAME is the gsdlsourcefilename metadata field
     254            // which may be encoded by the encoding denoted in fileRenameMethod (and will need decoding)
     255            // On Windows, GSDL_SOURCE_FILE_METANAME is a different metadata field that
     256            // will be hex encoded for non-ASCII chars
     257           
    340258            // Extract the gsdlsourcefilename element value
    341259            int value_index = line.indexOf(">", name_index) + ">".length();
     
    354272
    355273            // Make sure the path matches the OS that is running
    356             if (is_unix_path && Utility.isWindows()) {
     274            if (is_unix_path && isWin) {
    357275                // Convert path from Unix to Windows
    358276                gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\/", "\\\\");
    359277            }
    360             else if (!is_unix_path && !Utility.isWindows()) {
     278            else if (!is_unix_path && !isWin) {
    361279                // Convert path from Windows to Unix
    362280                gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/");
     
    369287            }
    370288
     289            // Would be better to store hex src file name decoded? But how do we know what encoding the filename is in
     290            // https://stackoverflow.com/questions/13990941/how-to-convert-hex-string-to-java-string
     291           
     292           
    371293            ((ArrayList) source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value)).add(new Integer(description_element_start));
    372294            }
     
    383305            }
    384306        }
    385 
     307       
    386308        // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata)
    387309        if (metadata_element_name.startsWith("gsdl")) {
     
    399321        buffered_reader.close();
    400322
    401         // Now that we're done skimming, we actually need to decode gsdlsourcefilename
    402         // based on whatever fileRenameMethod was used to encode it, so that we can
    403         // at last properly compare properly against filenames on the file system
    404         // in order to load the correct ex.meta for the file.
    405         // Now that we should have both gsdlsourcefilename AND fileRenameMethod set,
    406         // we can finally perform the decoding of gsdlsourcefilename.       
    407         if(fileRenameMethod == null) {
    408         fileRenameMethod = FILE_RENAME_METHOD_URL; // default for building
     323        // ON WINDOWS, we're working with hex encoded full file path instead of with gsdlsourcefilename,
     324        // so needn't bother decoding gsdlsourcefilename as it's unused.
     325        // On UNIX, continue decoding gsdlsourcefilename as before
     326        if(!isWin) {
     327        // Now that we're done skimming, we actually need to decode gsdlsourcefilename
     328        // based on whatever fileRenameMethod was used to encode it, so that we can
     329        // at last properly compare properly against filenames on the file system
     330        // in order to load the correct ex.meta for the file.
     331        // Now that we should have both gsdlsourcefilename AND fileRenameMethod set,
     332        // we can finally perform the decoding of gsdlsourcefilename.       
     333        if(fileRenameMethod == null) {
     334            fileRenameMethod = FILE_RENAME_METHOD_URL; // default for building
     335        }
     336       
     337        // If gsdlsourcefilename was encoded, we remove it from the map under its encoded
     338        // filename, decode it and add it back into map using its decoded filename.
     339        if(!fileRenameMethod.equals(FILE_RENAME_METHOD_NONE)) {
     340            ArrayList value_list = (ArrayList) source_file_name_to_description_elements_mapping.remove(gsdlsourcefilename_value);
     341            gsdlsourcefilename_value = decodeSourceFilename(gsdlsourcefilename_value, fileRenameMethod, is_unix_path);
     342            source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, value_list);
     343        }
    409344        }
    410         // If gsdlsourcefilename was encoded, we remove it from the map under its encoded
    411         // filename, decode it and add it back into map using its decoded filename.
    412         if(!fileRenameMethod.equals(FILE_RENAME_METHOD_NONE)) {     
    413         ArrayList value_list = (ArrayList) source_file_name_to_description_elements_mapping.remove(gsdlsourcefilename_value);       
    414         gsdlsourcefilename_value = decodeSourceFilename(gsdlsourcefilename_value, fileRenameMethod, is_unix_path);
    415         source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, value_list);
    416         }       
     345       
    417346    }
    418347    catch (FileNotFoundException exception) {
     
    708637
    709638            // Make sure the path matches the OS that is running
    710             if (is_unix_path && Utility.isWindows()) {
     639            if (is_unix_path && isWin) {
    711640                // Convert path from Unix to Windows
    712641                gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\/", "\\\\");
    713642            }
    714             else if (!is_unix_path && !Utility.isWindows()) {
     643            else if (!is_unix_path && !isWin) {
    715644                // Convert path from Windows to Unix
    716645                gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/");
  • main/trunk/gli/src/org/greenstone/gatherer/metadata/DocXMLFileManager.java

    r17101 r34394  
    3131import java.util.*;
    3232import org.greenstone.gatherer.DebugStream;
    33 
     33import org.greenstone.gatherer.util.Utility;
    3434
    3535/** This class is a static class that manages the doc.xml files */
     
    3737{
    3838    static private ArrayList doc_xml_files = new ArrayList();
    39 
    4039
    4140    static public void clearDocXMLFiles()
     
    4746    static public ArrayList getMetadataExtractedFromFile(File file)
    4847    {
     48    // Work out relative file path and its hex encoded value here,
     49    // avoids making DocXMLFile.java recalculate these each time
     50    String file_relative_path = file.getAbsolutePath();
     51    int import_index = file_relative_path.indexOf("import");
     52    if (import_index != -1) {
     53        file_relative_path = file_relative_path.substring(import_index + "import".length() + 1);
     54    }
     55    String searchFileName = DocXMLFile.isWin ? Utility.stringToHex(file_relative_path) : file_relative_path;
     56       
    4957    // Build up a list of metadata values extracted from this file
    5058    ArrayList metadata_values = new ArrayList();
     
    5361    for (int i = 0; i < doc_xml_files.size(); i++) {
    5462        DocXMLFile doc_xml_file = (DocXMLFile) doc_xml_files.get(i);
    55         metadata_values.addAll(doc_xml_file.getMetadataExtractedFromFile(file));
     63        ///System.err.println("@@@@ Looking at doc.xml file: " + doc_xml_files.get(i));
     64        metadata_values.addAll(doc_xml_file.getMetadataExtractedFromFile(file, searchFileName));
    5665    }
    5766
  • main/trunk/gli/src/org/greenstone/gatherer/util/Utility.java

    r33777 r34394  
    9494    }   
    9595    }
    96 
    97     // Copied from GS3 main java code at GSDL3SRCHOME\src\java\org\greenstone/util\Misc.java
    98     // Debugging function to print a string's non-basic chars in hex, so stringToHex on all non-basic and non-printable ASCII
    99     // Dr Bainbridge said that printing anything with charCode over 128 in hex is okay, but I'd already made extra allowances for non-printable ASCII
    100     // Based on https://stackoverflow.com/questions/923863/converting-a-string-to-hexadecimal-in-java
    101     public static String debugUnicodeString(String str) {
    102       String result = "";
    103       for(int i = 0; i < str.length(); i++) {
    104             int charCode = str.codePointAt(i); // unicode codepoint / ASCII code
    105            
    106             // ASCII table: https://cdn.sparkfun.com/assets/home_page_posts/2/1/2/1/ascii_table_black.png
    107             // If the unicode character code pt is less than the ASCII code for space and greater than for tilda, let's display the char in hex (x0000 format)
    108             if((charCode >= 20 && charCode <= 126) || charCode == 9 || charCode == 10 || charCode == 13) { // space, tilda, TAB, LF, CR are printable, leave them in for XML element printing
    109                 result += str.charAt(i);
    110             } else {
    111                 result += "x{" + String.format("%04x", charCode) + "}"; // looks like: x{4-char-codepoint}
    112             }
    113       }
    114      
    115       return result;
    116     }
    117    
    118     /**
     96   
     97    // Copied from GS3 main java code at GSDL3SRCHOME\src\java\org\greenstone/util\Misc.java
     98    // Debugging function to print a string's non-basic chars in hex, so stringToHex on all non-basic and non-printable ASCII
     99    // Dr Bainbridge said that printing anything with charCode over 128 in hex is okay, but I'd already made extra allowances for non-printable ASCII
     100    // Based on https://stackoverflow.com/questions/923863/converting-a-string-to-hexadecimal-in-java
     101    public static String debugUnicodeString(String str) {
     102    String result = "";
     103    for(int i = 0; i < str.length(); i++) {
     104        int charCode = str.codePointAt(i); // unicode codepoint / ASCII code
     105       
     106        // ASCII table: https://cdn.sparkfun.com/assets/home_page_posts/2/1/2/1/ascii_table_black.png
     107        // If the unicode character code pt is less than the ASCII code for space and greater than for tilda, let's display the char in hex (x0000 format)
     108        if((charCode >= 20 && charCode <= 126) || charCode == 9 || charCode == 10 || charCode == 13) { // space, tilda, TAB, LF, CR are printable, leave them in for XML element printing
     109        result += str.charAt(i);
     110        } else {
     111        result += "x{" + String.format("%04x", charCode) + "}"; // looks like: x{4-char-codepoint}             
     112        }
     113    }
     114   
     115    return result;
     116    }
     117   
     118    // Version of debugUnicodeString that, on Windows, mimics perl unicode::debug_unicode_string
     119    // exactly by producing hex/unicode codepoints for ALL codepoints beyond ASCII
     120    public static String stringToHex(String str) {
     121    String result = "";
     122    for(int i = 0; i < str.length(); i++) {
     123        int charCode = str.codePointAt(i); // unicode codepoint / ASCII code
     124       
     125        if(charCode <=127) { // ASCII
     126        result += str.charAt(i);
     127        } else { // non-ASCII
     128        result += "\\x{" + String.format("%04x", charCode) + "}"; // looks like: \x{4-char-codepoint}
     129        }
     130    }   
     131   
     132    return result;
     133    }
     134   
     135    /**
    119136     * returns the short filename (8.3) for a file in Windows
    120137     *
  • main/trunk/greenstone2/perllib/doc.pm

    r34276 r34394  
    125125        # For Unix-based systems, there is no difference between the two
    126126        $self->{'source_path'} = $source_filename;
    127         }
     127        }       
     128
     129        # On Windows, the code above has ensured source_path is the Win long (full) path name.
     130        # To help GLI associate metadata with an easily calculated and accurate representation of
     131        # filenames, we now store the Win long path name, hex encoded.
     132        # We're not using this field on Linux, as I can't get the hex encodings generated to match
     133        # what GLI Java code generates. But for symmetry we store this field on Unix too, but we need
     134        # to hex-encode source_path on Unix too, or it may not be UTF-8 and doc.xml will be invalid
     135        my $hexencodedlongsourcepath = &unicode::debug_unicode_string($self->{'source_path'});     
     136        $self->set_utf8_metadata_element ($self->get_top_section(), "gsdlfullsourcepath", $hexencodedlongsourcepath);
     137       
    128138    }
    129139    else {
Note: See TracChangeset for help on using the changeset viewer.