Changeset 34394
- Timestamp:
- 2020-09-15T20:26:19+12:00 (4 years ago)
- Location:
- main/trunk
- Files:
-
- 4 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/gli/src/org/greenstone/gatherer/metadata/DocXMLFile.java
r33758 r34394 41 41 public abstract class DocXMLFile extends File 42 42 { 43 static boolean isWin = Utility.isWindows(); 44 // For Linux, we continue using gsdlsourcefilename as key to the metadata mapping 45 // For Windows, we use the hex encoded long file paths as key 46 static String GSDL_SOURCE_FILE_METANAME = isWin ? "gsdlfullsourcepath" : "gsdlsourcefilename"; 47 43 48 protected HashMap source_file_name_to_description_elements_mapping = new HashMap(); 44 49 45 50 protected final String MetadataWrap; 46 51 protected final String MetadataItem; … … 57 62 } 58 63 59 /** 60 * Checks if various versions of the file object's filename, denoted relatively by file_relative_path, 61 * occur in the source_file_name_to_description_elements_mapping map 62 */ 63 private ArrayList findSourceFileMapKeyMatch(File file, String file_relative_path) { 64 ArrayList description_elements_list = null; 65 66 ///System.err.println("Looking for key " + file_relative_path); 67 description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path); 68 if(description_elements_list != null) { 69 ///System.err.println(" Found key matching REGULAR filepath: " + file_relative_path); 70 return description_elements_list; 71 } 72 else if(!Utility.isWindows()) { // couldn't find a matching key, we're done 73 ///System.err.println("Unable to find ex.meta for regular file path form " + file_relative_path); 74 return null; 75 } 76 77 // Now we can try windows short filename as map key 78 79 String win_short_file_relative_path = ""; 80 try{ 81 win_short_file_relative_path = Utility.getWindowsShortFileName(file.getAbsolutePath()); 82 //System.err.println("@@@ Searching for short file name: " + win_short_file_relative_path); 83 } catch(Exception e) { // we're done trying to find a matching key 84 System.err.println("Failed to convert to windows short file name: " + win_short_file_relative_path); 85 return null; 86 } 87 88 // Got a windows short file name, lop off 
import folder again 89 int import_index = win_short_file_relative_path.indexOf("import"); 90 if (import_index != -1) { 91 win_short_file_relative_path = win_short_file_relative_path.substring(import_index + "import".length() + 1); 92 } 93 94 ///System.err.println("### Looking for Windows short file name |" + win_short_file_relative_path + "| in map of sourcefilenames to doc.xml's ex meta."); 95 description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(win_short_file_relative_path); 96 if (description_elements_list != null) { 97 ///System.err.println(" Found key matching FULL win shortfile path: " + win_short_file_relative_path); 98 return description_elements_list; // found 99 } 100 101 // else, check whether a map key is matched by any REMAINING combination of windows shortfile path and regular path: 102 // - windows shortfilename's rel-dir-path with regular tailname 103 // - and regular rel-dir-path with windows shortfilename's tailname 104 105 String shortFileTailName = win_short_file_relative_path; 106 String shortFileRelDirPath = ""; 107 int lastSep = win_short_file_relative_path.lastIndexOf(File.separator); 108 if(lastSep != -1) { 109 shortFileTailName = win_short_file_relative_path.substring(lastSep+1); 110 shortFileRelDirPath = win_short_file_relative_path.substring(0, lastSep+1); // include the slash 111 } 112 113 String fileTailName = file_relative_path; 114 String fileRelDirPath = ""; 115 lastSep = file_relative_path.lastIndexOf(File.separator); 116 if(lastSep != -1) { 117 fileTailName = file_relative_path.substring(lastSep+1); 118 fileRelDirPath = file_relative_path.substring(0, lastSep+1); // include the slash 119 } 120 121 String path = shortFileRelDirPath + fileTailName; 122 ///System.err.println("### Looking for Windows short file name |" + path + "| in map of sourcefilenames to doc.xml's ex meta."); 123 description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(path); 124 125 
if(description_elements_list != null) { 126 ///System.err.println(" Found key matching MIX of win shortfile path and regular path: " + path); 127 return description_elements_list; // found 128 } 129 130 // try the other combination 131 path = fileRelDirPath + shortFileTailName; 132 ///System.err.println("### Looking for Windows short file name |" + path + "| in map of sourcefilenames to doc.xml's ex meta."); 133 description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(path); 134 135 if(description_elements_list != null) { 136 ///System.err.println(" Found key matching MIX of regular path and win shortfile path: " + path); 137 return description_elements_list; // found 138 } 139 140 // could not find gsdlsourcefilename in map 141 ///System.err.println("Unable to find ex.meta for regular file path form " + file_relative_path); 142 ///System.err.println(" Or for windows shortFile path form, or for combinations with regular file path form"); 143 144 return description_elements_list; // returns null at this point 145 } 146 147 148 public ArrayList getMetadataExtractedFromFile(File file) 64 /** On Windows, file_relative_path will be hex-encoded for codepts beyond ASCII. 
65 * But keys into the source_file_name_to_description_elements_mapping will then also match on Windows */ 66 public ArrayList getMetadataExtractedFromFile(File file, String file_relative_path) 149 67 { 150 68 // Build up a list of metadata extracted from this file 151 69 ArrayList metadata_values = new ArrayList(); 152 153 String file_relative_path = file.getAbsolutePath(); 154 int import_index = file_relative_path.indexOf("import"); 155 if (import_index != -1) { 156 file_relative_path = file_relative_path.substring(import_index + "import".length() + 1); 157 } 158 70 159 71 ///for (Object relFilename : source_file_name_to_description_elements_mapping.keySet()) { 160 72 /// System.err.println("\n@@@ relFilename: " + relFilename); … … 162 74 163 75 // Check whether this file (i.e. doc.xml or docmets.xml on inheritance) file contains extracted metadata for the specified file 164 //ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path); 165 ArrayList description_elements_list = findSourceFileMapKeyMatch(file, file_relative_path); 76 ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path); 166 77 if (description_elements_list == null) { 167 78 // ...it doesn't 79 ///System.err.println("Unable to find meta for file path form " + file_relative_path); 168 80 return metadata_values; // we're done 169 81 } … … 337 249 338 250 // Note which file this is for 339 else if (metadata_element_name.equals("gsdlsourcefilename")) { 251 //else if (metadata_element_name.equals("gsdlsourcefilename")) { 252 else if (metadata_element_name.equals(GSDL_SOURCE_FILE_METANAME)) { 253 // On Unix, GSDL_SOURCE_FILE_METANAME is the gsdlsourcefilename metadata field 254 // which may be encoded by the encoding denoted in fileRenameMethod (and will need decoding) 255 // On Windows, GSDL_SOURCE_FILE_METANAME is a different metadata field that 256 // will be hex encoded 
for non-ASCII chars 257 340 258 // Extract the gsdlsourcefilename element value 341 259 int value_index = line.indexOf(">", name_index) + ">".length(); … … 354 272 355 273 // Make sure the path matches the OS that is running 356 if (is_unix_path && Utility.isWindows()) {274 if (is_unix_path && isWin) { 357 275 // Convert path from Unix to Windows 358 276 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\/", "\\\\"); 359 277 } 360 else if (!is_unix_path && ! Utility.isWindows()) {278 else if (!is_unix_path && !isWin) { 361 279 // Convert path from Windows to Unix 362 280 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/"); … … 369 287 } 370 288 289 // Would be better to store hex src file name decoded? But how do we know what encoding the filename is in 290 // https://stackoverflow.com/questions/13990941/how-to-convert-hex-string-to-java-string 291 292 371 293 ((ArrayList) source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value)).add(new Integer(description_element_start)); 372 294 } … … 383 305 } 384 306 } 385 307 386 308 // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata) 387 309 if (metadata_element_name.startsWith("gsdl")) { … … 399 321 buffered_reader.close(); 400 322 401 // Now that we're done skimming, we actually need to decode gsdlsourcefilename 402 // based on whatever fileRenameMethod was used to encode it, so that we can 403 // at last properly compare properly against filenames on the file system 404 // in order to load the correct ex.meta for the file. 405 // Now that we should have both gsdlsourcefilename AND fileRenameMethod set, 406 // we can finally perform the decoding of gsdlsourcefilename. 
407 if(fileRenameMethod == null) { 408 fileRenameMethod = FILE_RENAME_METHOD_URL; // default for building 323 // ON WINDOWS, we're working with hex encoded full file path instead of with gsdlsourcefilename, 324 // so needn't bother decoding gsdlsourcefilename as it's unused. 325 // On UNIX, continue decoding gsdlsourcefilename as before 326 if(!isWin) { 327 // Now that we're done skimming, we actually need to decode gsdlsourcefilename 328 // based on whatever fileRenameMethod was used to encode it, so that we can 329 // at last properly compare properly against filenames on the file system 330 // in order to load the correct ex.meta for the file. 331 // Now that we should have both gsdlsourcefilename AND fileRenameMethod set, 332 // we can finally perform the decoding of gsdlsourcefilename. 333 if(fileRenameMethod == null) { 334 fileRenameMethod = FILE_RENAME_METHOD_URL; // default for building 335 } 336 337 // If gsdlsourcefilename was encoded, we remove it from the map under its encoded 338 // filename, decode it and add it back into map using its decoded filename. 339 if(!fileRenameMethod.equals(FILE_RENAME_METHOD_NONE)) { 340 ArrayList value_list = (ArrayList) source_file_name_to_description_elements_mapping.remove(gsdlsourcefilename_value); 341 gsdlsourcefilename_value = decodeSourceFilename(gsdlsourcefilename_value, fileRenameMethod, is_unix_path); 342 source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, value_list); 343 } 409 344 } 410 // If gsdlsourcefilename was encoded, we remove it from the map under its encoded 411 // filename, decode it and add it back into map using its decoded filename. 
412 if(!fileRenameMethod.equals(FILE_RENAME_METHOD_NONE)) { 413 ArrayList value_list = (ArrayList) source_file_name_to_description_elements_mapping.remove(gsdlsourcefilename_value); 414 gsdlsourcefilename_value = decodeSourceFilename(gsdlsourcefilename_value, fileRenameMethod, is_unix_path); 415 source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, value_list); 416 } 345 417 346 } 418 347 catch (FileNotFoundException exception) { … … 708 637 709 638 // Make sure the path matches the OS that is running 710 if (is_unix_path && Utility.isWindows()) {639 if (is_unix_path && isWin) { 711 640 // Convert path from Unix to Windows 712 641 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\/", "\\\\"); 713 642 } 714 else if (!is_unix_path && ! Utility.isWindows()) {643 else if (!is_unix_path && !isWin) { 715 644 // Convert path from Windows to Unix 716 645 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/"); -
main/trunk/gli/src/org/greenstone/gatherer/metadata/DocXMLFileManager.java
r17101 r34394 31 31 import java.util.*; 32 32 import org.greenstone.gatherer.DebugStream; 33 33 import org.greenstone.gatherer.util.Utility; 34 34 35 35 /** This class is a static class that manages the doc.xml files */ … … 37 37 { 38 38 static private ArrayList doc_xml_files = new ArrayList(); 39 40 39 41 40 static public void clearDocXMLFiles() … … 47 46 static public ArrayList getMetadataExtractedFromFile(File file) 48 47 { 48 // Work out relative file path and its hex encoded value here, 49 // avoids making DocXMLFile.java recalculate these each time 50 String file_relative_path = file.getAbsolutePath(); 51 int import_index = file_relative_path.indexOf("import"); 52 if (import_index != -1) { 53 file_relative_path = file_relative_path.substring(import_index + "import".length() + 1); 54 } 55 String searchFileName = DocXMLFile.isWin ? Utility.stringToHex(file_relative_path) : file_relative_path; 56 49 57 // Build up a list of metadata values extracted from this file 50 58 ArrayList metadata_values = new ArrayList(); … … 53 61 for (int i = 0; i < doc_xml_files.size(); i++) { 54 62 DocXMLFile doc_xml_file = (DocXMLFile) doc_xml_files.get(i); 55 metadata_values.addAll(doc_xml_file.getMetadataExtractedFromFile(file)); 63 ///System.err.println("@@@@ Looking at doc.xml file: " + doc_xml_files.get(i)); 64 metadata_values.addAll(doc_xml_file.getMetadataExtractedFromFile(file, searchFileName)); 56 65 } 57 66 -
main/trunk/gli/src/org/greenstone/gatherer/util/Utility.java
r33777 r34394 94 94 } 95 95 } 96 97 // Copied from GS3 main java code at GSDL3SRCHOME\src\java\org\greenstone/util\Misc.java 98 // Debugging function to print a string's non-basic chars in hex, so stringToHex on all non-basic and non-printable ASCII 99 // Dr Bainbridge said that printing anything with charCode over 128 in hex is okay, but I'd already made extra allowances for non-printable ASCII 100 // Based on https://stackoverflow.com/questions/923863/converting-a-string-to-hexadecimal-in-java 101 public static String debugUnicodeString(String str) { 102 String result = ""; 103 for(int i = 0; i < str.length(); i++) { 104 int charCode = str.codePointAt(i); // unicode codepoint / ASCII code 105 106 // ASCII table: https://cdn.sparkfun.com/assets/home_page_posts/2/1/2/1/ascii_table_black.png 107 // If the unicode character code pt is less than the ASCII code for space and greater than for tilda, let's display the char in hex (x0000 format) 108 if((charCode >= 20 && charCode <= 126) || charCode == 9 || charCode == 10 || charCode == 13) { // space, tilda, TAB, LF, CR are printable, leave them in for XML element printing 109 result += str.charAt(i); 110 } else { 111 result += "x{" + String.format("%04x", charCode) + "}"; // looks like: x{4-char-codepoint} 112 } 113 } 114 115 return result; 116 } 117 118 /** 96 97 // Copied from GS3 main java code at GSDL3SRCHOME\src\java\org\greenstone/util\Misc.java 98 // Debugging function to print a string's non-basic chars in hex, so stringToHex on all non-basic and non-printable ASCII 99 // Dr Bainbridge said that printing anything with charCode over 128 in hex is okay, but I'd already made extra allowances for non-printable ASCII 100 // Based on https://stackoverflow.com/questions/923863/converting-a-string-to-hexadecimal-in-java 101 public static String debugUnicodeString(String str) { 102 String result = ""; 103 for(int i = 0; i < str.length(); i++) { 104 int charCode = str.codePointAt(i); // unicode codepoint / ASCII code 105 
106 // ASCII table: https://cdn.sparkfun.com/assets/home_page_posts/2/1/2/1/ascii_table_black.png 107 // If the unicode character code pt is less than the ASCII code for space and greater than for tilda, let's display the char in hex (x0000 format) 108 if((charCode >= 20 && charCode <= 126) || charCode == 9 || charCode == 10 || charCode == 13) { // space, tilda, TAB, LF, CR are printable, leave them in for XML element printing 109 result += str.charAt(i); 110 } else { 111 result += "x{" + String.format("%04x", charCode) + "}"; // looks like: x{4-char-codepoint} 112 } 113 } 114 115 return result; 116 } 117 118 // Version of debugUnicodeString that, on Windows, mimics perl unicode::debug_unicode_string 119 // exactly by producing hex/unicode codepoints for ALL codepoints beyond ASCII 120 public static String stringToHex(String str) { 121 String result = ""; 122 for(int i = 0; i < str.length(); i++) { 123 int charCode = str.codePointAt(i); // unicode codepoint / ASCII code 124 125 if(charCode <=127) { // ASCII 126 result += str.charAt(i); 127 } else { // non-ASCII 128 result += "\\x{" + String.format("%04x", charCode) + "}"; // looks like: \x{4-char-codepoint} 129 } 130 } 131 132 return result; 133 } 134 135 /** 119 136 * returns the short filename (8.3) for a file in Windows 120 137 * -
main/trunk/greenstone2/perllib/doc.pm
r34276 r34394 125 125 # For Unix-based systems, there is no difference between the two 126 126 $self->{'source_path'} = $source_filename; 127 } 127 } 128 129 # On Windows, the code above has ensured source_path is the Win long (full) path name. 130 # To help GLI associate metadata with an easily calculated and accurate representation of 131 # filenames, we now store the Win long path name, hex encoded. 132 # We're not using this field on Linux, as I can't get the hex encodings generated to match 133 # what the GLI Java code generates. For symmetry we still store this field on Unix, and we must 134 # hex-encode source_path there as well, or it may not be valid UTF-8 and doc.xml will be invalid. 135 my $hexencodedlongsourcepath = &unicode::debug_unicode_string($self->{'source_path'}); 136 $self->set_utf8_metadata_element ($self->get_top_section(), "gsdlfullsourcepath", $hexencodedlongsourcepath); 137 138 } 129 139 else {
Note:
See TracChangeset
for help on using the changeset viewer.