Changeset 34507
- Timestamp: 2020-10-22T01:48:03+13:00 (3 years ago)
- Location: main/trunk/gli/src/org/greenstone/gatherer/metadata
- Files: 2 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/gli/src/org/greenstone/gatherer/metadata/DocXMLFile.java
r34394 r34507 32 32 import java.net.URLDecoder; 33 33 import org.greenstone.gatherer.DebugStream; 34 import org.greenstone.gatherer.Gatherer; 34 35 import org.greenstone.gatherer.util.Utility; 35 36 … … 42 43 { 43 44 static boolean isWin = Utility.isWindows(); 44 // For Linux, we continue using gsdlsourcefilename as key to the metadata mapping45 // For Windows, we use the hex encoded long file paths as key46 static String GSDL_SOURCE_FILE_METANAME = isWin ? "gsdlfullsourcepath" : "gsdlsourcefilename";47 45 48 46 protected HashMap source_file_name_to_description_elements_mapping = new HashMap(); … … 64 62 /** On Windows, file_relative_path will be hex-encoded for codepts beyond ASCII. 65 63 * But keys into the source_file_name_to_description_elements_mapping will then also match on Windows */ 66 public ArrayList getMetadataExtractedFromFile(File file, String file_relative_path) 64 public ArrayList getMetadataExtractedFromFile(File file, String file_relative_path) 67 65 { 68 66 // Build up a list of metadata extracted from this file … … 249 247 250 248 // Note which file this is for 251 //else if (metadata_element_name.equals("gsdlsourcefilename")) { 252 else if (metadata_element_name.equals(GSDL_SOURCE_FILE_METANAME)) { 253 // On Unix, GSDL_SOURCE_FILE_METANAME is the gsdlsourcefilename metadata field 254 // which may be encoded by the encoding denoted in fileRenameMethod (and will need decoding) 255 // On Windows, GSDL_SOURCE_FILE_METANAME is a different metadata field that 256 // will be hex encoded for non-ASCII chars 249 else if (metadata_element_name.equals("gsdlsourcefilename")) { 250 // the gsdlsourcefilename metadata field may be encoded by the encoding denoted 251 // in fileRenameMethod (and will need decoding) 257 252 258 253 // Extract the gsdlsourcefilename element value 259 254 int value_index = line.indexOf(">", name_index) + ">".length(); 260 255 gsdlsourcefilename_value = line.substring(value_index, line.indexOf("<", value_index)); 261 262 // We're 
only interested in the path relative to the import folder 263 256 257 // We're only interested in the path relative to the import folder 258 int import_index = gsdlsourcefilename_value.indexOf("import"); 264 259 if (import_index != -1) { 265 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(import_index + "import".length());266 267 is_unix_path = gsdlsourcefilename_value.startsWith("/");268 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(1);269 270 // (Will decode gsdlsourcefilename at end of this method, once we know271 // for certain the fileRenameMethod that was used to encode it.)272 273 // Make sure the path matches the OS that is running274 if (is_unix_path && isWin) {275 // Convert path from Unix to Windows276 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\/", "\\\\");277 }278 else if (!is_unix_path && !isWin) {279 // Convert path from Windows to Unix280 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/");281 }282 260 283 ///System.err.println("@@@@ Found gsdlsourcefilename: " + gsdlsourcefilename_value); 284 // Remember this for quick access later 285 if (source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value) == null) { 286 source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, new ArrayList()); 287 } 288 289 // Would be better to store hex src file name decoded? 
But how do we know what encoding the filename is in 290 // https://stackoverflow.com/questions/13990941/how-to-convert-hex-string-to-java-string 291 292 293 ((ArrayList) source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value)).add(new Integer(description_element_start)); 261 ///System.err.println("@@@@ Found gsdlsourcefilename: " + gsdlsourcefilename_value); 294 262 } 295 263 … … 301 269 && !gsdlsourcefilename_value.endsWith("collect.cfg") 302 270 && !gsdlsourcefilename_value.endsWith("collectionConfig.xml")) { 303 // We don't really know what is going on...304 System.err.println("Warning: Could not understand gsdlsourcefilename " + gsdlsourcefilename_value);271 // We don't really know what is going on... 272 System.err.println("Warning: Could not understand gsdlsourcefilename " + gsdlsourcefilename_value); 305 273 } 306 274 } … … 321 289 buffered_reader.close(); 322 290 323 // ON WINDOWS, we're working with hex encoded full file path instead of with gsdlsourcefilename, 324 // so needn't bother decoding gsdlsourcefilename as it's unused. 325 // On UNIX, continue decoding gsdlsourcefilename as before 326 if(!isWin) { 291 327 292 // Now that we're done skimming, we actually need to decode gsdlsourcefilename 328 293 // based on whatever fileRenameMethod was used to encode it, so that we can 329 // at last properly compare properlyagainst filenames on the file system294 // at last properly compare against filenames on the file system 330 295 // in order to load the correct ex.meta for the file. 331 296 // Now that we should have both gsdlsourcefilename AND fileRenameMethod set, … … 337 302 // If gsdlsourcefilename was encoded, we remove it from the map under its encoded 338 303 // filename, decode it and add it back into map using its decoded filename. 
339 if(!fileRenameMethod.equals(FILE_RENAME_METHOD_NONE)) { 340 ArrayList value_list = (ArrayList) source_file_name_to_description_elements_mapping.remove(gsdlsourcefilename_value); 341 gsdlsourcefilename_value = decodeSourceFilename(gsdlsourcefilename_value, fileRenameMethod, is_unix_path); 342 source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, value_list); 343 } 344 } 345 304 if(!fileRenameMethod.equals(FILE_RENAME_METHOD_NONE)) { 305 gsdlsourcefilename_value = decodeSourceFilename(gsdlsourcefilename_value, fileRenameMethod, is_unix_path); 306 } 307 308 // Next, if Windows, check if dealing with Win 8.3 Short Filename 309 // In that case, convert short file name to full name - works only if the file exists 310 if(isWin && gsdlsourcefilename_value.indexOf("~") != -1) { 311 // gsdlsourcefilename is stored from import folder onwards: import/opt_subdir/filename.ext 312 // This may contain Win 8.3 shortening. To get Win Long filename, prefix current collection dir 313 // and if resulting file exists, getCanonicalPath() which produces Win Long filename. 
314 File currentCollectionFolder = Gatherer.c_man.getCollection().getCollectionDirectory(); 315 File f = new File(currentCollectionFolder, /*"import" + File.separator +*/ gsdlsourcefilename_value); 316 ///System.err.println("### file: " + f.getAbsolutePath()); 317 318 if(f.exists()) { 319 gsdlsourcefilename_value = f.getCanonicalPath(); 320 ///System.err.println("### canon: " + gsdlsourcefilename_value); 321 322 } // else couldn't find a version of the filename stored in doc.xml that exists, giving up, leave gsdlsourcefilename_value as is 323 } 324 325 // We're only interested in the path relative to the import folder 326 int import_index = gsdlsourcefilename_value.indexOf("import"); 327 if (import_index != -1) { 328 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(import_index + "import".length()); 329 330 is_unix_path = gsdlsourcefilename_value.startsWith("/"); 331 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(1); 332 333 // (Will decode gsdlsourcefilename at end of this method, once we know 334 // for certain the fileRenameMethod that was used to encode it.) 
335 336 // Make sure the path matches the OS that is running 337 if (is_unix_path && isWin) { 338 // Convert path from Unix to Windows 339 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\/", "\\\\"); 340 } 341 else if (!is_unix_path && !isWin) { 342 // Convert path from Windows to Unix 343 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/"); 344 } 345 346 ///System.err.println("@@@ into map: " + gsdlsourcefilename_value); 347 348 if (source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value) == null) { 349 source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, new ArrayList()); 350 } 351 352 ((ArrayList) source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value)).add(new Integer(description_element_start)); 353 } 346 354 } 347 355 catch (FileNotFoundException exception) { … … 354 362 } 355 363 } 364 356 365 357 366 protected String decodeSourceFilename(String relative_sourcefile_path, -
main/trunk/gli/src/org/greenstone/gatherer/metadata/DocXMLFileManager.java
r34394 r34507 53 53 file_relative_path = file_relative_path.substring(import_index + "import".length() + 1); 54 54 } 55 String searchFileName = DocXMLFile.isWin ? Utility.stringToHex(file_relative_path) : file_relative_path; 56 55 57 56 // Build up a list of metadata values extracted from this file 58 57 ArrayList metadata_values = new ArrayList(); … … 62 61 DocXMLFile doc_xml_file = (DocXMLFile) doc_xml_files.get(i); 63 62 ///System.err.println("@@@@ Looking at doc.xml file: " + doc_xml_files.get(i)); 64 metadata_values.addAll(doc_xml_file.getMetadataExtractedFromFile(file, searchFileName));63 metadata_values.addAll(doc_xml_file.getMetadataExtractedFromFile(file, file_relative_path)); 65 64 } 66 65
Note: See TracChangeset for help on using the changeset viewer.