- Timestamp:
- 2019-12-05T21:58:02+13:00 (4 years ago)
- Location:
- main/trunk
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/gli/src/org/greenstone/gatherer/metadata/DocXMLFile.java
r23763 r33756 34 34 import org.greenstone.gatherer.util.Utility; 35 35 36 import org.apache.commons.codec.binary.Base64; 37 38 //import org.greenstone.gatherer.feedback.Base64; 36 39 37 40 /** This class represents one doc.xml file */ … … 43 46 protected final String MetadataWrap; 44 47 protected final String MetadataItem; 48 49 protected final String FILE_RENAME_METHOD_NONE = "none"; 50 protected final String FILE_RENAME_METHOD_URL = "url"; 51 protected final String FILE_RENAME_METHOD_BASE64 = "base64"; 45 52 46 53 public DocXMLFile(String doc_xml_file_path, String metaWrap, String metaItem) … … 63 70 } 64 71 72 ///for (Object relFilename : source_file_name_to_description_elements_mapping.keySet()) { 73 /// System.err.println("@@@ relFilename: " + relFilename); 74 ///} 75 65 76 // Check whether this file (i.e. doc.xml or docmets.xml on inheritance) file contains extracted metadata for the specified file 66 77 ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path); … … 175 186 public void skimFile() 176 187 { 188 String fileRenameMethod = null; 189 String gsdlsourcefilename_value = null; 190 boolean is_unix_path = false; 191 177 192 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE); 178 193 … … 227 242 // Extracted metadata! May have ex. 
so make sure we remove that 228 243 String metadata_element_name = MetadataTools.getMetadataElementName(metadata_element_name_full); 244 if(metadata_element_name.equals("gsdlsourcefilerenamemethod")) { 245 // Extract the element value 246 int value_index = line.indexOf(">", name_index) + ">".length(); 247 fileRenameMethod = line.substring(value_index, line.indexOf("<", value_index)); 248 } 249 229 250 // Note which file this is for 230 if (metadata_element_name.equals("gsdlsourcefilename")) {251 else if (metadata_element_name.equals("gsdlsourcefilename")) { 231 252 // Extract the gsdlsourcefilename element value 232 253 int value_index = line.indexOf(">", name_index) + ">".length(); 233 Stringgsdlsourcefilename_value = line.substring(value_index, line.indexOf("<", value_index));254 gsdlsourcefilename_value = line.substring(value_index, line.indexOf("<", value_index)); 234 255 235 256 // We're only interested in the path relative to the import folder … … 238 259 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(import_index + "import".length()); 239 260 240 booleanis_unix_path = gsdlsourcefilename_value.startsWith("/");261 is_unix_path = gsdlsourcefilename_value.startsWith("/"); 241 262 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(1); 242 263 243 // URL decode gsdlsourcefilename. Need to set the decoder to use the default system encoding 244 // This is stored in the System's file.encoding property. 245 gsdlsourcefilename_value = URLDecoder.decode(gsdlsourcefilename_value, System.getProperty("file.encoding")); 264 // (Will decode gsdlsourcefilename at end of this method, once we know 265 // for certain the fileRenameMethod that was used to encode it.) 
246 266 247 267 // Make sure the path matches the OS that is running … … 254 274 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/"); 255 275 } 256 276 277 ///System.err.println("@@@@ Found gsdlsourcefilename: " + gsdlsourcefilename_value); 257 278 // Remember this for quick access later 258 279 if (source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value) == null) { … … 289 310 290 311 buffered_reader.close(); 312 313 // Now that we're done skimming, we actually need to decode gsdlsourcefilename 314 // based on whatever fileRenameMethod was used to encode it, so that we can 315 // at last properly compare against filenames on the file system 316 // in order to load the correct ex.meta for the file. 317 // Now that we should have both gsdlsourcefilename AND fileRenameMethod set, 318 // we can finally perform the decoding of gsdlsourcefilename. 319 if(fileRenameMethod == null) { 320 fileRenameMethod = FILE_RENAME_METHOD_URL; // default for building 321 } 322 // If gsdlsourcefilename was encoded, we remove it from the map under its encoded 323 // filename, decode it and add it back into map using its decoded filename. 324 if(!fileRenameMethod.equals(FILE_RENAME_METHOD_NONE)) { 325 ArrayList value_list = (ArrayList) source_file_name_to_description_elements_mapping.remove(gsdlsourcefilename_value); 326 gsdlsourcefilename_value = decodeSourceFilename(gsdlsourcefilename_value, fileRenameMethod, is_unix_path); 327 source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, value_list); 328 } 291 329 } 292 330 catch (FileNotFoundException exception) { … … 295 333 catch (IOException exception) { 296 334 DebugStream.printStackTrace(exception); 297 } 335 } catch (Exception exception) { // e.g. 
exception decoding gsdlsourcefilename 336 DebugStream.printStackTrace(exception); 337 } 298 338 } 299 300 339 340 protected String decodeSourceFilename(String relative_sourcefile_path, 341 String encodingMethod, boolean is_unix_path) 342 throws Exception 343 { 344 345 ///System.err.println("*** relative_sourcefile_path: " + relative_sourcefile_path); 346 347 // First get the file extension. Both in Base64 and URL encoded strings, 348 // the full-stop character (.) doesn't get encoded. 349 // That means getting the file extension is straightforward. 350 351 // Valid base64: "The 64 characters (hence the name Base64) are 10 digits, 352 // 26 lowercase characters, 26 uppercase characters as well as the 353 // Plus sign (+) and the Forward Slash (/). 354 int fullstop = relative_sourcefile_path.indexOf("."); 355 String file_ext = ""; 356 if(fullstop != -1) { 357 file_ext = relative_sourcefile_path.substring(fullstop); 358 relative_sourcefile_path = relative_sourcefile_path.substring(0, fullstop); 359 } 360 361 String[] importFilePathParts = DocXMLFile.getFilePathParts(relative_sourcefile_path, is_unix_path); 362 363 String decoded_gsdlsourcefilename = ""; 364 365 String separator = is_unix_path ? "/" : "\\"; 366 for(int i = 0; i < importFilePathParts.length; i++) { 367 String decoded_filePathPart = ""; 368 if(encodingMethod.equals(FILE_RENAME_METHOD_URL)) { 369 // URL decode each part of gsdlsourcefilename. 370 // Need to set the decoder to use the default system encoding 371 // This is stored in the System's file.encoding property. 
372 decoded_filePathPart = URLDecoder.decode(importFilePathParts[i], System.getProperty("file.encoding")); 373 } 374 else{ // if(encodingMethod.equals(FILE_RENAME_METHOD_BASE64)) { 375 // Decoding with org.greenstone.gatherer.feedback.Base64 didn't work 376 //byte[] bytes = org.greenstone.gatherer.feedback.Base64.decode(importFilePathParts[i]); 377 // Using org.apache.commons.codec.binary.Base64 instead 378 // https://commons.apache.org/proper/commons-codec/archives/1.7/apidocs/org/apache/commons/codec/binary/Base64.html 379 // General info: https://stackoverflow.com/questions/43089541/difference-between-basic-and-url-base64-encoding-in-java-8 380 byte[] bytes = Base64.decodeBase64(importFilePathParts[i].getBytes()); 381 ///System.err.println("Got base64 string: " + importFilePathParts[i]); 382 ///System.err.println("Decoded from base64 to bytes: " + bytes); 383 // Using system file.encoding to interpret the resulting bytestring as a String, 384 // just as we always did with URL decoding method 385 decoded_filePathPart = (bytes == null) ? importFilePathParts[i] : new String(bytes, System.getProperty("file.encoding")); 386 } 387 388 if(i == 0) { 389 decoded_gsdlsourcefilename = decoded_filePathPart; 390 } else { 391 decoded_gsdlsourcefilename = decoded_gsdlsourcefilename + separator + decoded_filePathPart; 392 } 393 ///System.err.println("Built up: " + decoded_gsdlsourcefilename); 394 } 395 396 // add the file extension back in 397 decoded_gsdlsourcefilename += file_ext; 398 399 ///System.err.println("@@@@ decoded_gsdlsourcefilename: " + Utility.debugUnicodeString(decoded_gsdlsourcefilename)); 400 401 return decoded_gsdlsourcefilename; 402 } 403 404 /** 405 * Given a filepath, returns the parts between each file separator as an array. 
406 * For example, "/Users/me/pinky.txt" should return {"Users", "me", "pinky.txt"}; 407 */ 408 private static String[] getFilePathParts(String filepath, boolean is_unix_path) { 409 StringTokenizer tok; 410 if(is_unix_path) { 411 tok = new StringTokenizer(filepath, "/"); 412 } else { 413 tok = new StringTokenizer(filepath, "\\"); 414 } 415 String[] parts; 416 int count = tok.countTokens(); 417 if(count <= 0) { 418 parts = new String[]{filepath}; 419 } else { 420 int i = 0; 421 parts = new String[count]; 422 while(tok.hasMoreTokens()) { 423 parts[i] = tok.nextToken(); 424 //System.err.println("Next part: " + parts[i]); 425 i++; 426 } 427 } 428 return parts; 429 } 430 301 431 /* 302 432 public ArrayList getMetadataExtractedFromFile(File file) -
main/trunk/greenstone2/perllib/doc.pm
r33416 r33756 261 261 262 262 # print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename "; 263 ## print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename\n"; 263 264 264 265 # URLencode just the gsdl_source_filename, not the directory. Then prepend dir … … 270 271 # $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename); 271 272 # print STDERR "$source_filename\n"; 272 273 274 ## print STDERR "AFTER URL/base64 encoding gsdlsourcefilename: $source_filename\n"; 275 273 276 $self->set_utf8_metadata_element ($self->get_top_section(), 274 277 "gsdlsourcefilename", 275 278 $source_filename); 279 280 # If we set the file renaming method, at least GLI can decode gsdlsourcefilename to get the 281 # original filename (relative path to file) back for GLI to notice what ex Meta is associated with it 282 $self->set_utf8_metadata_element($self->get_top_section(), 283 "gsdlsourcefilerenamemethod", 284 $rename_method); 276 285 } 277 286 … … 284 293 # print STDERR "-> $srcfilename -> "; 285 294 $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method); 295 296 # encode any subdirs of "import" also, but not import itself 297 my $sep = ($ENV{'GSDLOS'} =~ /^windows$/i) ? "\\" : "/"; 298 my @dirs = ($sep eq "\\") ? split(/[\\\/]+/, $dirname) : split(/\//, $dirname); 299 300 my $dirpath = ""; 301 foreach my $subdir (@dirs) { 302 ##print STDERR "@@@@ Found subdir: $subdir\n"; 303 304 # The import folder can be called anything, including in non-ASCII encodings. 305 # Don't need to avoid encoding default import folder called "import", as it gets 306 # URL/base64 encoded to ITSELF. 307 # But can't encode (URL/base64 encode) any $ENV{'GSDLIMPORTDIR'} though if it's set, 308 # as BasePlugout::get_doc_dir() removes any $ENV{'GSDLIMPORTDIR'} prefix. 
Because if 309 # the $ENV{'GSDLIMPORTDIR'} part of gsdlsourcefilename is encoded here, the prefix 310 # won't match with $ENV{'GSDLIMPORTDIR'} 311 312 unless ($ENV{'GSDLIMPORTDIR'} && $subdir eq $ENV{'GSDLIMPORTDIR'}) { 313 $subdir = &util::rename_file($subdir, $rename_method); 314 } 315 $dirpath = &FileUtils::filenameConcatenate($dirpath, $subdir); 316 } 317 if($dirpath ne "") { 318 $dirname = $dirpath; 319 } 320 286 321 $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename); 287 322
Note:
See TracChangeset
for help on using the changeset viewer.