Changeset 17009
- Timestamp:
- 2008-08-26T17:23:26+12:00 (16 years ago)
- Location:
- gli/trunk/src/org/greenstone/gatherer
- Files:
-
- 4 edited
Legend:
- Unmodified
- Added
- Removed
-
gli/trunk/src/org/greenstone/gatherer/collection/CollectionManager.java
r16987 r17009 1315 1315 1316 1316 // Read through the doc.xml files in the archives directory 1317 File collection_archives_directory = new File(getLoadedCollectionArchivesDirectoryPath()); 1317 1318 1318 DocXMLFileManager.clearDocXMLFiles(); 1319 DocXMLFileManager.loadDocXMLFiles(collection_archives_directory); 1319 1320 if (Configuration.fedora_info.isActive()) { // FLI case 1321 File collection_export_directory = new File(getLoadedCollectionExportDirectoryPath()); 1322 DocXMLFileManager.loadDocXMLFiles(collection_export_directory,"docmets.xml"); 1323 } 1324 else { 1325 File collection_archives_directory = new File(getLoadedCollectionArchivesDirectoryPath()); 1326 DocXMLFileManager.loadDocXMLFiles(collection_archives_directory,"doc.xml"); 1327 } 1328 1320 1329 1321 1330 // Get a list of the collection specific classifiers and plugins … … 1358 1367 * more situations. */ 1359 1368 public static boolean canDoScheduling() { 1369 // Would be nice to support more of these, rather than returning false 1360 1370 if(Gatherer.isGsdlRemote) { 1361 1371 return false; 1362 1372 } 1363 1373 if(Gatherer.GS3) { 1374 return false; 1375 } 1376 if (Configuration.fedora_info.isActive()) { 1364 1377 return false; 1365 1378 } -
gli/trunk/src/org/greenstone/gatherer/metadata/DocXMLFile.java
r16671 r17009 36 36 37 37 /** This class represents one doc.xml file */ 38 public class DocXMLFile 39 38 39 public abstract class DocXMLFile extends File 40 40 { 41 private HashMap source_file_name_to_description_elements_mapping = new HashMap(); 42 41 protected HashMap source_file_name_to_description_elements_mapping = new HashMap(); 42 43 protected static String MetadataWrap = null; 44 protected static String MetadataItem = null; 43 45 44 46 public DocXMLFile(String doc_xml_file_path) … … 59 61 } 60 62 61 // Check whether this doc.xmlfile contains extracted metadata for the specified file63 // Check whether this file (i.e. doc.xml or docmets.xml on inheritance) file contains extracted metadata for the specified file 62 64 ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path); 63 65 if (description_elements_list == null) { … … 68 70 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE); 69 71 70 // Parse the doc.xmlfile71 DebugStream.println("Applicable doc.xmlfile: " + this);72 // Parse the file 73 DebugStream.println("Applicable file: " + this); 72 74 try { 73 75 BufferedReader buffered_reader = new BufferedReader(new FileReader(this)); … … 79 81 String line = null; 80 82 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) { 81 // Check if this line contains the start of a relevant Description element 83 // Check if this line contains the start of a relevant "Description" element 84 // (mets:xmlData in METS parlance, Description in GreenstoneArchive format) 82 85 if (line_num == next_description_element_start) { 83 86 in_relevant_description_element = true; … … 91 94 92 95 // Check if this line contains the end of the relevant Description element 93 if (line.indexOf("</ Description>") != -1) {96 if (line.indexOf("</"+MetadataWrap+">") != -1) { 94 97 description_element_num++; 95 98 if (description_element_num == description_elements_list.size()) { … … 103 106 104 107 // If this line doesn't contain a complete Metadata element, we're not interested 105 if (line.indexOf("< Metadata ") == -1 || line.indexOf("</Metadata>") == -1) {108 if (line.indexOf("<"+MetadataItem+" ") == -1 || line.indexOf("</"+MetadataItem+">") == -1) { 106 109 continue; 107 110 } … … 135 138 // Value trees are not stored for extracted metadata, so create a new value tree node now 136 139 int value_index = line.indexOf(">", name_index) + ">".length(); 137 String metadata_element_value = line.substring(value_index, line.lastIndexOf("</ Metadata>"));140 String metadata_element_value = line.substring(value_index, line.lastIndexOf("</"+MetadataItem+">")); 138 141 139 142 metadata_element.addMetadataValue(metadata_element_value); … … 158 161 159 162 163 164 160 165 /** 161 * Every doc.xmlfile must be skimmed when a collection is opened, for two reasons:166 * Every file must be skimmed when a collection is opened, for two reasons: 162 167 * - To build a mapping from source file to its corresponding doc.xml file 163 168 * - To get a complete list of all extracted metadata elements … … 167 172 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE); 168 173 169 // Skim the doc.xmlfile as quickly as possible (don't parse as XML), looking at the Metadata elements170 DebugStream.println("Skimming doc.xml file" + this + "...");174 // Skim the file as quickly as possible (don't parse as XML), looking at the Metadata elements 175 DebugStream.println("Skimming " + this + "..."); 171 176 try { 172 177 BufferedReader buffered_reader = new BufferedReader(new FileReader(this)); … … 175 180 String line = null; 176 181 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) { 177 // This line contains the start of a Description element 178 if (line.indexOf("<Description>") != -1) { 182 // This line contains the start of a "MetadataWrap" element 183 // (mets:xmlData in METS parlance, Description in GreenstoneArchive format) 184 if (line.indexOf("<"+MetadataWrap+">") != -1) { 179 185 if (description_element_start != -1) { 180 System.err.println("Parse error: previous Descriptionelement unfinished!");186 System.err.println("Parse error: previous " + MetadataWrap + " element unfinished!"); 181 187 } 182 188 description_element_start = line_num; … … 184 190 } 185 191 186 // This line contains the end of a Descriptionelement187 if (line.indexOf("</ Description>") != -1) {192 // This line contains the end of a "MetadataWrap" element 193 if (line.indexOf("</"+MetadataWrap+">") != -1) { 188 194 if (description_element_start == -1) { 189 System.err.println("Parse error: Descriptionelement unstarted!");195 System.err.println("Parse error: "+MetadataWrap+" element unstarted!"); 190 196 } 191 197 description_element_start = -1; … … 193 199 } 194 200 195 // If we're not in a Descriptionelement there shouldn't be any Metadata elements201 // If we're not in a"MetadataWrap" element there shouldn't be any Metadata elements 196 202 if (description_element_start == -1) { 197 203 continue; … … 199 205 200 206 // This line doesn't contain a Metadata element, so we're not interested 201 if (line.indexOf("< Metadata") == -1) {202 DebugStream.println("Warning: Descriptionelement line doesn't contain Metadata element.");207 if (line.indexOf("<"+MetadataItem+" ") == -1) { 208 DebugStream.println("Warning: "+MetadataWrap+" element line doesn't contain Metadata element."); 203 209 continue; 204 210 } … … 217 223 String metadata_element_name = metadata_element_name_full; 218 224 219 // Note which file this doc.xmlis for225 // Note which file this is for 220 226 if (metadata_element_name.equals("gsdlsourcefilename")) { 221 227 // Extract the gsdlsourcefilename element value … … 283 289 } 284 290 } 291 292 293 /* 294 public ArrayList getMetadataExtractedFromFile(File file) 295 { 296 // Build up a list of metadata extracted from this file 297 ArrayList metadata_values = new ArrayList(); 298 299 String file_relative_path = file.getAbsolutePath(); 300 int import_index = file_relative_path.indexOf("import"); 301 if (import_index != -1) { 302 file_relative_path = file_relative_path.substring(import_index + "import".length() + 1); 303 } 304 305 // Check whether this doc.xml file contains extracted metadata for the specified file 306 ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path); 307 if (description_elements_list == null) { 308 // ...it doesn't 309 return metadata_values; 310 } 311 312 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE); 313 314 // Parse the doc.xml file 315 DebugStream.println("Applicable doc.xml file: " + this); 316 try { 317 BufferedReader buffered_reader = new BufferedReader(new FileReader(this)); 318 319 int description_element_num = 0; 320 int next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue(); 321 boolean in_relevant_description_element = false; 322 323 String line = null; 324 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) { 325 // Check if this line contains the start of a relevant Description element 326 if (line_num == next_description_element_start) { 327 in_relevant_description_element = true; 328 continue; 329 } 330 331 // If we're not in a relevant Description element we don't care about anything 332 if (in_relevant_description_element == false) { 333 continue; 334 } 335 336 // Check if this line contains the end of the relevant Description element 337 if (line.indexOf("</Description>") != -1) { 338 description_element_num++; 339 if (description_element_num == description_elements_list.size()) { 340 break; 341 } 342 343 next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue(); 344 in_relevant_description_element = false; 345 continue; 346 } 347 348 // If this line doesn't contain a complete Metadata element, we're not interested 349 if (line.indexOf("<Metadata ") == -1 || line.indexOf("</Metadata>") == -1) { 350 continue; 351 } 352 353 // Extract the metadata element name 354 int name_index = line.indexOf(" name=\"") + " name=\"".length(); 355 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index)); 356 357 // If the metadata has a namespace it isn't extracted metadata, so we're not interested 358 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full); 359 if (!metadata_set_namespace.equals("")) { 360 continue; 361 } 362 363 // Extracted metadata! 364 String metadata_element_name = metadata_element_name_full; 365 366 // We completely ignore bibliographic data 367 if (metadata_element_name.equals("SourceSegment")) { 368 buffered_reader.close(); 369 return new ArrayList(); 370 } 371 372 // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata) 373 if (metadata_element_name.startsWith("gsdl")) { 374 continue; 375 } 376 377 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name); 378 379 // Value trees are not stored for extracted metadata, so create a new value tree node now 380 int value_index = line.indexOf(">", name_index) + ">".length(); 381 String metadata_element_value = line.substring(value_index, line.lastIndexOf("</Metadata>")); 382 383 metadata_element.addMetadataValue(metadata_element_value); 384 MetadataValueTreeNode metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(metadata_element_value); 385 386 // Add the new metadata value to the list 387 MetadataValue metadata_value = new MetadataValue(metadata_element, metadata_value_tree_node); 388 metadata_values.add(metadata_value); 389 } 390 391 buffered_reader.close(); 392 } 393 catch (FileNotFoundException exception) { 394 DebugStream.printStackTrace(exception); 395 } 396 catch (IOException exception) { 397 DebugStream.printStackTrace(exception); 398 } 399 400 return metadata_values; 401 } 402 403 */ 404 405 /** 406 * Every doc.xml file must be skimmed when a collection is opened, for two reasons: 407 * - To build a mapping from source file to its corresponding doc.xml file 408 * - To get a complete list of all extracted metadata elements 409 */ 410 /* 411 public void skimFile() 412 { 413 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE); 414 415 // Skim the doc.xml file as quickly as possible (don't parse as XML), looking at the Metadata elements 416 DebugStream.println("Skimming " + this + "..."); 417 try { 418 BufferedReader buffered_reader = new BufferedReader(new FileReader(this)); 419 int description_element_start = -1; 420 421 String line = null; 422 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) { 423 // This line contains the start of a Description element 424 if (line.indexOf("<Description>") != -1) { 425 if (description_element_start != -1) { 426 System.err.println("Parse error: previous Description element unfinished!"); 427 } 428 description_element_start = line_num; 429 continue; 430 } 431 432 // This line contains the end of a Description element 433 if (line.indexOf("</Description>") != -1) { 434 if (description_element_start == -1) { 435 System.err.println("Parse error: Description element unstarted!"); 436 } 437 description_element_start = -1; 438 continue; 439 } 440 441 // If we're not in a Description element there shouldn't be any Metadata elements 442 if (description_element_start == -1) { 443 continue; 444 } 445 446 // This line doesn't contain a Metadata element, so we're not interested 447 if (line.indexOf("<Metadata ") == -1) { 448 DebugStream.println("Warning: Description element line doesn't contain Metadata element."); 449 continue; 450 } 451 452 // Extract the metadata element name 453 int name_index = line.indexOf(" name=\"") + " name=\"".length(); 454 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index)); 455 456 // If the metadata has a namespace it isn't extracted metadata, so we're not interested 457 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full); 458 if (!metadata_set_namespace.equals("")) { 459 continue; 460 } 461 462 // Extracted metadata! 463 String metadata_element_name = metadata_element_name_full; 464 465 // Note which file this doc.xml is for 466 if (metadata_element_name.equals("gsdlsourcefilename")) { 467 // Extract the gsdlsourcefilename element value 468 int value_index = line.indexOf(">", name_index) + ">".length(); 469 String gsdlsourcefilename_value = line.substring(value_index, line.indexOf("<", value_index)); 470 471 // We're only interested in the path relative to the import folder 472 int import_index = gsdlsourcefilename_value.indexOf("import"); 473 if (import_index != -1) { 474 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(import_index + "import".length()); 475 476 boolean is_unix_path = gsdlsourcefilename_value.startsWith("/"); 477 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(1); 478 479 // URL decode gsdlsourcefilename. Need to set the decoder to use the default system encoding 480 // This is stored in the System's file.encoding property. 481 gsdlsourcefilename_value = URLDecoder.decode(gsdlsourcefilename_value, System.getProperty("file.encoding")); 482 483 // Make sure the path matches the OS that is running 484 if (is_unix_path && Utility.isWindows()) { 485 // Convert path from Unix to Windows 486 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\/", "\\\\"); 487 } 488 else if (!is_unix_path && !Utility.isWindows()) { 489 // Convert path from Windows to Unix 490 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/"); 491 } 492 493 // Remember this for quick access later 494 if (source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value) == null) { 495 source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, new ArrayList()); 496 } 497 498 ((ArrayList) source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value)).add(new Integer(description_element_start)); 499 } 500 501 // Warn about an odd gsdlsourcefilename, except if it is the Greenstone "tmp" directory 502 // This is true when the source files come from a zip file processed by ZIPPlug, for example 503 else if (gsdlsourcefilename_value.indexOf("tmp") == -1) { 504 // We don't really know what is going on... 505 System.err.println("Warning: Could not understand gsdlsourcefilename " + gsdlsourcefilename_value); 506 } 507 } 508 509 // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata) 510 if (metadata_element_name.startsWith("gsdl")) { 511 continue; 512 } 513 514 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name); 515 if (metadata_element == null) { 516 // This element isn't defined in ex.mds, so create it for this session 517 DebugStream.println("Extracted metadata element not defined: " + metadata_element_name); 518 extracted_metadata_set.addMetadataElementForThisSession(metadata_element_name); 519 } 520 } 521 522 buffered_reader.close(); 523 } 524 catch (FileNotFoundException exception) { 525 DebugStream.printStackTrace(exception); 526 } 527 catch (IOException exception) { 528 DebugStream.printStackTrace(exception); 529 } 530 } 531 */ 532 285 533 } -
gli/trunk/src/org/greenstone/gatherer/metadata/DocXMLFileManager.java
r13337 r17009 60 60 61 61 62 static public void loadDocXMLFiles(File directory )62 static public void loadDocXMLFiles(File directory, String filename_match) 63 63 { 64 64 // Make sure the directory (archives) exists … … 72 72 File child_file = directory_files[i]; 73 73 if (child_file.isDirectory()) { 74 loadDocXMLFiles(child_file );74 loadDocXMLFiles(child_file,filename_match); 75 75 } 76 else if (child_file.getName().equals("doc.xml")) { 77 loadDocXMLFile(child_file); 76 else if (child_file.getName().equals(filename_match)) { 77 // e.g. doc.xml (for regular Greenstone, docmets.xml for Fedora) 78 79 loadDocXMLFile(child_file,filename_match); 78 80 } 79 81 } … … 81 83 82 84 83 static private void loadDocXMLFile(File doc_xml_file_file )85 static private void loadDocXMLFile(File doc_xml_file_file,String filename_match) 84 86 { 85 DocXMLFile doc_xml_file = new DocXMLFile(doc_xml_file_file.getAbsolutePath()); 86 try { 87 String file = doc_xml_file_file.getAbsolutePath(); 88 89 DocXMLFile doc_xml_file 90 = (filename_match.equals("docmets.xml")) ? new DocMetsXMLFile(file) : new DocGAFile(file); 91 92 try { 87 93 doc_xml_file.skimFile(); 88 94 doc_xml_files.add(doc_xml_file); -
gli/trunk/src/org/greenstone/gatherer/shell/GShell.java
r16129 r17009 405 405 fireMessage(type, typeAsString(type) + "> " + Dictionary.get("GShell.Parsing_Metadata_Start"), status, null); 406 406 DocXMLFileManager.clearDocXMLFiles(); 407 DocXMLFileManager.loadDocXMLFiles(new File(CollectionManager.getLoadedCollectionArchivesDirectoryPath())); 407 if (Configuration.fedora_info.isActive()) { // FLI case 408 File collection_export_directory = new File(CollectionManager.getLoadedCollectionExportDirectoryPath()); 409 DocXMLFileManager.loadDocXMLFiles(collection_export_directory,"docmets.xml"); 410 } 411 else { 412 File collection_archives_directory = new File(CollectionManager.getLoadedCollectionArchivesDirectoryPath()); 413 DocXMLFileManager.loadDocXMLFiles(collection_archives_directory,"doc.xml"); 414 } 415 416 408 417 fireMessage(type, typeAsString(type) + "> " + Dictionary.get("GShell.Parsing_Metadata_Complete"), status, null); 409 418 }
Note:
See TracChangeset
for help on using the changeset viewer.