Changeset 18362
- Timestamp:
- 2009-01-12T11:19:23+13:00 (15 years ago)
- Location:
- gli/branches/rtl-gli/src/org/greenstone/gatherer/metadata
- Files:
-
- 4 edited
- 2 copied
Legend:
- Unmodified
- Added
- Removed
-
gli/branches/rtl-gli/src/org/greenstone/gatherer/metadata/DocXMLFile.java
r12041 r18362 30 30 import java.io.*; 31 31 import java.util.*; 32 import java.net.URLDecoder; 32 33 import org.greenstone.gatherer.DebugStream; 33 34 import org.greenstone.gatherer.util.Utility; … … 35 36 36 37 /** This class represents one doc.xml file */ 37 public class DocXMLFile 38 38 39 public abstract class DocXMLFile extends File 39 40 { 40 private HashMap source_file_name_to_description_elements_mapping = new HashMap(); 41 42 43 public DocXMLFile(String doc_xml_file_path) 41 protected HashMap source_file_name_to_description_elements_mapping = new HashMap(); 42 43 protected final String MetadataWrap; 44 protected final String MetadataItem; 45 46 public DocXMLFile(String doc_xml_file_path, String metaWrap, String metaItem) 44 47 { 45 48 super(doc_xml_file_path); 49 this.MetadataWrap = metaWrap; 50 this.MetadataItem = metaItem; 46 51 } 47 52 … … 58 63 } 59 64 60 // Check whether this doc.xmlfile contains extracted metadata for the specified file65 // Check whether this file (i.e. doc.xml or docmets.xml on inheritance) file contains extracted metadata for the specified file 61 66 ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path); 62 67 if (description_elements_list == null) { … … 67 72 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE); 68 73 69 // Parse the doc.xmlfile70 DebugStream.println("Applicable doc.xmlfile: " + this);74 // Parse the file 75 DebugStream.println("Applicable file: " + this); 71 76 try { 72 77 BufferedReader buffered_reader = new BufferedReader(new FileReader(this)); … … 78 83 String line = null; 79 84 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) { 80 // Check if this line contains the start of a relevant Description element 85 // Check if this line contains the start of a relevant "Description" element 86 // (mets:xmlData in METS parlance, Description in GreenstoneArchive format) 81 87 if (line_num == next_description_element_start) { 82 88 in_relevant_description_element = true; … … 90 96 91 97 // Check if this line contains the end of the relevant Description element 92 if (line.indexOf("</ Description>") != -1) {98 if (line.indexOf("</"+MetadataWrap+">") != -1) { 93 99 description_element_num++; 94 100 if (description_element_num == description_elements_list.size()) { … … 102 108 103 109 // If this line doesn't contain a complete Metadata element, we're not interested 104 if (line.indexOf("< Metadata ") == -1 || line.indexOf("</Metadata>") == -1) {110 if (line.indexOf("<"+MetadataItem+" ") == -1 || line.indexOf("</"+MetadataItem+">") == -1) { 105 111 continue; 106 112 } … … 134 140 // Value trees are not stored for extracted metadata, so create a new value tree node now 135 141 int value_index = line.indexOf(">", name_index) + ">".length(); 136 String metadata_element_value = line.substring(value_index, line.lastIndexOf("</ Metadata>"));142 String metadata_element_value = line.substring(value_index, line.lastIndexOf("</"+MetadataItem+">")); 137 143 138 144 metadata_element.addMetadataValue(metadata_element_value); … … 157 163 158 164 165 166 159 167 /** 160 * Every doc.xmlfile must be skimmed when a collection is opened, for two reasons:168 * Every file must be skimmed when a collection is opened, for two reasons: 161 169 * - To build a mapping from source file to its corresponding doc.xml file 162 170 * - To get a complete list of all extracted metadata elements … … 166 174 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE); 167 175 168 // Skim the doc.xmlfile as quickly as possible (don't parse as XML), looking at the Metadata elements169 DebugStream.println("Skimming doc.xml file" + this + "...");176 // Skim the file as quickly as possible (don't parse as XML), looking at the Metadata elements 177 DebugStream.println("Skimming " + this + "..."); 170 178 try { 171 179 BufferedReader buffered_reader = new BufferedReader(new FileReader(this)); … … 174 182 String line = null; 175 183 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) { 176 // This line contains the start of a Description element 177 if (line.indexOf("<Description>") != -1) { 184 // This line contains the start of a "MetadataWrap" element 185 // (mets:xmlData in METS parlance, Description in GreenstoneArchive format) 186 if (line.indexOf("<"+MetadataWrap+">") != -1) { 178 187 if (description_element_start != -1) { 179 System.err.println("Parse error: previous Descriptionelement unfinished!");188 System.err.println("Parse error: previous " + MetadataWrap + " element unfinished!"); 180 189 } 181 190 description_element_start = line_num; … … 183 192 } 184 193 185 // This line contains the end of a Descriptionelement186 if (line.indexOf("</ Description>") != -1) {194 // This line contains the end of a "MetadataWrap" element 195 if (line.indexOf("</"+MetadataWrap+">") != -1) { 187 196 if (description_element_start == -1) { 188 System.err.println("Parse error: Descriptionelement unstarted!");197 System.err.println("Parse error: "+MetadataWrap+" element unstarted!"); 189 198 } 190 199 description_element_start = -1; … … 192 201 } 193 202 194 // If we're not in a Descriptionelement there shouldn't be any Metadata elements203 // If we're not in a"MetadataWrap" element there shouldn't be any Metadata elements 195 204 if (description_element_start == -1) { 196 205 continue; … … 198 207 199 208 // This line doesn't contain a Metadata element, so we're not interested 200 if (line.indexOf("< Metadata") == -1) {201 DebugStream.println("Warning: Descriptionelement line doesn't contain Metadata element.");209 if (line.indexOf("<"+MetadataItem+" ") == -1) { 210 DebugStream.println("Warning: "+MetadataWrap+" element line doesn't contain Metadata element."); 202 211 continue; 203 212 } … … 216 225 String metadata_element_name = metadata_element_name_full; 217 226 218 // Note which file this doc.xmlis for227 // Note which file this is for 219 228 if (metadata_element_name.equals("gsdlsourcefilename")) { 220 229 // Extract the gsdlsourcefilename element value … … 229 238 boolean is_unix_path = gsdlsourcefilename_value.startsWith("/"); 230 239 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(1); 240 241 // URL decode gsdlsourcefilename. Need to set the decoder to use the default system encoding 242 // This is stored in the System's file.encoding property. 243 gsdlsourcefilename_value = URLDecoder.decode(gsdlsourcefilename_value, System.getProperty("file.encoding")); 231 244 232 245 // Make sure the path matches the OS that is running … … 248 261 } 249 262 263 // Warn about an odd gsdlsourcefilename, except if it is the Greenstone "tmp" directory or 264 // (as in the case of using FLI) if it is the etc/collect.cfg or etc/collectionConfig.xml file 265 // which are the gsdlsourcefilenames for the fedora digital object representing a collection. 266 // This (tmp dir) is true when the source files come from a zip file processed by ZIPPlug, for example 267 else if (gsdlsourcefilename_value.indexOf("tmp") == -1 268 && !gsdlsourcefilename_value.endsWith("collect.cfg") 269 && !gsdlsourcefilename_value.endsWith("collectionConfig.xml")) { 270 // We don't really know what is going on... 271 System.err.println("Warning: Could not understand gsdlsourcefilename " + gsdlsourcefilename_value); 272 } 273 } 274 275 // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata) 276 if (metadata_element_name.startsWith("gsdl")) { 277 continue; 278 } 279 280 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name); 281 if (metadata_element == null) { 282 // This element isn't defined in ex.mds, so create it for this session 283 DebugStream.println("Extracted metadata element not defined: " + metadata_element_name); 284 extracted_metadata_set.addMetadataElementForThisSession(metadata_element_name); 285 } 286 } 287 288 buffered_reader.close(); 289 } 290 catch (FileNotFoundException exception) { 291 DebugStream.printStackTrace(exception); 292 } 293 catch (IOException exception) { 294 DebugStream.printStackTrace(exception); 295 } 296 } 297 298 299 /* 300 public ArrayList getMetadataExtractedFromFile(File file) 301 { 302 // Build up a list of metadata extracted from this file 303 ArrayList metadata_values = new ArrayList(); 304 305 String file_relative_path = file.getAbsolutePath(); 306 int import_index = file_relative_path.indexOf("import"); 307 if (import_index != -1) { 308 file_relative_path = file_relative_path.substring(import_index + "import".length() + 1); 309 } 310 311 // Check whether this doc.xml file contains extracted metadata for the specified file 312 ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path); 313 if (description_elements_list == null) { 314 // ...it doesn't 315 return metadata_values; 316 } 317 318 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE); 319 320 // Parse the doc.xml file 321 DebugStream.println("Applicable doc.xml file: " + this); 322 try { 323 BufferedReader buffered_reader = new BufferedReader(new FileReader(this)); 324 325 int description_element_num = 0; 326 int next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue(); 327 boolean in_relevant_description_element = false; 328 329 String line = null; 330 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) { 331 // Check if this line contains the start of a relevant Description element 332 if (line_num == next_description_element_start) { 333 in_relevant_description_element = true; 334 continue; 335 } 336 337 // If we're not in a relevant Description element we don't care about anything 338 if (in_relevant_description_element == false) { 339 continue; 340 } 341 342 // Check if this line contains the end of the relevant Description element 343 if (line.indexOf("</Description>") != -1) { 344 description_element_num++; 345 if (description_element_num == description_elements_list.size()) { 346 break; 347 } 348 349 next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue(); 350 in_relevant_description_element = false; 351 continue; 352 } 353 354 // If this line doesn't contain a complete Metadata element, we're not interested 355 if (line.indexOf("<Metadata ") == -1 || line.indexOf("</Metadata>") == -1) { 356 continue; 357 } 358 359 // Extract the metadata element name 360 int name_index = line.indexOf(" name=\"") + " name=\"".length(); 361 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index)); 362 363 // If the metadata has a namespace it isn't extracted metadata, so we're not interested 364 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full); 365 if (!metadata_set_namespace.equals("")) { 366 continue; 367 } 368 369 // Extracted metadata! 370 String metadata_element_name = metadata_element_name_full; 371 372 // We completely ignore bibliographic data 373 if (metadata_element_name.equals("SourceSegment")) { 374 buffered_reader.close(); 375 return new ArrayList(); 376 } 377 378 // Ignore metadata starting with gsdl (used to be lowercase metadata and /metadata) 379 if (metadata_element_name.startsWith("gsdl")) { 380 continue; 381 } 382 383 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name); 384 385 // Value trees are not stored for extracted metadata, so create a new value tree node now 386 int value_index = line.indexOf(">", name_index) + ">".length(); 387 String metadata_element_value = line.substring(value_index, line.lastIndexOf("</Metadata>")); 388 389 metadata_element.addMetadataValue(metadata_element_value); 390 MetadataValueTreeNode metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(metadata_element_value); 391 392 // Add the new metadata value to the list 393 MetadataValue metadata_value = new MetadataValue(metadata_element, metadata_value_tree_node); 394 metadata_values.add(metadata_value); 395 } 396 397 buffered_reader.close(); 398 } 399 catch (FileNotFoundException exception) { 400 DebugStream.printStackTrace(exception); 401 } 402 catch (IOException exception) { 403 DebugStream.printStackTrace(exception); 404 } 405 406 return metadata_values; 407 } 408 409 */ 410 411 /** 412 * Every doc.xml file must be skimmed when a collection is opened, for two reasons: 413 * - To build a mapping from source file to its corresponding doc.xml file 414 * - To get a complete list of all extracted metadata elements 415 */ 416 /* 417 public void skimFile() 418 { 419 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE); 420 421 // Skim the doc.xml file as quickly as possible (don't parse as XML), looking at the Metadata elements 422 DebugStream.println("Skimming " + this + "..."); 423 try { 424 BufferedReader buffered_reader = new BufferedReader(new FileReader(this)); 425 int description_element_start = -1; 426 427 String line = null; 428 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) { 429 // This line contains the start of a Description element 430 if (line.indexOf("<Description>") != -1) { 431 if (description_element_start != -1) { 432 System.err.println("Parse error: previous Description element unfinished!"); 433 } 434 description_element_start = line_num; 435 continue; 436 } 437 438 // This line contains the end of a Description element 439 if (line.indexOf("</Description>") != -1) { 440 if (description_element_start == -1) { 441 System.err.println("Parse error: Description element unstarted!"); 442 } 443 description_element_start = -1; 444 continue; 445 } 446 447 // If we're not in a Description element there shouldn't be any Metadata elements 448 if (description_element_start == -1) { 449 continue; 450 } 451 452 // This line doesn't contain a Metadata element, so we're not interested 453 if (line.indexOf("<Metadata ") == -1) { 454 DebugStream.println("Warning: Description element line doesn't contain Metadata element."); 455 continue; 456 } 457 458 // Extract the metadata element name 459 int name_index = line.indexOf(" name=\"") + " name=\"".length(); 460 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index)); 461 462 // If the metadata has a namespace it isn't extracted metadata, so we're not interested 463 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full); 464 if (!metadata_set_namespace.equals("")) { 465 continue; 466 } 467 468 // Extracted metadata! 469 String metadata_element_name = metadata_element_name_full; 470 471 // Note which file this doc.xml is for 472 if (metadata_element_name.equals("gsdlsourcefilename")) { 473 // Extract the gsdlsourcefilename element value 474 int value_index = line.indexOf(">", name_index) + ">".length(); 475 String gsdlsourcefilename_value = line.substring(value_index, line.indexOf("<", value_index)); 476 477 // We're only interested in the path relative to the import folder 478 int import_index = gsdlsourcefilename_value.indexOf("import"); 479 if (import_index != -1) { 480 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(import_index + "import".length()); 481 482 boolean is_unix_path = gsdlsourcefilename_value.startsWith("/"); 483 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(1); 484 485 // URL decode gsdlsourcefilename. Need to set the decoder to use the default system encoding 486 // This is stored in the System's file.encoding property. 487 gsdlsourcefilename_value = URLDecoder.decode(gsdlsourcefilename_value, System.getProperty("file.encoding")); 488 489 // Make sure the path matches the OS that is running 490 if (is_unix_path && Utility.isWindows()) { 491 // Convert path from Unix to Windows 492 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\/", "\\\\"); 493 } 494 else if (!is_unix_path && !Utility.isWindows()) { 495 // Convert path from Windows to Unix 496 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\\\", "/"); 497 } 498 499 // Remember this for quick access later 500 if (source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value) == null) { 501 source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, new ArrayList()); 502 } 503 504 ((ArrayList) source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value)).add(new Integer(description_element_start)); 505 } 506 250 507 // Warn about an odd gsdlsourcefilename, except if it is the Greenstone "tmp" directory 251 508 // This is true when the source files come from a zip file processed by ZIPPlug, for example … … 278 535 } 279 536 } 537 */ 538 280 539 } -
gli/branches/rtl-gli/src/org/greenstone/gatherer/metadata/DocXMLFileManager.java
r13337 r18362 60 60 61 61 62 static public void loadDocXMLFiles(File directory )62 static public void loadDocXMLFiles(File directory, String filename_match) 63 63 { 64 64 // Make sure the directory (archives) exists … … 72 72 File child_file = directory_files[i]; 73 73 if (child_file.isDirectory()) { 74 loadDocXMLFiles(child_file );74 loadDocXMLFiles(child_file,filename_match); 75 75 } 76 else if (child_file.getName().equals("doc.xml")) { 77 loadDocXMLFile(child_file); 76 else if (child_file.getName().equals(filename_match)) { 77 // e.g. doc.xml (for regular Greenstone, docmets.xml for Fedora) 78 79 loadDocXMLFile(child_file,filename_match); 78 80 } 79 81 } … … 81 83 82 84 83 static private void loadDocXMLFile(File doc_xml_file_file )85 static private void loadDocXMLFile(File doc_xml_file_file,String filename_match) 84 86 { 85 DocXMLFile doc_xml_file = new DocXMLFile(doc_xml_file_file.getAbsolutePath()); 86 try { 87 String file = doc_xml_file_file.getAbsolutePath(); 88 89 // Need to do typecasts in the following to keep Java 1.4 happy 90 DocXMLFile doc_xml_file 91 = (filename_match.equals("docmets.xml")) 92 ? (DocXMLFile) new DocMetsXMLFile(file) 93 : (DocXMLFile) new DocGAFile(file); 94 95 try { 87 96 doc_xml_file.skimFile(); 88 97 doc_xml_files.add(doc_xml_file); -
gli/branches/rtl-gli/src/org/greenstone/gatherer/metadata/MetadataXMLFileManager.java
r13818 r18362 368 368 369 369 // Upload the files modified since last time, then reset the list 370 RemoteGreenstoneServer.uploadCollectionFiles(CollectionManager.getLoadedCollectionName(), (File[]) modified_metadata_xml_files.toArray(new File[0])); 370 Gatherer.remoteGreenstoneServer.uploadCollectionFiles( 371 CollectionManager.getLoadedCollectionName(), (File[]) modified_metadata_xml_files.toArray(new File[0])); 371 372 modified_metadata_xml_files.clear(); 372 373 } -
gli/branches/rtl-gli/src/org/greenstone/gatherer/metadata/ProfileXMLFile.java
r13808 r18362 110 110 // This is inefficient but for simplicity we'll just upload the file every time it is changed 111 111 if (Gatherer.isGsdlRemote) { 112 RemoteGreenstoneServer.uploadCollectionFile(CollectionManager.getLoadedCollectionName(), this);112 Gatherer.remoteGreenstoneServer.uploadCollectionFile(CollectionManager.getLoadedCollectionName(), this); 113 113 } 114 114
Note:
See TracChangeset
for help on using the changeset viewer.