Changeset 8204
- Timestamp:
- 2004-10-01T15:23:57+12:00 (20 years ago)
- File:
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gli/src/org/greenstone/gatherer/metadata/DocXMLFile.java
r8165 r8204 32 32 import org.greenstone.gatherer.Gatherer; 33 33 import org.greenstone.gatherer.util.Utility; 34 import org.greenstone.gatherer.util.XMLTools;35 import org.w3c.dom.*;36 34 37 35 … … 40 38 extends File 41 39 { 42 static final private String ARCHIVE_ELEMENT = "Archive"; 43 static final private String DESCRIPTION_ELEMENT = "Description"; 44 static final private String METADATA_ELEMENT = "Metadata"; 45 static final private String SECTION_ELEMENT = "Section"; 46 47 private ArrayList files_in_doc_xml_file = new ArrayList(); 40 private HashMap source_file_name_to_description_elements_mapping = new HashMap(); 48 41 49 42 … … 66 59 67 60 // Check whether this doc.xml file contains extracted metadata for the specified file 68 boolean contains_extracted_metadata_for_file = false; 69 for (int i = 0; i < files_in_doc_xml_file.size(); i++) { 70 if (file_relative_path.equals(files_in_doc_xml_file.get(i))) { 71 contains_extracted_metadata_for_file = true; 72 break; 61 ArrayList description_elements_list = (ArrayList) source_file_name_to_description_elements_mapping.get(file_relative_path); 62 if (description_elements_list == null) { 63 // ...it doesn't 64 return metadata_values; 65 } 66 67 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE); 68 69 // Parse the doc.xml file 70 System.err.println("Applicable doc.xml file: " + this); 71 try { 72 BufferedReader buffered_reader = new BufferedReader(new FileReader(this)); 73 74 int description_element_num = 0; 75 int next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue(); 76 boolean in_relevant_description_element = false; 77 78 String line = null; 79 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) { 80 // Check if this line contains the start of a relevant Description element 81 if (line_num == next_description_element_start) { 82 in_relevant_description_element = 
true; 83 continue; 84 } 85 86 // If we're not in a relevant Description element we don't care about anything 87 if (in_relevant_description_element == false) { 88 continue; 89 } 90 91 // Check if this line contains the end of the relevant Description element 92 if (line.indexOf("</Description>") != -1) { 93 description_element_num++; 94 if (description_element_num == description_elements_list.size()) { 95 break; 96 } 97 98 next_description_element_start = ((Integer) description_elements_list.get(description_element_num)).intValue(); 99 in_relevant_description_element = false; 100 continue; 101 } 102 103 // If this line doesn't contain a Metadata element, we're not interested 104 if (line.indexOf("<Metadata ") == -1) { 105 continue; 106 } 107 108 // Extract the metadata element name 109 int name_index = line.indexOf(" name=\"") + " name=\"".length(); 110 String metadata_element_name_full = line.substring(name_index, line.indexOf("\"", name_index)); 111 112 // If the metadata has a namespace it isn't extracted metadata, so we're not interested 113 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full); 114 if (!metadata_set_namespace.equals("")) { 115 continue; 116 } 117 118 // Extracted metadata! 119 String metadata_element_name = metadata_element_name_full; 120 121 // We completely ignore bibliographic data 122 if (metadata_element_name.equals("SourceSegment")) { 123 return new ArrayList(); 124 } 125 126 // Ignore lower-case metadata elements (gsdlsourcefilename, gsdlassocfile etc.) 
127 // and those starting with '/' (/srclink) 128 char first_character = metadata_element_name.charAt(0); 129 if (Character.isLowerCase(first_character) || first_character == '/') { 130 continue; 131 } 132 133 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name); 134 135 // Value trees are not stored for extracted metadata, so create a new value tree node now 136 int value_index = line.indexOf(">", name_index) + ">".length(); 137 String metadata_element_value = line.substring(value_index, line.lastIndexOf("</Metadata>")); 138 139 metadata_element.addMetadataValue(metadata_element_value); 140 MetadataValueTreeNode metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(metadata_element_value); 141 142 // Add the new metadata value to the list 143 MetadataValue metadata_value = new MetadataValue(metadata_element, metadata_value_tree_node); 144 metadata_values.add(metadata_value); 73 145 } 74 146 } 75 76 // ...it doesn't 77 if (!contains_extracted_metadata_for_file) { 78 return metadata_values; 79 } 80 81 // Parse the doc.xml file 82 Gatherer.println("Applicable doc.xml file: " + this); 83 Document document = XMLTools.parseXMLFile(this); 84 if (document == null) { 85 System.err.println("Error: Could not parse doc.xml file " + getAbsolutePath()); 86 return metadata_values; 87 } 88 89 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE); 90 91 // Read all the Archive elements in the file 92 NodeList archive_elements_nodelist = document.getElementsByTagName(ARCHIVE_ELEMENT); 93 for (int i = 0; i < archive_elements_nodelist.getLength(); i++) { 94 Element current_archive_element = (Element) archive_elements_nodelist.item(i); 95 96 // Read the child Section elements of the archive (but not all descendants) 97 ArrayList child_section_elements = XMLTools.getChildElementsByTagName(current_archive_element, SECTION_ELEMENT); 98 for (int j = 0; j 
< child_section_elements.size(); j++) { 99 Element current_section_element = (Element) child_section_elements.get(j); 100 101 // Read the Description elements of this section only (not child sections as well) 102 ArrayList child_description_elements = XMLTools.getChildElementsByTagName(current_section_element, DESCRIPTION_ELEMENT); 103 for (int k = 0; k < child_description_elements.size(); k++) { 104 Element current_description_element = (Element) child_description_elements.get(k); 105 106 // Read all the Metadata elements in this description element 107 NodeList metadata_elements_nodelist = current_description_element.getElementsByTagName(METADATA_ELEMENT); 108 for (int l = 0; l < metadata_elements_nodelist.getLength(); l++) { 109 Element current_metadata_element = (Element) metadata_elements_nodelist.item(l); 110 String metadata_element_name_full = current_metadata_element.getAttribute("name"); 111 112 // If the metadata has a namespace it isn't extracted metadata, so we're not interested 113 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full); 114 if (!metadata_set_namespace.equals("")) { 115 continue; 116 } 117 118 // Extracted metadata! 119 String metadata_element_name = metadata_element_name_full; 120 121 // We completely ignore bibliographic data 122 if (metadata_element_name.equals("SourceSegment")) { 123 return new ArrayList(); 124 } 125 126 // Ignore lower-case metadata elements (gsdlsourcefilename, gsdlassocfile etc.) 
127 // and those starting with '/' (/srclink) 128 char first_character = metadata_element_name.charAt(0); 129 if (Character.isLowerCase(first_character) || first_character == '/') { 130 continue; 131 } 132 133 MetadataElement metadata_element = extracted_metadata_set.getMetadataElementWithName(metadata_element_name); 134 135 // Value trees are not stored for extracted metadata, so create a new value tree node now 136 String current_metadata_element_value = XMLTools.getElementTextValue(current_metadata_element); 137 metadata_element.addMetadataValue(current_metadata_element_value); 138 MetadataValueTreeNode metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(current_metadata_element_value); 139 140 // Add the new metadata value to the list 141 MetadataValue metadata_value = new MetadataValue(metadata_element, metadata_value_tree_node); 142 metadata_values.add(metadata_value); 143 } 144 } 145 } 147 catch (Exception ex) { 148 System.err.println("Exception: " + ex); 149 ex.printStackTrace(); 146 150 } 147 151 … … 160 164 161 165 // Skim the doc.xml file as quickly as possible (don't parse as XML), looking at the Metadata elements 162 Gatherer.println("Skimming doc.xml file " + this + "...");166 System.err.println("Skimming doc.xml file " + this + "..."); 163 167 try { 164 168 BufferedReader buffered_reader = new BufferedReader(new FileReader(this)); 169 int description_element_start = -1; 170 165 171 String line = null; 166 while ((line = buffered_reader.readLine()) != null) { 167 // This line doesn't contain a metadata element 172 for (int line_num = 0; (line = buffered_reader.readLine()) != null; line_num++) { 173 // This line contains the start of a Description element 174 if (line.indexOf("<Description>") != -1) { 175 if (description_element_start != -1) { 176 System.err.println("Parse error: previous Description element unfinished!"); 177 } 178 description_element_start = line_num; 179 continue; 180 } 181 182 // This line contains the end of a 
Description element 183 if (line.indexOf("</Description>") != -1) { 184 if (description_element_start == -1) { 185 System.err.println("Parse error: Description element unstarted!"); 186 } 187 description_element_start = -1; 188 continue; 189 } 190 191 // If we're not in a Description element there shouldn't be any Metadata elements 192 if (description_element_start == -1) { 193 continue; 194 } 195 196 // This line doesn't contain a Metadata element, so we're not interested 168 197 if (line.indexOf("<Metadata ") == -1) { 198 System.err.println("Parse error: Description element line doesn't contain Metadata element."); 169 199 continue; 170 200 } … … 208 238 209 239 // Remember this for quick access later 210 files_in_doc_xml_file.add(gsdlsourcefilename_value); 240 if (source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value) == null) { 241 source_file_name_to_description_elements_mapping.put(gsdlsourcefilename_value, new ArrayList()); 242 } 243 244 ((ArrayList) source_file_name_to_description_elements_mapping.get(gsdlsourcefilename_value)).add(new Integer(description_element_start)); 211 245 } 212 246 else { … … 218 252 // We completely ignore bibliographic data 219 253 if (metadata_element_name.equals("SourceSegment")) { 220 files_in_doc_xml_file.clear();254 // !!! source_file_name_to_description_elements_mapping.remove(gsdlsourcefilename_value); 221 255 } 222 256
Note: See TracChangeset for help on using the changeset viewer.