- Timestamp:
- 2004-04-19T17:14:16+12:00 (20 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gli/src/org/greenstone/gatherer/msm/GreenstoneArchiveParser.java
r6859 r7204 61 61 62 62 private GShell shell; 63 64 static final String ignore_list[] = {"assocfilepath", "gsdl", "Identifier", "URL"}; //"Source", 63 private String file_path; 64 65 static final String ignore_list[] = { "assocfilepath", "gsdl", "Identifier", "SourceSegment", "URL" }; 65 66 66 67 public GreenstoneArchiveParser(GShellProgressMonitor progress, GShell shell) { … … 103 104 Document document = Utility.parse(file, false); 104 105 105 Gatherer.println("Parsed greenstone archive document: " + file.getAbsolutePath());106 106 // If we successfully parsed the document, then it is time to search through the DOM for the Metadata tags. 107 if (document != null) {108 Stringfile_path = null;107 if (document != null) { 108 file_path = null; 109 109 Element archive_element = document.getDocumentElement(); 110 110 // Retrieve the initial Section element 111 111 NodeList section_elements = archive_element.getElementsByTagName(StaticStrings.SECTION_ELEMENT); 112 // It may be the case that we have some sort of mutant freak archive file which doesn't have any section elements .113 if (section_elements.getLength() < 1) {112 // It may be the case that we have some sort of mutant freak archive file which doesn't have any section elements 113 if (section_elements.getLength() < 1) { 114 114 return count; 115 115 } 116 116 Element section_element = (Element) section_elements.item(0); 117 117 section_elements = null; 118 // Retrieve all of the Metadata sections. 118 119 // Retrieve all of the Metadata sections 119 120 NodeList metadata_elements = section_element.getElementsByTagName(StaticStrings.METADATA_ELEMENT); 120 121 section_element = null; 121 // We first zip through the retrieved metadata, and if we encounter the element 'SourceSegment' - a sure sign this collection came from a bibliographic type file - we break out of extracted metadata parsing as no sense could be made of the data extracted anyway (plus we suffer a death of thirty-thousand pointy bits of metadata!) 122 for(int i = 0; i < metadata_elements.getLength(); i++) { 122 123 // Zip through the retrieved metadata checking for SourceSegment elements 124 // These are a good sign of bibliographic files, which we must handle specially 125 boolean ignore_values = false; 126 for (int i = 0; i < metadata_elements.getLength(); i++) { 123 127 Element metadata_element = (Element) metadata_elements.item(i); 124 128 String name = metadata_element.getAttribute(StaticStrings.NAME_ATTRIBUTE); 125 if(name.equalsIgnoreCase(StaticStrings.SOURCESEGMENT_VALUE)) { 126 return 0; 127 } 128 } 129 if (name.equalsIgnoreCase(StaticStrings.SOURCESEGMENT_VALUE)) { 130 ignore_values = true; 131 break; 132 } 133 } 134 129 135 // Now for each Metadata entry retrieved... 130 for (int i = 0; i < metadata_elements.getLength(); i++) {136 for (int i = 0; i < metadata_elements.getLength(); i++) { 131 137 Element metadata_element = (Element) metadata_elements.item(i); 132 String name = metadata_element.getAttribute(StaticStrings.NAME_ATTRIBUTE); 133 // There is also a special case when the metadata name is gsdlsourcefilename, as we use this to find the FileRecord we want to add metadata to. 134 if(name.equals("gsdlsourcefilename")) { 135 file_path = MSMUtils.getValue(metadata_element); 136 } 137 else { 138 // Check if its name starts with, or is equal to, one of the values in our ignore list, and if so ignore this metadata. 139 boolean ignore = false; 140 for(int j = 0; !ignore && j < ignore_list.length; j++) { 141 ignore = name.startsWith(ignore_list[j]); 142 } 143 // Otherwise ensure the metadata is present in our collection. 144 if(!ignore && file_path != null) { 145 // If we successfully retrieved a record we can continue. 146 if(file_path != null) { 147 // We now retrieve the appropriate element. If no such element exists we create a new one in the greenstone mds. Remember that no element in the greenstone mds has an associated value tree, so it is perfect for metadata elements with a small number of repeated values but where the values have no relation between files (such as encoding, where many files will be iso_8859_1, but if you change one you don't intend to change them all). 148 ElementWrapper element = Gatherer.c_man.msm.getElement(name); 149 if(element == null) { 150 MetadataSet extracted_mds = Gatherer.c_man.msm.getSet(Utility.EXTRACTED_METADATA_NAMESPACE); 151 if(extracted_mds != null) { 152 element = extracted_mds.addElement(name, Gatherer.config.getLanguage()); 153 } 154 } 155 // If we successfully retrieved an element (and we should have) we can continue. 156 // WARNING!! There is one known exception - MARC records. Adding the extracted elements is all good, but adding the extracted metadata causes the whole thing to collapse in a pile of unhappy. 157 if(element != null && !file_path.endsWith(StaticStrings.MARC_EXTENSION) && (element.getNamespace().equals("") || element.getNamespace().equals(Utility.EXTRACTED_METADATA_NAMESPACE))) { 158 // Retrieve the metadata for the current file 159 File target_file = new File(file_path); 160 String value = ""; 161 try { 162 value = Utility.decodeGreenstone(URLDecoder.decode(MSMUtils.getValue(metadata_element), "UTF-8")); 163 } 164 catch(IllegalArgumentException error) { // **** 165 value = Utility.decodeGreenstone(MSMUtils.getValue(metadata_element)); 166 } 167 catch(UnsupportedEncodingException error) { 168 Gatherer.printStackTrace(error); 169 } 170 // If we successfully retrieved a value we can continue. 171 if(value != null) { 172 // Create a new metadata object. 173 GValueModel value_tree = Gatherer.c_man.msm.getValueTree(element); 174 GValueNode value_node = null; 175 if(value_tree != null) { 176 value_node = value_tree.getValue(value); 177 } 178 else { 179 value_node = new GValueNode(element.toString(), value); 180 } 181 Metadata metadata = new Metadata(element, value_node); 182 element.inc(); 183 ///ystem.err.println("Adding extracted metadata: " + metadata); 184 Gatherer.c_man.getCollection().msm.fireMetadataChanged(new MSMEvent(this, System.currentTimeMillis(), target_file, null, metadata)); 185 count++; 186 // All done. On to next metadata. 187 } 188 value = null; 189 target_file = null; 190 } 191 else { 192 Gatherer.println("Cannot retrieve metadata element " + name); 193 } 194 } 195 } 196 } 197 } 198 } 138 if (processMetadataElement(metadata_element, ignore_values) == true) { 139 count++; 140 } 141 } 142 } 143 199 144 return count; 200 145 } 146 147 148 private boolean processMetadataElement(Element metadata_element, boolean ignore_values) 149 { 150 String name = metadata_element.getAttribute(StaticStrings.NAME_ATTRIBUTE); 151 152 // There is also a special case when the metadata name is gsdlsourcefilename, as we use this to find the FileRecord we want to add metadata to. 153 if (name.equals("gsdlsourcefilename")) { 154 file_path = MSMUtils.getValue(metadata_element); 155 return false; 156 } 157 158 // Check if its name starts with, or is equal to, one of the values in our ignore list, and if so ignore this metadata 159 for (int j = 0; j < ignore_list.length; j++) { 160 if (name.startsWith(ignore_list[j])) { 161 return false; 162 } 163 } 164 165 if (file_path == null) { 166 return false; 167 } 168 169 // We now retrieve the appropriate element. If no such element exists we create a new one in the greenstone mds. Remember that no element in the greenstone mds has an associated value tree, so it is perfect for metadata elements with a small number of repeated values but where the values have no relation between files (such as encoding, where many files will be iso_8859_1, but if you change one you don't intend to change them all). 170 ElementWrapper element = Gatherer.c_man.msm.getElement(name); 171 if (element == null) { 172 MetadataSet extracted_mds = Gatherer.c_man.msm.getSet(Utility.EXTRACTED_METADATA_NAMESPACE); 173 if (extracted_mds != null) { 174 element = extracted_mds.addElement(name, Gatherer.config.getLanguage()); 175 } 176 } 177 178 // If ignore_values is set (bibliographic records) we don't care about the values 179 if (ignore_values == true) { 180 return false; 181 } 182 183 // Retrieve the metadata for the current file 184 File target_file = new File(file_path); 185 String value = ""; 186 try { 187 value = Utility.decodeGreenstone(URLDecoder.decode(MSMUtils.getValue(metadata_element), "UTF-8")); 188 } 189 catch (IllegalArgumentException error) { // **** 190 value = Utility.decodeGreenstone(MSMUtils.getValue(metadata_element)); 191 } 192 catch (UnsupportedEncodingException error) { 193 Gatherer.printStackTrace(error); 194 } 195 196 if (value == null) { 197 return false; 198 } 199 200 // Create a new metadata object. 201 GValueModel value_tree = Gatherer.c_man.msm.getValueTree(element); 202 GValueNode value_node = null; 203 if (value_tree != null) { 204 value_node = value_tree.getValue(value); 205 } 206 else { 207 value_node = new GValueNode(element.toString(), value); 208 } 209 210 Metadata metadata = new Metadata(element, value_node); 211 element.inc(); 212 ///ystem.err.println("Adding extracted metadata: " + metadata); 213 Gatherer.c_man.getCollection().msm.fireMetadataChanged(new MSMEvent(this, System.currentTimeMillis(), target_file, null, metadata)); 214 215 return true; 216 } 217 201 218 202 219 static final String metadata_ignore_list[] = {"assocfilepath", "gsdl", "Identifier","URL"};
Note:
See TracChangeset
for help on using the changeset viewer.