source: trunk/gli/src/org/greenstone/gatherer/metadata/DocXMLFile.java@ 7822

Last change on this file since 7822 was 7822, checked in by mdewsnip, 20 years ago

All this new code just lying around makes me nervous, so I'm committing it. It isn't complete, but it's getting there.

  • Property svn:keywords set to Author Date Id Revision
File size: 8.0 KB
Line 
1package org.greenstone.gatherer.metadata;
2
3
4import java.io.*;
5import java.util.*;
6import org.greenstone.gatherer.util.Utility;
7import org.w3c.dom.*;
8
9
10/** This class represents one doc.xml file */
11public class DocXMLFile
12 extends File
13{
14 static final private String ARCHIVE_ELEMENT = "Archive";
15 static final private String DESCRIPTION_ELEMENT = "Description";
16 static final private String METADATA_ELEMENT = "Metadata";
17 static final private String SECTION_ELEMENT = "Section";
18
19 private ArrayList files_in_doc_xml_file = new ArrayList();
20
21
22 public DocXMLFile(String doc_xml_file_path)
23 {
24 super(doc_xml_file_path);
25
26 // Parse the doc.xml file
27 Document document = XMLTools.parseXMLFile(this);
28 if (document == null) {
29 System.err.println("Error: Could not parse doc.xml file " + getAbsolutePath());
30 return;
31 }
32
33 // Read all the Archive elements in the file
34 NodeList archive_elements_nodelist = document.getElementsByTagName(ARCHIVE_ELEMENT);
35 for (int i = 0; i < archive_elements_nodelist.getLength(); i++) {
36 Element current_archive_element = (Element) archive_elements_nodelist.item(i);
37
38 // Read the child Section elements of the archive (but not all descendants)
39 ArrayList child_section_elements = XMLTools.getChildElementsByTagName(current_archive_element, SECTION_ELEMENT);
40 for (int j = 0; j < child_section_elements.size(); j++) {
41 Element current_section_element = (Element) child_section_elements.get(j);
42
43 // Read the Description elements of this section only (not child sections as well)
44 ArrayList child_description_elements = XMLTools.getChildElementsByTagName(current_section_element, DESCRIPTION_ELEMENT);
45 for (int k = 0; k < child_description_elements.size(); k++) {
46 Element current_description_element = (Element) child_description_elements.get(k);
47
48 String gsdlsourcefilename_value = null;
49 boolean bibliographic_data = false;
50
51 // Read all the Metadata elements in this description element
52 NodeList metadata_elements_nodelist = current_description_element.getElementsByTagName(METADATA_ELEMENT);
53 for (int l = 0; l < metadata_elements_nodelist.getLength(); l++) {
54 Element current_metadata_element = (Element) metadata_elements_nodelist.item(l);
55 String metadata_element_name_full = current_metadata_element.getAttribute("name");
56
57 // Note which file this Section is for
58 if (metadata_element_name_full.equals("gsdlsourcefilename")) {
59 gsdlsourcefilename_value = XMLTools.getElementTextValue(current_metadata_element);
60 boolean is_unix_path = gsdlsourcefilename_value.startsWith("/");
61
62 // We're only interested in the path relative to the import folder
63 int import_index = gsdlsourcefilename_value.indexOf("import");
64 if (import_index != -1) {
65 gsdlsourcefilename_value = gsdlsourcefilename_value.substring(import_index + "import".length() + 1);
66 }
67
68 // Make sure the path matches the OS that is running
69 if (is_unix_path && Utility.isWindows()) {
70 // Convert path from Unix to Windows
71 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("/", File.separator);
72 }
73 if (!is_unix_path && !Utility.isWindows()) {
74 // Convert path from Windows to Unix
75 gsdlsourcefilename_value = gsdlsourcefilename_value.replaceAll("\\", File.separator);
76 }
77
78 System.err.println("gsdlsourcefilename: " + gsdlsourcefilename_value);
79 }
80
81 // We don't deal with bibliographic data
82 if (metadata_element_name_full.equals("SourceSegment")) {
83 bibliographic_data = true;
84 break;
85 }
86 }
87
88 // Remember this for quick access later
89 if (gsdlsourcefilename_value != null && !bibliographic_data) {
90 files_in_doc_xml_file.add(gsdlsourcefilename_value);
91 }
92 }
93 }
94 }
95 }
96
97
98 public ArrayList getMetadataExtractedFromFile(File file)
99 {
100 // Build up a list of metadata extracted from this file
101 ArrayList metadata_values = new ArrayList();
102
103 String file_relative_path = file.getAbsolutePath();
104 int import_index = file_relative_path.indexOf("import");
105 if (import_index != -1) {
106 file_relative_path = file_relative_path.substring(import_index + "import".length() + 1);
107 }
108
109 // Check whether this doc.xml file contains extracted metadata for the specified file
110 boolean contains_extracted_metadata_for_file = false;
111 for (int i = 0; i < files_in_doc_xml_file.size(); i++) {
112 if (file_relative_path.equals(files_in_doc_xml_file.get(i))) {
113 contains_extracted_metadata_for_file = true;
114 System.err.println("Found extracted metadata in file " + getAbsolutePath());
115 break;
116 }
117 }
118
119 // ...it doesn't
120 if (!contains_extracted_metadata_for_file) {
121 return metadata_values;
122 }
123
124 // Parse the doc.xml file
125 Document document = XMLTools.parseXMLFile(this);
126 if (document == null) {
127 System.err.println("Error: Could not parse doc.xml file " + getAbsolutePath());
128 return metadata_values;
129 }
130
131 MetadataSet extracted_metadata_set = MetadataSetManager.getMetadataSet(MetadataSetManager.EXTRACTED_METADATA_NAMESPACE);
132
133 // Read all the Archive elements in the file
134 NodeList archive_elements_nodelist = document.getElementsByTagName(ARCHIVE_ELEMENT);
135 for (int i = 0; i < archive_elements_nodelist.getLength(); i++) {
136 Element current_archive_element = (Element) archive_elements_nodelist.item(i);
137
138 // Read the child Section elements of the archive (but not all descendants)
139 ArrayList child_section_elements = XMLTools.getChildElementsByTagName(current_archive_element, SECTION_ELEMENT);
140 for (int j = 0; j < child_section_elements.size(); j++) {
141 Element current_section_element = (Element) child_section_elements.get(j);
142
143 // Read the Description elements of this section only (not child sections as well)
144 ArrayList child_description_elements = XMLTools.getChildElementsByTagName(current_section_element, DESCRIPTION_ELEMENT);
145 for (int k = 0; k < child_description_elements.size(); k++) {
146 Element current_description_element = (Element) child_description_elements.get(k);
147
148 // Read all the Metadata elements in this description element
149 NodeList metadata_elements_nodelist = current_description_element.getElementsByTagName(METADATA_ELEMENT);
150 for (int l = 0; l < metadata_elements_nodelist.getLength(); l++) {
151 Element current_metadata_element = (Element) metadata_elements_nodelist.item(l);
152 String metadata_element_name_full = current_metadata_element.getAttribute("name");
153
154 // If the metadata has no namespace, it is extracted metadata
155 String metadata_set_namespace = MetadataTools.getMetadataSetNamespace(metadata_element_name_full);
156 if (metadata_set_namespace.equals("")) {
157 metadata_set_namespace = MetadataSetManager.EXTRACTED_METADATA_NAMESPACE;
158
159 // We're not interested in lower-case metadata elements (gsdlsourcefilename, gsdlassocfile etc.)
160 if (Character.isLowerCase(metadata_element_name_full.charAt(0))) {
161 continue;
162 }
163
164 MetadataElement metadata_element = extracted_metadata_set.getMetadataElement(metadata_element_name_full);
165 if (metadata_element == null) {
166 metadata_element_name_full = metadata_set_namespace + MetadataTools.NAMESPACE_SEPARATOR + metadata_element_name_full;
167 metadata_element = new MetadataElement(metadata_element_name_full, null);
168 }
169
170 // Value trees are not stored for extracted metadata, so create a new value tree node now
171 String current_metadata_element_value = XMLTools.getElementTextValue(current_metadata_element);
172 metadata_element.addMetadataValue(current_metadata_element_value);
173 MetadataValueTreeNode metadata_value_tree_node = metadata_element.getMetadataValueTreeNode(current_metadata_element_value);
174
175 // Add the new metadata value to the list
176 MetadataValue metadata_value = new MetadataValue(metadata_element, metadata_value_tree_node);
177 metadata_values.add(metadata_value);
178 }
179 }
180 }
181 }
182 }
183
184 return metadata_values;
185 }
186}
Note: See TracBrowser for help on using the repository browser.