source: trunk/gli/src/org/greenstone/gatherer/util/XMLTools.java@ 8766

Last change on this file since 8766 was 8695, checked in by mdewsnip, 20 years ago

Now remove invalid characters from metadata values when they are entered. This prevents the problem of writing out XML files with invalid characters, which are barfed on when reading them back in.

  • Property svn:keywords set to Author Date Id Revision
File size: 10.6 KB
Line 
1package org.greenstone.gatherer.util;
2
3
4import java.io.*;
5import java.net.*;
6import java.util.*;
7import org.apache.xerces.parsers.*;
8import org.apache.xml.serialize.*;
9import org.greenstone.gatherer.DebugStream;
10import org.w3c.dom.*;
11import org.xml.sax.*;
12
13
14/** This class is a static class containing useful XML functions */
15public class XMLTools
16{
17 /** Remove all of the child nodes from a certain node. */
18 static final public void clear(Node node)
19 {
20 while (node.hasChildNodes()) {
21 node.removeChild(node.getFirstChild());
22 }
23 }
24
25
26 static public ArrayList getChildElementsByTagName(Element parent_element, String element_name)
27 {
28 ArrayList child_elements = new ArrayList();
29
30 NodeList children_nodelist = parent_element.getChildNodes();
31 for (int i = 0; i < children_nodelist.getLength(); i++) {
32 Node child_node = children_nodelist.item(i);
33 if (child_node.getNodeType() == Node.ELEMENT_NODE && child_node.getNodeName().equals(element_name)) {
34 child_elements.add(child_node);
35 }
36 }
37
38 return child_elements;
39 }
40
41
42 static public String getElementTextValue(Element element)
43 {
44 // Find the first text node child
45 NodeList children_nodelist = element.getChildNodes();
46 for (int i = 0; i < children_nodelist.getLength(); i++) {
47 Node child_node = children_nodelist.item(i);
48 if (child_node.getNodeType() == Node.TEXT_NODE) {
49 return child_node.getNodeValue();
50 }
51 }
52
53 // None found
54 return "";
55 }
56
57
58 /** Method to retrieve the value of a given node.
59 * @param element The <strong>Element</strong> whose value we wish to find.
60 * Soon to be deprecated!
61 */
62 static final public String getValue(Node element) {
63 // If we've been given a subject node first retrieve its value node.
64 if(element.getNodeName().equals("Subject")) {
65 element = getNodeFromNamed(element, "Value");
66 }
67 // If we've got a value node, then reconstruct the text. Remember that DOM will split text over 256 characters into several text nodes
68 if(element != null && element.hasChildNodes()) {
69 StringBuffer text_buffer = new StringBuffer();
70 NodeList text_nodes = element.getChildNodes();
71 for(int i = 0; i < text_nodes.getLength(); i++) {
72 Node possible_text = text_nodes.item(i);
73 if(possible_text.getNodeName().equals(StaticStrings.TEXT_NODE)) {
74 text_buffer.append(possible_text.getNodeValue());
75 }
76 }
77 return text_buffer.toString();
78 }
79 return "";
80 }
81
82
83 /** Method to retrieve from the node given, a certain child node with the specified name.
84 * @param parent The <strong>Node</strong> whose children should be searched.
85 * @param name The required nodes name as a <strong>String</strong>.
86 * @return The requested <strong>Node</strong> if it is found, <i>null</i> otherwise.
87 * Soon to be deprecated!
88 */
89 static final public Node getNodeFromNamed(Node parent, String name) {
90 Node child = null;
91 for(Node i = parent.getFirstChild(); i != null && child == null;
92 i = i.getNextSibling()) {
93 if(i.getNodeName().equals(name)) {
94 child = i;
95 }
96 }
97 return child;
98 }
99
100
101 /** Parse an XML document from a given file */
102 static public Document parseXMLFile(File xml_file)
103 {
104 Document document = null;
105
106 try {
107 FileInputStream fis = new FileInputStream(xml_file);
108 InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
109 Reader r = new BufferedReader(isr);
110 InputSource isc = new InputSource(r);
111 DOMParser parser = new DOMParser();
112 parser.setFeature("http://xml.org/sax/features/validation", false);
113 parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
114 // May or may not be ignored, the documentation for Xerces is contradictory. If it works then parsing -should- be faster.
115 parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", true);
116 parser.setFeature("http://apache.org/xml/features/dom/include-ignorable-whitespace", false);
117 parser.parse(isc);
118 document = parser.getDocument();
119 isr.close();
120 fis.close();
121 }
122 catch (Exception exception) {
123 DebugStream.printStackTrace(exception);
124 }
125
126 return document;
127 }
128
129
130 /** Removes characters that are invalid in XML (see http://www.w3.org/TR/2000/REC-xml-20001006#charsets) */
131 static public String removeInvalidCharacters(String text)
132 {
133 char[] safe_characters = new char[text.length()];
134 int j = 0;
135
136 char[] raw_characters = new char[text.length()];
137 text.getChars(0, text.length(), raw_characters, 0);
138 for (int i = 0; i < raw_characters.length; i++) {
139 char character = raw_characters[i];
140 if ((character >= 0x20 && character <= 0xD7FF) || character == 0x09 || character == 0x0A || character == 0x0D || (character >= 0xE000 && character <= 0xFFFD) || (character >= 0x10000 && character <= 0x10FFFF)) {
141 safe_characters[j] = character;
142 j++;
143 }
144 }
145
146 return new String(safe_characters, 0, j);
147 }
148
149
150 /** Set the #text node value of some element.
151 * @param element the Element whose value we wish to set
152 * @param value the new value for the element as a String
153 * Soon to be deprecated!
154 */
155 static final public void setValue(Element element, String value) {
156 // Remove any existing child node(s)
157 clear(element);
158 // Add new text node.
159 if (value != null) {
160 element.appendChild(element.getOwnerDocument().createTextNode(value));
161 }
162 }
163
164
165 /** Write an XML document to a given file */
166 static public void writeXMLFile(File xml_file, Document document)
167 {
168 try {
169 OutputStream os = new FileOutputStream(xml_file);
170 // Create an output format for our document.
171 OutputFormat f = new OutputFormat(document);
172 f.setEncoding("UTF-8");
173 f.setIndenting(true);
174 f.setLineWidth(0); // Why isn't this working!
175 f.setPreserveSpace(false);
176 // Create the necessary writer stream for serialization.
177 OutputStreamWriter osw = new OutputStreamWriter(os, "UTF-8");
178 Writer w = new BufferedWriter(osw);
179 // Generate a new serializer from the above.
180 XMLSerializer s = new XMLSerializer(w, f);
181 s.asDOMSerializer();
182 // Finally serialize the document to file.
183 s.serialize(document);
184 // And close.
185 os.close();
186 }
187 catch (Exception exception) {
188 DebugStream.printStackTrace(exception);
189 }
190 }
191
192
193 /** ------------ OLD FUNCTIONS FROM UTILITY ---------------
194 /** Using this method we can request that a certain document be written, as valid XML, to a certain output stream. This makes use of the Xerces Serialization suite, which should in no way be confused with the usual method of Serialization used by Java. */
195 static public boolean export(Document document, String filename) {
196 return export(document, new File(filename));
197 }
198
199 static public boolean export(Document document, File file) {
200 try {
201 OutputStream os = new FileOutputStream(file);
202 // Create an output format for our document.
203 OutputFormat f = new OutputFormat(document);
204 f.setEncoding("UTF-8");
205 f.setIndenting(true);
206 f.setLineWidth(0); // Why isn't this working!
207 f.setPreserveSpace(false);
208 // Create the necessary writer stream for serialization.
209 OutputStreamWriter osw = new OutputStreamWriter(os, "UTF-8");
210 Writer w = new BufferedWriter(osw);
211 // Generate a new serializer from the above.
212 XMLSerializer s = new XMLSerializer(w, f);
213 s.asDOMSerializer();
214 // Finally serialize the document to file.
215 s.serialize(document);
216 // And close.
217 os.close();
218 return true;
219 }
220 // A file not found exception is most likely thrown because the directory the metadata.xml file is attempting to be written to no longer has any files in it. I'll add a test in MetadataXMLFile to test for this, but if it still happens ignore it (a non-existant directory can't really have metadata added to it any way.
221 catch (Exception exception) {
222 if(!file.getName().endsWith(StaticStrings.METADATA_XML)) {
223 DebugStream.printStackTrace(exception);
224 return false;
225 }
226 return true;
227 }
228 }
229
230 /** Parse in a xml document from a given file. */
231 static public Document parse(File file) {
232 return parse(file, true);
233 }
234
235 /** Parse in a xml document from a given file. */
236 static public Document parse(File file, boolean noisey)
237 {
238 Document document = null;
239 try {
240 if (file.exists()) {
241 DebugStream.println("Parsing XML file: " + file);
242 FileInputStream fis = new FileInputStream(file);
243 document = parse(fis, noisey);
244 }
245 }
246 catch (Exception error) {
247 if(noisey) {
248 error.printStackTrace();
249 DebugStream.println("Exception in Utility.parse() - Unexpected");
250 }
251 else {
252 DebugStream.println("Exception in Utility.parse() - Expected");
253 DebugStream.printStackTrace(error);
254 }
255 }
256
257 return document;
258 }
259
260 /** Parse in a xml document from a given URL. */
261 static public Document parse(URL url, boolean noisey)
262 {
263 Document document = null;
264 try {
265
266 URLConnection connection = url.openConnection();
267 InputStream is = connection.getInputStream();
268 document = parse(is,noisey);
269 }
270 catch (Exception error) {
271 if(noisey) {
272 error.printStackTrace();
273 DebugStream.println("Exception in Utility.parse() - Unexpected");
274 }
275 else {
276 DebugStream.println("Exception in Utility.parse() - Expected");
277 DebugStream.printStackTrace(error);
278 }
279 }
280
281 return document;
282 }
283
284 /** Parse in a xml document from a given file. */
285 static public Document parse(InputStream is, boolean noisey) {
286 Document document = null;
287 try {
288 InputStreamReader isr = new InputStreamReader(is, "UTF-8");
289 Reader r = new BufferedReader(isr);
290 InputSource isc = new InputSource(r);
291 DOMParser parser = new DOMParser();
292 parser.setFeature("http://xml.org/sax/features/validation", false);
293 parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
294 // May or may not be ignored, the documentation for Xerces is contradictory. If it works then parsing -should- be faster.
295 parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", true);
296 parser.setFeature("http://apache.org/xml/features/dom/include-ignorable-whitespace", false);
297 parser.parse(isc);
298 document = parser.getDocument();
299 isr.close();
300 is.close();
301 parser = null;
302 isc = null;
303 r = null;
304 isr = null;
305 is = null;
306 }
307 catch (Exception error) {
308 if(noisey) {
309 error.printStackTrace();
310 DebugStream.println("Exception in Utility.parse() - Unexpected");
311 }
312 else {
313 DebugStream.println("Exception in Utility.parse() - Expected");
314 }
315 DebugStream.printStackTrace(error);
316 }
317 return document;
318 }
319}
Note: See TracBrowser for help on using the repository browser.