Context Navigation

source: trunk/gli/src/org/greenstone/gatherer/util/XMLTools.java@ 8766

Last change on this file since 8766 was 8695, checked in by mdewsnip, 20 years ago
Now remove invalid characters from metadata values when they are entered. This prevents the problem of writing out XML files with invalid characters, which are barfed on when reading them back in.
Property svn:keywords set to `Author Date Id Revision`
File size: 10.6 KB

Line
1	package org.greenstone.gatherer.util;
2
3
4	import java.io.*;
5	import java.net.*;
6	import java.util.*;
7	import org.apache.xerces.parsers.*;
8	import org.apache.xml.serialize.*;
9	import org.greenstone.gatherer.DebugStream;
10	import org.w3c.dom.*;
11	import org.xml.sax.*;
12
13
14	/** This class is a static class containing useful XML functions */
15	public class XMLTools
16	{
17	/** Remove all of the child nodes from a certain node. */
18	static final public void clear(Node node)
19	{
20	while (node.hasChildNodes()) {
21	node.removeChild(node.getFirstChild());
22	}
23	}
24
25
26	static public ArrayList getChildElementsByTagName(Element parent_element, String element_name)
27	{
28	ArrayList child_elements = new ArrayList();
29
30	NodeList children_nodelist = parent_element.getChildNodes();
31	for (int i = 0; i < children_nodelist.getLength(); i++) {
32	Node child_node = children_nodelist.item(i);
33	if (child_node.getNodeType() == Node.ELEMENT_NODE && child_node.getNodeName().equals(element_name)) {
34	child_elements.add(child_node);
35	}
36	}
37
38	return child_elements;
39	}
40
41
42	static public String getElementTextValue(Element element)
43	{
44	// Find the first text node child
45	NodeList children_nodelist = element.getChildNodes();
46	for (int i = 0; i < children_nodelist.getLength(); i++) {
47	Node child_node = children_nodelist.item(i);
48	if (child_node.getNodeType() == Node.TEXT_NODE) {
49	return child_node.getNodeValue();
50	}
51	}
52
53	// None found
54	return "";
55	}
56
57
58	/** Method to retrieve the value of a given node.
59	* @param element The <strong>Element</strong> whose value we wish to find.
60	* Soon to be deprecated!
61	*/
62	static final public String getValue(Node element) {
63	// If we've been given a subject node first retrieve its value node.
64	if(element.getNodeName().equals("Subject")) {
65	element = getNodeFromNamed(element, "Value");
66	}
67	// If we've got a value node, then reconstruct the text. Remember that DOM will split text over 256 characters into several text nodes
68	if(element != null && element.hasChildNodes()) {
69	StringBuffer text_buffer = new StringBuffer();
70	NodeList text_nodes = element.getChildNodes();
71	for(int i = 0; i < text_nodes.getLength(); i++) {
72	Node possible_text = text_nodes.item(i);
73	if(possible_text.getNodeName().equals(StaticStrings.TEXT_NODE)) {
74	text_buffer.append(possible_text.getNodeValue());
75	}
76	}
77	return text_buffer.toString();
78	}
79	return "";
80	}
81
82
83	/** Method to retrieve from the node given, a certain child node with the specified name.
84	* @param parent The <strong>Node</strong> whose children should be searched.
85	* @param name The required nodes name as a <strong>String</strong>.
86	* @return The requested <strong>Node</strong> if it is found, <i>null</i> otherwise.
87	* Soon to be deprecated!
88	*/
89	static final public Node getNodeFromNamed(Node parent, String name) {
90	Node child = null;
91	for(Node i = parent.getFirstChild(); i != null && child == null;
92	i = i.getNextSibling()) {
93	if(i.getNodeName().equals(name)) {
94	child = i;
95	}
96	}
97	return child;
98	}
99
100
101	/** Parse an XML document from a given file */
102	static public Document parseXMLFile(File xml_file)
103	{
104	Document document = null;
105
106	try {
107	FileInputStream fis = new FileInputStream(xml_file);
108	InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
109	Reader r = new BufferedReader(isr);
110	InputSource isc = new InputSource(r);
111	DOMParser parser = new DOMParser();
112	parser.setFeature("http://xml.org/sax/features/validation", false);
113	parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
114	// May or may not be ignored, the documentation for Xerces is contradictory. If it works then parsing -should- be faster.
115	parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", true);
116	parser.setFeature("http://apache.org/xml/features/dom/include-ignorable-whitespace", false);
117	parser.parse(isc);
118	document = parser.getDocument();
119	isr.close();
120	fis.close();
121	}
122	catch (Exception exception) {
123	DebugStream.printStackTrace(exception);
124	}
125
126	return document;
127	}
128
129
130	/** Removes characters that are invalid in XML (see http://www.w3.org/TR/2000/REC-xml-20001006#charsets) */
131	static public String removeInvalidCharacters(String text)
132	{
133	char[] safe_characters = new char[text.length()];
134	int j = 0;
135
136	char[] raw_characters = new char[text.length()];
137	text.getChars(0, text.length(), raw_characters, 0);
138	for (int i = 0; i < raw_characters.length; i++) {
139	char character = raw_characters[i];
140	if ((character >= 0x20 && character <= 0xD7FF) \|\| character == 0x09 \|\| character == 0x0A \|\| character == 0x0D \|\| (character >= 0xE000 && character <= 0xFFFD) \|\| (character >= 0x10000 && character <= 0x10FFFF)) {
141	safe_characters[j] = character;
142	j++;
143	}
144	}
145
146	return new String(safe_characters, 0, j);
147	}
148
149
150	/** Set the #text node value of some element.
151	* @param element the Element whose value we wish to set
152	* @param value the new value for the element as a String
153	* Soon to be deprecated!
154	*/
155	static final public void setValue(Element element, String value) {
156	// Remove any existing child node(s)
157	clear(element);
158	// Add new text node.
159	if (value != null) {
160	element.appendChild(element.getOwnerDocument().createTextNode(value));
161	}
162	}
163
164
165	/** Write an XML document to a given file */
166	static public void writeXMLFile(File xml_file, Document document)
167	{
168	try {
169	OutputStream os = new FileOutputStream(xml_file);
170	// Create an output format for our document.
171	OutputFormat f = new OutputFormat(document);
172	f.setEncoding("UTF-8");
173	f.setIndenting(true);
174	f.setLineWidth(0); // Why isn't this working!
175	f.setPreserveSpace(false);
176	// Create the necessary writer stream for serialization.
177	OutputStreamWriter osw = new OutputStreamWriter(os, "UTF-8");
178	Writer w = new BufferedWriter(osw);
179	// Generate a new serializer from the above.
180	XMLSerializer s = new XMLSerializer(w, f);
181	s.asDOMSerializer();
182	// Finally serialize the document to file.
183	s.serialize(document);
184	// And close.
185	os.close();
186	}
187	catch (Exception exception) {
188	DebugStream.printStackTrace(exception);
189	}
190	}
191
192
193	/** ------------ OLD FUNCTIONS FROM UTILITY ---------------
194	/** Using this method we can request that a certain document be written, as valid XML, to a certain output stream. This makes use of the Xerces Serialization suite, which should in no way be confused with the usual method of Serialization used by Java. */
195	static public boolean export(Document document, String filename) {
196	return export(document, new File(filename));
197	}
198
199	static public boolean export(Document document, File file) {
200	try {
201	OutputStream os = new FileOutputStream(file);
202	// Create an output format for our document.
203	OutputFormat f = new OutputFormat(document);
204	f.setEncoding("UTF-8");
205	f.setIndenting(true);
206	f.setLineWidth(0); // Why isn't this working!
207	f.setPreserveSpace(false);
208	// Create the necessary writer stream for serialization.
209	OutputStreamWriter osw = new OutputStreamWriter(os, "UTF-8");
210	Writer w = new BufferedWriter(osw);
211	// Generate a new serializer from the above.
212	XMLSerializer s = new XMLSerializer(w, f);
213	s.asDOMSerializer();
214	// Finally serialize the document to file.
215	s.serialize(document);
216	// And close.
217	os.close();
218	return true;
219	}
220	// A file not found exception is most likely thrown because the directory the metadata.xml file is attempting to be written to no longer has any files in it. I'll add a test in MetadataXMLFile to test for this, but if it still happens ignore it (a non-existant directory can't really have metadata added to it any way.
221	catch (Exception exception) {
222	if(!file.getName().endsWith(StaticStrings.METADATA_XML)) {
223	DebugStream.printStackTrace(exception);
224	return false;
225	}
226	return true;
227	}
228	}
229
230	/** Parse in a xml document from a given file. */
231	static public Document parse(File file) {
232	return parse(file, true);
233	}
234
235	/** Parse in a xml document from a given file. */
236	static public Document parse(File file, boolean noisey)
237	{
238	Document document = null;
239	try {
240	if (file.exists()) {
241	DebugStream.println("Parsing XML file: " + file);
242	FileInputStream fis = new FileInputStream(file);
243	document = parse(fis, noisey);
244	}
245	}
246	catch (Exception error) {
247	if(noisey) {
248	error.printStackTrace();
249	DebugStream.println("Exception in Utility.parse() - Unexpected");
250	}
251	else {
252	DebugStream.println("Exception in Utility.parse() - Expected");
253	DebugStream.printStackTrace(error);
254	}
255	}
256
257	return document;
258	}
259
260	/** Parse in a xml document from a given URL. */
261	static public Document parse(URL url, boolean noisey)
262	{
263	Document document = null;
264	try {
265
266	URLConnection connection = url.openConnection();
267	InputStream is = connection.getInputStream();
268	document = parse(is,noisey);
269	}
270	catch (Exception error) {
271	if(noisey) {
272	error.printStackTrace();
273	DebugStream.println("Exception in Utility.parse() - Unexpected");
274	}
275	else {
276	DebugStream.println("Exception in Utility.parse() - Expected");
277	DebugStream.printStackTrace(error);
278	}
279	}
280
281	return document;
282	}
283
284	/** Parse in a xml document from a given file. */
285	static public Document parse(InputStream is, boolean noisey) {
286	Document document = null;
287	try {
288	InputStreamReader isr = new InputStreamReader(is, "UTF-8");
289	Reader r = new BufferedReader(isr);
290	InputSource isc = new InputSource(r);
291	DOMParser parser = new DOMParser();
292	parser.setFeature("http://xml.org/sax/features/validation", false);
293	parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
294	// May or may not be ignored, the documentation for Xerces is contradictory. If it works then parsing -should- be faster.
295	parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", true);
296	parser.setFeature("http://apache.org/xml/features/dom/include-ignorable-whitespace", false);
297	parser.parse(isc);
298	document = parser.getDocument();
299	isr.close();
300	is.close();
301	parser = null;
302	isc = null;
303	r = null;
304	isr = null;
305	is = null;
306	}
307	catch (Exception error) {
308	if(noisey) {
309	error.printStackTrace();
310	DebugStream.println("Exception in Utility.parse() - Unexpected");
311	}
312	else {
313	DebugStream.println("Exception in Utility.parse() - Expected");
314	}
315	DebugStream.printStackTrace(error);
316	}
317	return document;
318	}
319	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: