Context Navigation

source: trunk/gsdl3/src/java/org/greenstone/gsdl3/util/XMLConverter.java@ 13210

Last change on this file since 13210 was 13210, checked in by kjdon, 17 years ago
Category is deprecated; should be using Logger instead. Turned off external DTD loading in the parser.
Property svn:keywords set to `Author Date Id Revision`
File size: 9.9 KB

Line
1	/*
2	* XMLConverter.java
3	* Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4	*
5	* This program is free software; you can redistribute it and/or modify
6	* it under the terms of the GNU General Public License as published by
7	* the Free Software Foundation; either version 2 of the License, or
8	* (at your option) any later version.
9	*
10	* This program is distributed in the hope that it will be useful,
11	* but WITHOUT ANY WARRANTY; without even the implied warranty of
12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13	* GNU General Public License for more details.
14	*
15	* You should have received a copy of the GNU General Public License
16	* along with this program; if not, write to the Free Software
17	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18	*/
19	package org.greenstone.gsdl3.util;
20
21	// XML classes
22	import org.w3c.dom.Document;
23	import org.w3c.dom.Node;
24	import org.w3c.dom.NodeList;
25	import org.w3c.dom.NamedNodeMap;
26	import org.xml.sax.InputSource;
27	import org.xml.sax.EntityResolver;
28	import org.apache.xerces.parsers.DOMParser;
29	import org.apache.xerces.dom.DocumentImpl; // for new Documents
30
31	// other java classes
32	import java.io.Reader;
33	import java.io.InputStreamReader;
34	import java.io.StringReader;
35	import java.io.File;
36	import java.io.FileInputStream;
37	import java.io.FileReader;
38
39	import org.apache.log4j.*;
40
41	/** XMLConverter - utility class for greenstone
42	*
43	* parses XML Strings into Documents, converts Nodes to Strings
44	* different parsers have different behaviour - can experiment in here
45	* now we only use xerces
46	*
47	* @author <a href="mailto:[email protected]">Katherine Don</a>
48	* @version $Revision: 13210 $
49	*
50	*/
51	public class XMLConverter {
52
53	static Logger logger = Logger.getLogger(org.greenstone.gsdl3.util.XMLConverter.class.getName());
54
55	/** xerces parser */
56	protected DOMParser parser = null;
57
58	private boolean outputEscaping = true;
59
60
61	/** the no-args constructor */
62	public XMLConverter() {
63	try {
64	this.parser = new DOMParser();
65	this.parser.setFeature("http://xml.org/sax/features/validation", false);
66	// don't try and load external DTD - no need if we are not validating, and may cause connection errors if a proxy is not set up.
67	this.parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
68	// a performance test showed that having this on lead to increased
69	// memory use for small-medium docs, and not much gain for large
70	// docs.
71	// http://www.sosnoski.com/opensrc/xmlbench/conclusions.html
72	this.parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
73	} catch (Exception e) {
74	logger.error(e.getMessage());
75	}
76	}
77
78	/** sets the entity resolver. pass in null to unset it */
79	public void setEntityResolver(EntityResolver er) {
80	this.parser.setEntityResolver(er);
81	}
82
83	/** returns a DOM Document */
84	public Document getDOM(String in) {
85
86	try {
87	Reader reader = new StringReader(in);
88	InputSource xml_source = new InputSource(reader);
89
90	this.parser.parse(xml_source);
91	Document doc = this.parser.getDocument();
92
93	return doc;
94
95	} catch (Exception e) {
96	logger.error(e.getMessage());
97	}
98	return null;
99	}
100
101	/** returns a DOM Document */
102	public Document getDOM(File in) {
103	try {
104	FileReader reader = new FileReader(in);
105	InputSource xml_source = new InputSource(reader);
106	this.parser.parse(xml_source);
107	Document doc = this.parser.getDocument();
108	return doc;
109
110	} catch (Exception e) {
111	logger.error(e.getMessage(), e);
112
113	}
114	return null;
115	}
116
117	/** returns a DOM document */
118	public Document getDOM(File in, String encoding) {
119	try {
120
121	InputStreamReader isr = new InputStreamReader(new FileInputStream(in), encoding);
122	InputSource xml_source = new InputSource(isr);
123
124	this.parser.parse(xml_source);
125	Document doc = this.parser.getDocument();
126
127	return doc;
128
129	} catch (Exception e) {
130	logger.error(e.getMessage());
131	}
132	return null;
133	}
134
135
136	/** creates a new empty DOM Document */
137	public Document newDOM() {
138	Document doc = new DocumentImpl();
139	return doc;
140	}
141
142	/** returns the Node as a String */
143	public String getString(Node xmlNode)
144	{
145	outputEscaping = true;
146	StringBuffer xmlRepresentation = new StringBuffer();
147	getString(xmlNode, xmlRepresentation, 0, false);
148	return xmlRepresentation.toString();
149	}
150
151	/** returns the node as a nicely formatted String - this introduces extra
152	* text nodes if the String is read back in as a DOM, so should only be
153	* used for printing */
154	public String getPrettyString(Node xmlNode) {
155
156	outputEscaping = true;
157	StringBuffer xmlRepresentation = new StringBuffer();
158	getString(xmlNode, xmlRepresentation, 0, true);
159	return xmlRepresentation.toString();
160	}
161
162	private void getString(Node xmlNode, StringBuffer xmlRepresentation,
163	int depth, boolean pretty)
164	{
165
166	if (xmlNode == null) {
167	xmlRepresentation.append("<null>");
168	return;
169	}
170
171	short nodeType = xmlNode.getNodeType();
172	String nodeName = xmlNode.getNodeName();
173
174	if (nodeType == Node.DOCUMENT_NODE) {
175	getString(((Document)xmlNode).getDocumentElement(), xmlRepresentation, depth, pretty);
176	return;
177	}
178	// Handle Element nodes
179	if (nodeType == Node.ELEMENT_NODE) {
180	if (pretty) {
181	xmlRepresentation.append("\n");
182	for (int i = 0; i < depth; i++) {
183	xmlRepresentation.append(" ");
184	}
185	}
186
187	// Write opening tag
188	xmlRepresentation.append("<");
189	xmlRepresentation.append(nodeName);
190
191	// Write the node attributes
192	NamedNodeMap nodeAttributes = xmlNode.getAttributes();
193	for (int i = 0; i < nodeAttributes.getLength(); i++) {
194	Node attribute = nodeAttributes.item(i);
195	xmlRepresentation.append(" ");
196	xmlRepresentation.append(attribute.getNodeName());
197	xmlRepresentation.append("=\"");
198	xmlRepresentation.append(attribute.getNodeValue());
199	xmlRepresentation.append("\"");
200	}
201
202	// If the node has no children, close the opening tag and return
203	if (xmlNode.hasChildNodes() == false) {
204	// This produces somewhat ugly output, but it is necessary to compensate
205	// for display bugs in Netscape. Firstly, the space is needed before the
206	// closing bracket otherwise Netscape will ignore some tags (<br/>, for
207	// example). Also, a newline character would be expected after the tag,
208	// but this causes problems with the display of links (the link text
209	// will contain a newline character, which is displayed badly).
210	xmlRepresentation.append(" />");
211	return;
212	}
213
214	// Close the opening tag
215	xmlRepresentation.append(">");
216
217	// Apply recursively to the children of this node
218	// hack for nodes next to text nodes - dont make them pretty
219	// this is needed for text inside a <pre> element - any new lines
220	// or spaces around the span elements show up in the text
221	NodeList children = xmlNode.getChildNodes();
222	boolean do_pretty = pretty;
223	for (int i = 0; i < children.getLength(); i++) {
224	if (children.item(i).getNodeType()==Node.TEXT_NODE) {
225	do_pretty=false; // if there is a text node amongst the children, do teh following nodes in non-pretty mode - hope this doesn't stuff up something else
226	}
227	getString(children.item(i), xmlRepresentation, depth + 1, do_pretty);
228	}
229
230	// Write closing tag
231	if (pretty) {
232	if (xmlRepresentation.charAt(xmlRepresentation.length()-1) == '\n') {
233	for (int i = 0; i < depth; i++)
234	xmlRepresentation.append(" ");
235	}
236	}
237	xmlRepresentation.append("</");
238	xmlRepresentation.append(nodeName);
239	xmlRepresentation.append(">");
240	if (pretty) {
241	xmlRepresentation.append("\n");
242	}
243	}
244
245	// Handle Text nodes
246	else if (nodeType == Node.TEXT_NODE) {
247	String text = xmlNode.getNodeValue();
248
249	// Perform output escaping, if required
250	if (outputEscaping) {
251	text = text.replaceAll("&", "&"); // Must be done first!!
252	text = text.replaceAll("<", "<");
253	text = text.replaceAll(">", ">");
254	text = text.replaceAll("\"", """);
255	text = text.replaceAll("\'", "'");
256	}
257
258	// Remove any control-C characters
259	text = text.replaceAll("" + (char) 3, "");
260	xmlRepresentation.append(text);
261	}
262
263	// Handle Processing Instruction nodes
264	else if (nodeType == Node.PROCESSING_INSTRUCTION_NODE) {
265	if (nodeName.equals("javax.xml.transform.disable-output-escaping")) {
266	outputEscaping = false;
267	}
268	else if (nodeName.equals("javax.xml.transform.enable-output-escaping")) {
269	outputEscaping = true;
270	}
271	else {
272	logger.warn("Unhandled processing instruction " + nodeName);
273	}
274	}
275
276	else if (nodeType == Node.COMMENT_NODE) {
277	String text = xmlNode.getNodeValue();
278	xmlRepresentation.append("<!-- ");
279	xmlRepresentation.append(text);
280	xmlRepresentation.append(" -->");
281	}
282
283
284
285	// A type of node that is not handled yet
286	else {
287	logger.warn("Unknown node type: " + nodeType+" "+getNodeTypeString(nodeType));
288	}
289
290	return;
291	}
292
293	protected static String getNodeTypeString(short node_type) {
294
295	String type = "";
296	switch(node_type) {
297	case Node.ATTRIBUTE_NODE:
298	type="ATTRIBUTE_NODE";
299	break;
300	case Node.CDATA_SECTION_NODE:
301	type="CDATA_SECTION_NODE";
302	break;
303	case Node.COMMENT_NODE:
304	type="COMMENT_NODE";
305	break;
306	case Node.DOCUMENT_FRAGMENT_NODE:
307	type="DOCUMENT_FRAGMENT_NODE";
308	break;
309	case Node.DOCUMENT_NODE:
310	type="DOCUMENT_NODE";
311	break;
312	case Node.DOCUMENT_TYPE_NODE:
313	type="DOCUMENT_TYPE_NODE";
314	break;
315	case Node.ELEMENT_NODE:
316	type="ELEMENT_NODE";
317	break;
318	case Node.ENTITY_NODE:
319	type="ENTITY_NODE";
320	break;
321	case Node.ENTITY_REFERENCE_NODE:
322	type="ENTITY_REFERENCE_NODE";
323	break;
324	case Node.NOTATION_NODE:
325	type="NOTATION_NODE";
326	break;
327	case Node.PROCESSING_INSTRUCTION_NODE:
328	type="PROCESSING_INSTRUCTION_NODE";
329	break;
330	case Node.TEXT_NODE:
331	type="TEXT_NODE";
332	break;
333	default:
334	type="UNKNOWN";
335	}
336
337	return type;
338	}
339	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: