Context Navigation

source: greenstone3/trunk/src/java/org/greenstone/gsdl3/util/XMLConverter.java@ 16374

Last change on this file since 16374 was 16374, checked in by davidb, 16 years ago
Change to Skin code (returning Node, not Element) so XSLT encodes its DocType -- important information for IE to render the resultant HTML correctly. This also required Skin.java to be changed from using DomResult to StreamResult. The former is known to have a problem with losing its DocType info and, as it is then read-only, has no elegant way to put this info back.
Property svn:keywords set to `Author Date Id Revision`
File size: 10.4 KB

Line
1	/*
2	* XMLConverter.java
3	* Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4	*
5	* This program is free software; you can redistribute it and/or modify
6	* it under the terms of the GNU General Public License as published by
7	* the Free Software Foundation; either version 2 of the License, or
8	* (at your option) any later version.
9	*
10	* This program is distributed in the hope that it will be useful,
11	* but WITHOUT ANY WARRANTY; without even the implied warranty of
12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13	* GNU General Public License for more details.
14	*
15	* You should have received a copy of the GNU General Public License
16	* along with this program; if not, write to the Free Software
17	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18	*/
19	package org.greenstone.gsdl3.util;
20
21	// XML classes
22	import org.w3c.dom.Document;
23	import org.w3c.dom.DocumentType;
24	import org.w3c.dom.Node;
25	import org.w3c.dom.NodeList;
26	import org.w3c.dom.NamedNodeMap;
27	import org.xml.sax.InputSource;
28	import org.xml.sax.EntityResolver;
29	import org.apache.xerces.parsers.DOMParser;
30	import org.apache.xerces.dom.DocumentImpl; // for new Documents
31
32	// other java classes
33	import java.io.Reader;
34	import java.io.InputStreamReader;
35	import java.io.StringReader;
36	import java.io.File;
37	import java.io.FileInputStream;
38	import java.io.FileReader;
39
40	import org.apache.log4j.*;
41
42	/** XMLConverter - utility class for greenstone
43	*
44	* parses XML Strings into Documents, converts Nodes to Strings
45	* different parsers have different behaviour - can experiment in here
46	* now we only use xerces
47	*
48	* @author <a href="mailto:[email protected]">Katherine Don</a>
49	* @version $Revision: 16374 $
50	*
51	*/
52	public class XMLConverter {
53
54	static Logger logger = Logger.getLogger(org.greenstone.gsdl3.util.XMLConverter.class.getName());
55
56	/** xerces parser */
57	protected DOMParser parser = null;
58
59	private boolean outputEscaping = true;
60
61
62	/** the no-args constructor */
63	public XMLConverter() {
64	try {
65	this.parser = new DOMParser();
66	this.parser.setFeature("http://xml.org/sax/features/validation", false);
67	// don't try and load external DTD - no need if we are not validating, and may cause connection errors if a proxy is not set up.
68	this.parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
69	// a performance test showed that having this on lead to increased
70	// memory use for small-medium docs, and not much gain for large
71	// docs.
72	// http://www.sosnoski.com/opensrc/xmlbench/conclusions.html
73	this.parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
74	} catch (Exception e) {
75	logger.error(e.getMessage());
76	}
77	}
78
79	/** sets the entity resolver. pass in null to unset it */
80	public void setEntityResolver(EntityResolver er) {
81	this.parser.setEntityResolver(er);
82	}
83
84	/** returns a DOM Document */
85	public Document getDOM(String in) {
86
87	try {
88	Reader reader = new StringReader(in);
89	InputSource xml_source = new InputSource(reader);
90
91	this.parser.parse(xml_source);
92	Document doc = this.parser.getDocument();
93
94	return doc;
95
96	} catch (Exception e) {
97	logger.error(e.getMessage());
98	}
99	return null;
100	}
101
102	/** returns a DOM Document */
103	public Document getDOM(File in) {
104	try {
105	FileReader reader = new FileReader(in);
106	InputSource xml_source = new InputSource(reader);
107	this.parser.parse(xml_source);
108	Document doc = this.parser.getDocument();
109	return doc;
110
111	} catch (Exception e) {
112	logger.error(e.getMessage(), e);
113
114	}
115	return null;
116	}
117
118	/** returns a DOM document */
119	public Document getDOM(File in, String encoding) {
120	try {
121
122	InputStreamReader isr = new InputStreamReader(new FileInputStream(in), encoding);
123	InputSource xml_source = new InputSource(isr);
124
125	this.parser.parse(xml_source);
126	Document doc = this.parser.getDocument();
127
128	return doc;
129
130	} catch (Exception e) {
131	logger.error(e.getMessage());
132	}
133	return null;
134	}
135
136
137	/** creates a new empty DOM Document */
138	public Document newDOM() {
139	Document doc = new DocumentImpl();
140	return doc;
141	}
142
143	/** returns the Node as a String */
144	public String getString(Node xmlNode)
145	{
146	outputEscaping = true;
147	StringBuffer xmlRepresentation = new StringBuffer();
148	getString(xmlNode, xmlRepresentation, 0, false);
149	return xmlRepresentation.toString();
150	}
151
152	/** returns the node as a nicely formatted String - this introduces extra
153	* text nodes if the String is read back in as a DOM, so should only be
154	* used for printing */
155	public String getPrettyString(Node xmlNode) {
156
157	outputEscaping = true;
158	StringBuffer xmlRepresentation = new StringBuffer();
159	getString(xmlNode, xmlRepresentation, 0, true);
160	return xmlRepresentation.toString();
161	}
162
163	private void getString(Node xmlNode, StringBuffer xmlRepresentation,
164	int depth, boolean pretty)
165	{
166
167	if (xmlNode == null) {
168	xmlRepresentation.append("<null>");
169	return;
170	}
171
172	short nodeType = xmlNode.getNodeType();
173	String nodeName = xmlNode.getNodeName();
174
175	if (nodeType == Node.DOCUMENT_NODE) {
176	Document xmlDocNode = (Document)xmlNode;
177
178	if (xmlDocNode.getDoctype() == null) {
179	System.err.println("Doctype is null.");
180	}
181	else {
182
183	DocumentType dt = xmlDocNode.getDoctype();
184
185	String name = dt.getName();
186	String pid = dt.getPublicId();
187	String sid = dt.getSystemId();
188
189	String doctype_str = "<!DOCTYPE " + dt.getName() + " PUBLIC \"" + pid + "\"" + sid + "\">\n";
190
191	xmlRepresentation.append(doctype_str);
192	}
193	getString(xmlDocNode.getDocumentElement(), xmlRepresentation, depth, pretty);
194	return;
195	}
196	// Handle Element nodes
197	if (nodeType == Node.ELEMENT_NODE) {
198	if (pretty) {
199	xmlRepresentation.append("\n");
200	for (int i = 0; i < depth; i++) {
201	xmlRepresentation.append(" ");
202	}
203	}
204
205	// Write opening tag
206	xmlRepresentation.append("<");
207	xmlRepresentation.append(nodeName);
208
209	// Write the node attributes
210	NamedNodeMap nodeAttributes = xmlNode.getAttributes();
211	for (int i = 0; i < nodeAttributes.getLength(); i++) {
212	Node attribute = nodeAttributes.item(i);
213	xmlRepresentation.append(" ");
214	xmlRepresentation.append(attribute.getNodeName());
215	xmlRepresentation.append("=\"");
216	xmlRepresentation.append(attribute.getNodeValue());
217	xmlRepresentation.append("\"");
218	}
219
220	// If the node has no children, close the opening tag and return
221	if (xmlNode.hasChildNodes() == false) {
222	// This produces somewhat ugly output, but it is necessary to compensate
223	// for display bugs in Netscape. Firstly, the space is needed before the
224	// closing bracket otherwise Netscape will ignore some tags (<br/>, for
225	// example). Also, a newline character would be expected after the tag,
226	// but this causes problems with the display of links (the link text
227	// will contain a newline character, which is displayed badly).
228	xmlRepresentation.append(" />");
229	return;
230	}
231
232	// Close the opening tag
233	xmlRepresentation.append(">");
234
235	// Apply recursively to the children of this node
236	// hack for nodes next to text nodes - dont make them pretty
237	// this is needed for text inside a <pre> element - any new lines
238	// or spaces around the span elements show up in the text
239	NodeList children = xmlNode.getChildNodes();
240	boolean do_pretty = pretty;
241	for (int i = 0; i < children.getLength(); i++) {
242	if (children.item(i).getNodeType()==Node.TEXT_NODE) {
243	do_pretty=false; // if there is a text node amongst the children, do teh following nodes in non-pretty mode - hope this doesn't stuff up something else
244	}
245	getString(children.item(i), xmlRepresentation, depth + 1, do_pretty);
246	}
247
248	// Write closing tag
249	if (pretty) {
250	if (xmlRepresentation.charAt(xmlRepresentation.length()-1) == '\n') {
251	for (int i = 0; i < depth; i++)
252	xmlRepresentation.append(" ");
253	}
254	}
255	xmlRepresentation.append("</");
256	xmlRepresentation.append(nodeName);
257	xmlRepresentation.append(">");
258	if (pretty) {
259	xmlRepresentation.append("\n");
260	}
261	}
262
263	// Handle Text nodes
264	else if (nodeType == Node.TEXT_NODE) {
265	String text = xmlNode.getNodeValue();
266
267	// Perform output escaping, if required
268	if (outputEscaping) {
269	text = text.replaceAll("&", "&"); // Must be done first!!
270	text = text.replaceAll("<", "<");
271	text = text.replaceAll(">", ">");
272	text = text.replaceAll("\"", """);
273	text = text.replaceAll("\'", "'");
274	}
275
276	// Remove any control-C characters
277	text = text.replaceAll("" + (char) 3, "");
278	xmlRepresentation.append(text);
279	}
280
281	// Handle Processing Instruction nodes
282	else if (nodeType == Node.PROCESSING_INSTRUCTION_NODE) {
283	if (nodeName.equals("javax.xml.transform.disable-output-escaping")) {
284	outputEscaping = false;
285	}
286	else if (nodeName.equals("javax.xml.transform.enable-output-escaping")) {
287	outputEscaping = true;
288	}
289	else {
290	logger.warn("Unhandled processing instruction " + nodeName);
291	}
292	}
293
294	else if (nodeType == Node.COMMENT_NODE) {
295	String text = xmlNode.getNodeValue();
296	xmlRepresentation.append("<!-- ");
297	xmlRepresentation.append(text);
298	xmlRepresentation.append(" -->");
299	}
300
301
302
303	// A type of node that is not handled yet
304	else {
305	logger.warn("Unknown node type: " + nodeType+" "+getNodeTypeString(nodeType));
306	}
307
308	return;
309	}
310
311	protected static String getNodeTypeString(short node_type) {
312
313	String type = "";
314	switch(node_type) {
315	case Node.ATTRIBUTE_NODE:
316	type="ATTRIBUTE_NODE";
317	break;
318	case Node.CDATA_SECTION_NODE:
319	type="CDATA_SECTION_NODE";
320	break;
321	case Node.COMMENT_NODE:
322	type="COMMENT_NODE";
323	break;
324	case Node.DOCUMENT_FRAGMENT_NODE:
325	type="DOCUMENT_FRAGMENT_NODE";
326	break;
327	case Node.DOCUMENT_NODE:
328	type="DOCUMENT_NODE";
329	break;
330	case Node.DOCUMENT_TYPE_NODE:
331	type="DOCUMENT_TYPE_NODE";
332	break;
333	case Node.ELEMENT_NODE:
334	type="ELEMENT_NODE";
335	break;
336	case Node.ENTITY_NODE:
337	type="ENTITY_NODE";
338	break;
339	case Node.ENTITY_REFERENCE_NODE:
340	type="ENTITY_REFERENCE_NODE";
341	break;
342	case Node.NOTATION_NODE:
343	type="NOTATION_NODE";
344	break;
345	case Node.PROCESSING_INSTRUCTION_NODE:
346	type="PROCESSING_INSTRUCTION_NODE";
347	break;
348	case Node.TEXT_NODE:
349	type="TEXT_NODE";
350	break;
351	default:
352	type="UNKNOWN";
353	}
354
355	return type;
356	}
357	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: