Context Navigation

source: greenstone3/trunk/src/java/org/greenstone/gsdl3/util/XMLConverter.java@ 16690

Last change on this file since 16690 was 16690, checked in by max, 16 years ago
Fix the missing space and double quotes in the Doctype output.
Property svn:keywords set to `Author Date Id Revision`
File size: 10.9 KB

Line
1	/*
2	* XMLConverter.java
3	* Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4	*
5	* This program is free software; you can redistribute it and/or modify
6	* it under the terms of the GNU General Public License as published by
7	* the Free Software Foundation; either version 2 of the License, or
8	* (at your option) any later version.
9	*
10	* This program is distributed in the hope that it will be useful,
11	* but WITHOUT ANY WARRANTY; without even the implied warranty of
12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13	* GNU General Public License for more details.
14	*
15	* You should have received a copy of the GNU General Public License
16	* along with this program; if not, write to the Free Software
17	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18	*/
19	package org.greenstone.gsdl3.util;
20
21	// XML classes
22	import org.w3c.dom.Document;
23	import org.w3c.dom.DocumentType;
24	import org.w3c.dom.Element;
25	import org.w3c.dom.Node;
26	import org.w3c.dom.NodeList;
27	import org.w3c.dom.NamedNodeMap;
28	import org.xml.sax.InputSource;
29	import org.xml.sax.EntityResolver;
30	import org.apache.xerces.parsers.DOMParser;
31	import org.apache.xerces.dom.DocumentImpl; // for new Documents
32
33	// other java classes
34	import java.io.Reader;
35	import java.io.InputStreamReader;
36	import java.io.StringReader;
37	import java.io.File;
38	import java.io.FileInputStream;
39	import java.io.FileReader;
40
41	import org.apache.log4j.*;
42
43	/** XMLConverter - utility class for greenstone
44	*
45	* parses XML Strings into Documents, converts Nodes to Strings
46	* different parsers have different behaviour - can experiment in here
47	* now we only use xerces
48	*
49	* @author <a href="mailto:[email protected]">Katherine Don</a>
50	* @version $Revision: 16690 $
51	*
52	*/
53	public class XMLConverter {
54
55	static Logger logger = Logger.getLogger(org.greenstone.gsdl3.util.XMLConverter.class.getName());
56
57	/** xerces parser */
58	protected DOMParser parser = null;
59
60	private boolean outputEscaping = true;
61
62
63	/** the no-args constructor */
64	public XMLConverter() {
65	try {
66	this.parser = new DOMParser();
67	this.parser.setFeature("http://xml.org/sax/features/validation", false);
68	// don't try and load external DTD - no need if we are not validating, and may cause connection errors if a proxy is not set up.
69	this.parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
70	// a performance test showed that having this on lead to increased
71	// memory use for small-medium docs, and not much gain for large
72	// docs.
73	// http://www.sosnoski.com/opensrc/xmlbench/conclusions.html
74	this.parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
75	} catch (Exception e) {
76	logger.error(e.getMessage());
77	}
78	}
79
80	/** sets the entity resolver. pass in null to unset it */
81	public void setEntityResolver(EntityResolver er) {
82	this.parser.setEntityResolver(er);
83	}
84
85
86	public Element nodeToElement(Node node)
87	{
88	short nodeType = node.getNodeType();
89
90	if (nodeType == Node.DOCUMENT_NODE) {
91	Document docNode = (Document)node;
92	return docNode.getDocumentElement() ;
93	}
94	else if (nodeType == Node.ELEMENT_NODE) {
95	return (Element)node;
96	}
97	else {
98	System.err.println("Expecting Document or Element node type but got " + node.getNodeName());
99	System.err.println("Returning null");
100	return null;
101	}
102	}
103
104	/** returns a DOM Document */
105	public Document getDOM(String in) {
106
107	try {
108	Reader reader = new StringReader(in);
109	InputSource xml_source = new InputSource(reader);
110
111	this.parser.parse(xml_source);
112	Document doc = this.parser.getDocument();
113
114	return doc;
115
116	} catch (Exception e) {
117	logger.error(e.getMessage());
118	}
119	return null;
120	}
121
122	/** returns a DOM Document */
123	public Document getDOM(File in) {
124	try {
125	FileReader reader = new FileReader(in);
126	InputSource xml_source = new InputSource(reader);
127	this.parser.parse(xml_source);
128	Document doc = this.parser.getDocument();
129	return doc;
130
131	} catch (Exception e) {
132	logger.error(e.getMessage(), e);
133
134	}
135	return null;
136	}
137
138	/** returns a DOM document */
139	public Document getDOM(File in, String encoding) {
140	try {
141
142	InputStreamReader isr = new InputStreamReader(new FileInputStream(in), encoding);
143	InputSource xml_source = new InputSource(isr);
144
145	this.parser.parse(xml_source);
146	Document doc = this.parser.getDocument();
147
148	return doc;
149
150	} catch (Exception e) {
151	logger.error(e.getMessage());
152	}
153	return null;
154	}
155
156
157	/** creates a new empty DOM Document */
158	public Document newDOM() {
159	Document doc = new DocumentImpl();
160	return doc;
161	}
162
163	/** returns the Node as a String */
164	public String getString(Node xmlNode)
165	{
166	outputEscaping = true;
167	StringBuffer xmlRepresentation = new StringBuffer();
168	getString(xmlNode, xmlRepresentation, 0, false);
169	return xmlRepresentation.toString();
170	}
171
172	/** returns the node as a nicely formatted String - this introduces extra
173	* text nodes if the String is read back in as a DOM, so should only be
174	* used for printing */
175	public String getPrettyString(Node xmlNode) {
176
177	outputEscaping = true;
178	StringBuffer xmlRepresentation = new StringBuffer();
179	getString(xmlNode, xmlRepresentation, 0, true);
180	return xmlRepresentation.toString();
181	}
182
183	private void getString(Node xmlNode, StringBuffer xmlRepresentation,
184	int depth, boolean pretty)
185	{
186
187	if (xmlNode == null) {
188	xmlRepresentation.append("<null>");
189	return;
190	}
191
192	short nodeType = xmlNode.getNodeType();
193	String nodeName = xmlNode.getNodeName();
194
195	if (nodeType == Node.DOCUMENT_NODE) {
196	Document xmlDocNode = (Document)xmlNode;
197
198	if (xmlDocNode.getDoctype() == null) {
199	System.err.println("Doctype is null.");
200	}
201	else {
202
203	DocumentType dt = xmlDocNode.getDoctype();
204
205	String name = dt.getName();
206	String pid = dt.getPublicId();
207	String sid = dt.getSystemId();
208
209	String doctype_str = "<!DOCTYPE " + dt.getName() + " PUBLIC \"" + pid + "\" \"" + sid + "\">\n";
210
211	xmlRepresentation.append(doctype_str);
212	}
213	getString(xmlDocNode.getDocumentElement(), xmlRepresentation, depth, pretty);
214	return;
215	}
216	// Handle Element nodes
217	if (nodeType == Node.ELEMENT_NODE) {
218	if (pretty) {
219	xmlRepresentation.append("\n");
220	for (int i = 0; i < depth; i++) {
221	xmlRepresentation.append(" ");
222	}
223	}
224
225	// Write opening tag
226	xmlRepresentation.append("<");
227	xmlRepresentation.append(nodeName);
228
229	// Write the node attributes
230	NamedNodeMap nodeAttributes = xmlNode.getAttributes();
231	for (int i = 0; i < nodeAttributes.getLength(); i++) {
232	Node attribute = nodeAttributes.item(i);
233	xmlRepresentation.append(" ");
234	xmlRepresentation.append(attribute.getNodeName());
235	xmlRepresentation.append("=\"");
236	xmlRepresentation.append(attribute.getNodeValue());
237	xmlRepresentation.append("\"");
238	}
239
240	// If the node has no children, close the opening tag and return
241	if (xmlNode.hasChildNodes() == false) {
242	// This produces somewhat ugly output, but it is necessary to compensate
243	// for display bugs in Netscape. Firstly, the space is needed before the
244	// closing bracket otherwise Netscape will ignore some tags (<br/>, for
245	// example). Also, a newline character would be expected after the tag,
246	// but this causes problems with the display of links (the link text
247	// will contain a newline character, which is displayed badly).
248	xmlRepresentation.append(" />");
249	return;
250	}
251
252	// Close the opening tag
253	xmlRepresentation.append(">");
254
255	// Apply recursively to the children of this node
256	// hack for nodes next to text nodes - dont make them pretty
257	// this is needed for text inside a <pre> element - any new lines
258	// or spaces around the span elements show up in the text
259	NodeList children = xmlNode.getChildNodes();
260	boolean do_pretty = pretty;
261	for (int i = 0; i < children.getLength(); i++) {
262	if (children.item(i).getNodeType()==Node.TEXT_NODE) {
263	do_pretty=false; // if there is a text node amongst the children, do teh following nodes in non-pretty mode - hope this doesn't stuff up something else
264	}
265	getString(children.item(i), xmlRepresentation, depth + 1, do_pretty);
266	}
267
268	// Write closing tag
269	if (pretty) {
270	if (xmlRepresentation.charAt(xmlRepresentation.length()-1) == '\n') {
271	for (int i = 0; i < depth; i++)
272	xmlRepresentation.append(" ");
273	}
274	}
275	xmlRepresentation.append("</");
276	xmlRepresentation.append(nodeName);
277	xmlRepresentation.append(">");
278	if (pretty) {
279	xmlRepresentation.append("\n");
280	}
281	}
282
283	// Handle Text nodes
284	else if (nodeType == Node.TEXT_NODE) {
285	String text = xmlNode.getNodeValue();
286
287	// Perform output escaping, if required
288	if (outputEscaping) {
289	text = text.replaceAll("&", "&"); // Must be done first!!
290	text = text.replaceAll("<", "<");
291	text = text.replaceAll(">", ">");
292	text = text.replaceAll("\"", """);
293	text = text.replaceAll("\'", "'");
294	}
295
296	// Remove any control-C characters
297	text = text.replaceAll("" + (char) 3, "");
298	xmlRepresentation.append(text);
299	}
300
301	// Handle Processing Instruction nodes
302	else if (nodeType == Node.PROCESSING_INSTRUCTION_NODE) {
303	if (nodeName.equals("javax.xml.transform.disable-output-escaping")) {
304	outputEscaping = false;
305	}
306	else if (nodeName.equals("javax.xml.transform.enable-output-escaping")) {
307	outputEscaping = true;
308	}
309	else {
310	logger.warn("Unhandled processing instruction " + nodeName);
311	}
312	}
313
314	else if (nodeType == Node.COMMENT_NODE) {
315	String text = xmlNode.getNodeValue();
316	xmlRepresentation.append("<!-- ");
317	xmlRepresentation.append(text);
318	xmlRepresentation.append(" -->");
319	}
320
321
322
323	// A type of node that is not handled yet
324	else {
325	logger.warn("Unknown node type: " + nodeType+" "+getNodeTypeString(nodeType));
326	}
327
328	return;
329	}
330
331	protected static String getNodeTypeString(short node_type) {
332
333	String type = "";
334	switch(node_type) {
335	case Node.ATTRIBUTE_NODE:
336	type="ATTRIBUTE_NODE";
337	break;
338	case Node.CDATA_SECTION_NODE:
339	type="CDATA_SECTION_NODE";
340	break;
341	case Node.COMMENT_NODE:
342	type="COMMENT_NODE";
343	break;
344	case Node.DOCUMENT_FRAGMENT_NODE:
345	type="DOCUMENT_FRAGMENT_NODE";
346	break;
347	case Node.DOCUMENT_NODE:
348	type="DOCUMENT_NODE";
349	break;
350	case Node.DOCUMENT_TYPE_NODE:
351	type="DOCUMENT_TYPE_NODE";
352	break;
353	case Node.ELEMENT_NODE:
354	type="ELEMENT_NODE";
355	break;
356	case Node.ENTITY_NODE:
357	type="ENTITY_NODE";
358	break;
359	case Node.ENTITY_REFERENCE_NODE:
360	type="ENTITY_REFERENCE_NODE";
361	break;
362	case Node.NOTATION_NODE:
363	type="NOTATION_NODE";
364	break;
365	case Node.PROCESSING_INSTRUCTION_NODE:
366	type="PROCESSING_INSTRUCTION_NODE";
367	break;
368	case Node.TEXT_NODE:
369	type="TEXT_NODE";
370	break;
371	default:
372	type="UNKNOWN";
373	}
374
375	return type;
376	}
377	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: