Context Navigation

source: greenstone3/trunk/src/java/org/greenstone/gsdl3/util/XMLConverter.java@ 16780

Last change on this file since 16780 was 16780, checked in by ak19, 16 years ago
Made new nodeToElement method in XMLConverter static so that some classes that need to use just this method, don't have to instantiate an XMLConverter containing DOMParser and all merely to use this method.
Property svn:keywords set to `Author Date Id Revision`
File size: 11.1 KB

Line
1	/*
2	* XMLConverter.java
3	* Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4	*
5	* This program is free software; you can redistribute it and/or modify
6	* it under the terms of the GNU General Public License as published by
7	* the Free Software Foundation; either version 2 of the License, or
8	* (at your option) any later version.
9	*
10	* This program is distributed in the hope that it will be useful,
11	* but WITHOUT ANY WARRANTY; without even the implied warranty of
12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13	* GNU General Public License for more details.
14	*
15	* You should have received a copy of the GNU General Public License
16	* along with this program; if not, write to the Free Software
17	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18	*/
19	package org.greenstone.gsdl3.util;
20
21	// XML classes
22	import org.w3c.dom.Document;
23	import org.w3c.dom.DocumentType;
24	import org.w3c.dom.Element;
25	import org.w3c.dom.Node;
26	import org.w3c.dom.NodeList;
27	import org.w3c.dom.NamedNodeMap;
28	import org.xml.sax.InputSource;
29	import org.xml.sax.EntityResolver;
30	import org.apache.xerces.parsers.DOMParser;
31	import org.apache.xerces.dom.DocumentImpl; // for new Documents
32
33	// other java classes
34	import java.io.Reader;
35	import java.io.InputStreamReader;
36	import java.io.StringReader;
37	import java.io.File;
38	import java.io.FileInputStream;
39	import java.io.FileReader;
40
41	import org.apache.log4j.*;
42
43	/** XMLConverter - utility class for greenstone
44	*
45	* parses XML Strings into Documents, converts Nodes to Strings
46	* different parsers have different behaviour - can experiment in here
47	* now we only use xerces
48	*
49	* @author <a href="mailto:[email protected]">Katherine Don</a>
50	* @version $Revision: 16780 $
51	*
52	*/
53	public class XMLConverter {
54
55	static Logger logger = Logger.getLogger(org.greenstone.gsdl3.util.XMLConverter.class.getName());
56
57	/** xerces parser */
58	protected DOMParser parser = null;
59
60	private boolean outputEscaping = true;
61
62
63	/** the no-args constructor */
64	public XMLConverter() {
65	try {
66	this.parser = new DOMParser();
67	this.parser.setFeature("http://xml.org/sax/features/validation", false);
68	// don't try and load external DTD - no need if we are not validating, and may cause connection errors if a proxy is not set up.
69	this.parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
70	// a performance test showed that having this on lead to increased
71	// memory use for small-medium docs, and not much gain for large
72	// docs.
73	// http://www.sosnoski.com/opensrc/xmlbench/conclusions.html
74	this.parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
75	} catch (Exception e) {
76	logger.error(e.getMessage());
77	}
78	}
79
80	/** sets the entity resolver. pass in null to unset it */
81	public void setEntityResolver(EntityResolver er) {
82	this.parser.setEntityResolver(er);
83	}
84
85	/** Given a Node representing an Element or Document, will return the
86	* Element/docroot Element. Returns null if the Node was not an element. */
87	public static Element nodeToElement(Node node)
88	{
89	short nodeType = node.getNodeType();
90
91	if (nodeType == Node.DOCUMENT_NODE) {
92	Document docNode = (Document)node;
93	return docNode.getDocumentElement() ;
94	}
95	else if (nodeType == Node.ELEMENT_NODE) {
96	return (Element)node;
97	}
98	else {
99	String message = "Expecting Document or Element node type but got "
100	+ node.getNodeName() + "\nReturning null";
101	System.err.println(message);
102	logger.warn(message);
103	return null;
104	}
105	}
106
107	/** returns a DOM Document */
108	public Document getDOM(String in) {
109
110	try {
111	Reader reader = new StringReader(in);
112	InputSource xml_source = new InputSource(reader);
113
114	this.parser.parse(xml_source);
115	Document doc = this.parser.getDocument();
116
117	return doc;
118
119	} catch (Exception e) {
120	logger.error(e.getMessage());
121	}
122	return null;
123	}
124
125	/** returns a DOM Document */
126	public Document getDOM(File in) {
127	try {
128	FileReader reader = new FileReader(in);
129	InputSource xml_source = new InputSource(reader);
130	this.parser.parse(xml_source);
131	Document doc = this.parser.getDocument();
132	return doc;
133
134	} catch (Exception e) {
135	logger.error(e.getMessage(), e);
136
137	}
138	return null;
139	}
140
141	/** returns a DOM document */
142	public Document getDOM(File in, String encoding) {
143	try {
144
145	InputStreamReader isr = new InputStreamReader(new FileInputStream(in), encoding);
146	InputSource xml_source = new InputSource(isr);
147
148	this.parser.parse(xml_source);
149	Document doc = this.parser.getDocument();
150
151	return doc;
152
153	} catch (Exception e) {
154	logger.error(e.getMessage());
155	}
156	return null;
157	}
158
159
160	/** creates a new empty DOM Document */
161	public Document newDOM() {
162	Document doc = new DocumentImpl();
163	return doc;
164	}
165
166	/** returns the Node as a String */
167	public String getString(Node xmlNode)
168	{
169	outputEscaping = true;
170	StringBuffer xmlRepresentation = new StringBuffer();
171	getString(xmlNode, xmlRepresentation, 0, false);
172	return xmlRepresentation.toString();
173	}
174
175	/** returns the node as a nicely formatted String - this introduces extra
176	* text nodes if the String is read back in as a DOM, so should only be
177	* used for printing */
178	public String getPrettyString(Node xmlNode) {
179
180	outputEscaping = true;
181	StringBuffer xmlRepresentation = new StringBuffer();
182	getString(xmlNode, xmlRepresentation, 0, true);
183	return xmlRepresentation.toString();
184	}
185
186	private void getString(Node xmlNode, StringBuffer xmlRepresentation,
187	int depth, boolean pretty)
188	{
189
190	if (xmlNode == null) {
191	xmlRepresentation.append("<null>");
192	return;
193	}
194
195	short nodeType = xmlNode.getNodeType();
196	String nodeName = xmlNode.getNodeName();
197
198	if (nodeType == Node.DOCUMENT_NODE) {
199	Document xmlDocNode = (Document)xmlNode;
200
201	if (xmlDocNode.getDoctype() == null) {
202	System.err.println("Doctype is null.");
203	}
204	else {
205
206	DocumentType dt = xmlDocNode.getDoctype();
207
208	String name = dt.getName();
209	String pid = dt.getPublicId();
210	String sid = dt.getSystemId();
211
212	String doctype_str = "<!DOCTYPE " + dt.getName() + " PUBLIC \"" + pid + "\" \"" + sid + "\">\n";
213
214	xmlRepresentation.append(doctype_str);
215	}
216	getString(xmlDocNode.getDocumentElement(), xmlRepresentation, depth, pretty);
217	return;
218	}
219	// Handle Element nodes
220	if (nodeType == Node.ELEMENT_NODE) {
221	if (pretty) {
222	xmlRepresentation.append("\n");
223	for (int i = 0; i < depth; i++) {
224	xmlRepresentation.append(" ");
225	}
226	}
227
228	// Write opening tag
229	xmlRepresentation.append("<");
230	xmlRepresentation.append(nodeName);
231
232	// Write the node attributes
233	NamedNodeMap nodeAttributes = xmlNode.getAttributes();
234	for (int i = 0; i < nodeAttributes.getLength(); i++) {
235	Node attribute = nodeAttributes.item(i);
236	xmlRepresentation.append(" ");
237	xmlRepresentation.append(attribute.getNodeName());
238	xmlRepresentation.append("=\"");
239	xmlRepresentation.append(attribute.getNodeValue());
240	xmlRepresentation.append("\"");
241	}
242
243	// If the node has no children, close the opening tag and return
244	if (xmlNode.hasChildNodes() == false) {
245	// This produces somewhat ugly output, but it is necessary to compensate
246	// for display bugs in Netscape. Firstly, the space is needed before the
247	// closing bracket otherwise Netscape will ignore some tags (<br/>, for
248	// example). Also, a newline character would be expected after the tag,
249	// but this causes problems with the display of links (the link text
250	// will contain a newline character, which is displayed badly).
251	xmlRepresentation.append(" />");
252	return;
253	}
254
255	// Close the opening tag
256	xmlRepresentation.append(">");
257
258	// Apply recursively to the children of this node
259	// hack for nodes next to text nodes - dont make them pretty
260	// this is needed for text inside a <pre> element - any new lines
261	// or spaces around the span elements show up in the text
262	NodeList children = xmlNode.getChildNodes();
263	boolean do_pretty = pretty;
264	for (int i = 0; i < children.getLength(); i++) {
265	if (children.item(i).getNodeType()==Node.TEXT_NODE) {
266	do_pretty=false; // if there is a text node amongst the children, do teh following nodes in non-pretty mode - hope this doesn't stuff up something else
267	}
268	getString(children.item(i), xmlRepresentation, depth + 1, do_pretty);
269	}
270
271	// Write closing tag
272	if (pretty) {
273	if (xmlRepresentation.charAt(xmlRepresentation.length()-1) == '\n') {
274	for (int i = 0; i < depth; i++)
275	xmlRepresentation.append(" ");
276	}
277	}
278	xmlRepresentation.append("</");
279	xmlRepresentation.append(nodeName);
280	xmlRepresentation.append(">");
281	if (pretty) {
282	xmlRepresentation.append("\n");
283	}
284	}
285
286	// Handle Text nodes
287	else if (nodeType == Node.TEXT_NODE) {
288	String text = xmlNode.getNodeValue();
289
290	// Perform output escaping, if required
291	if (outputEscaping) {
292	text = text.replaceAll("&", "&"); // Must be done first!!
293	text = text.replaceAll("<", "<");
294	text = text.replaceAll(">", ">");
295	text = text.replaceAll("\"", """);
296	text = text.replaceAll("\'", "'");
297	}
298
299	// Remove any control-C characters
300	text = text.replaceAll("" + (char) 3, "");
301	xmlRepresentation.append(text);
302	}
303
304	// Handle Processing Instruction nodes
305	else if (nodeType == Node.PROCESSING_INSTRUCTION_NODE) {
306	if (nodeName.equals("javax.xml.transform.disable-output-escaping")) {
307	outputEscaping = false;
308	}
309	else if (nodeName.equals("javax.xml.transform.enable-output-escaping")) {
310	outputEscaping = true;
311	}
312	else {
313	logger.warn("Unhandled processing instruction " + nodeName);
314	}
315	}
316
317	else if (nodeType == Node.COMMENT_NODE) {
318	String text = xmlNode.getNodeValue();
319	xmlRepresentation.append("<!-- ");
320	xmlRepresentation.append(text);
321	xmlRepresentation.append(" -->");
322	}
323
324
325
326	// A type of node that is not handled yet
327	else {
328	logger.warn("Unknown node type: " + nodeType+" "+getNodeTypeString(nodeType));
329	}
330
331	return;
332	}
333
334	protected static String getNodeTypeString(short node_type) {
335
336	String type = "";
337	switch(node_type) {
338	case Node.ATTRIBUTE_NODE:
339	type="ATTRIBUTE_NODE";
340	break;
341	case Node.CDATA_SECTION_NODE:
342	type="CDATA_SECTION_NODE";
343	break;
344	case Node.COMMENT_NODE:
345	type="COMMENT_NODE";
346	break;
347	case Node.DOCUMENT_FRAGMENT_NODE:
348	type="DOCUMENT_FRAGMENT_NODE";
349	break;
350	case Node.DOCUMENT_NODE:
351	type="DOCUMENT_NODE";
352	break;
353	case Node.DOCUMENT_TYPE_NODE:
354	type="DOCUMENT_TYPE_NODE";
355	break;
356	case Node.ELEMENT_NODE:
357	type="ELEMENT_NODE";
358	break;
359	case Node.ENTITY_NODE:
360	type="ENTITY_NODE";
361	break;
362	case Node.ENTITY_REFERENCE_NODE:
363	type="ENTITY_REFERENCE_NODE";
364	break;
365	case Node.NOTATION_NODE:
366	type="NOTATION_NODE";
367	break;
368	case Node.PROCESSING_INSTRUCTION_NODE:
369	type="PROCESSING_INSTRUCTION_NODE";
370	break;
371	case Node.TEXT_NODE:
372	type="TEXT_NODE";
373	break;
374	default:
375	type="UNKNOWN";
376	}
377
378	return type;
379	}
380	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: