Context Navigation

source: greenstone3/trunk/src/java/org/greenstone/gsdl3/util/XMLConverter.java@ 16688

Last change on this file since 16688 was 16688, checked in by davidb, 16 years ago
Changed 'Element process(Element)' in ModuleInterface to 'Node process(Node)'. After some deliberation it was decided this is a more useful (generic) layer of the DOM to pass information around in. Helps with the DocType problem when producing XSL Transformed pages, for example. When this was an Element, it would lose track of its DocType. Supporting method provided in XMLConverter 'Element nodeToElement(Node)' which checks a node's type and casts to Element if appropriate, or if a Document, typecasts to that and then extracts the top-level Element. With this fundamental change in ModuleInterface, around 20 files needed to be updated (Actions, Services, etc) that build on top of 'process()' to reflect this change, and use nodeToElement where necessary.
Property svn:keywords set to `Author Date Id Revision`
File size: 10.9 KB

Line
1	/*
2	* XMLConverter.java
3	* Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4	*
5	* This program is free software; you can redistribute it and/or modify
6	* it under the terms of the GNU General Public License as published by
7	* the Free Software Foundation; either version 2 of the License, or
8	* (at your option) any later version.
9	*
10	* This program is distributed in the hope that it will be useful,
11	* but WITHOUT ANY WARRANTY; without even the implied warranty of
12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13	* GNU General Public License for more details.
14	*
15	* You should have received a copy of the GNU General Public License
16	* along with this program; if not, write to the Free Software
17	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18	*/
19	package org.greenstone.gsdl3.util;
20
21	// XML classes
22	import org.w3c.dom.Document;
23	import org.w3c.dom.DocumentType;
24	import org.w3c.dom.Element;
25	import org.w3c.dom.Node;
26	import org.w3c.dom.NodeList;
27	import org.w3c.dom.NamedNodeMap;
28	import org.xml.sax.InputSource;
29	import org.xml.sax.EntityResolver;
30	import org.apache.xerces.parsers.DOMParser;
31	import org.apache.xerces.dom.DocumentImpl; // for new Documents
32
33	// other java classes
34	import java.io.Reader;
35	import java.io.InputStreamReader;
36	import java.io.StringReader;
37	import java.io.File;
38	import java.io.FileInputStream;
39	import java.io.FileReader;
40
41	import org.apache.log4j.*;
42
43	/** XMLConverter - utility class for greenstone
44	*
45	* parses XML Strings into Documents, converts Nodes to Strings
46	* different parsers have different behaviour - can experiment in here
47	* now we only use xerces
48	*
49	* @author <a href="mailto:[email protected]">Katherine Don</a>
50	* @version $Revision: 16688 $
51	*
52	*/
53	public class XMLConverter {
54
55	static Logger logger = Logger.getLogger(org.greenstone.gsdl3.util.XMLConverter.class.getName());
56
57	/** xerces parser */
58	protected DOMParser parser = null;
59
60	private boolean outputEscaping = true;
61
62
63	/** the no-args constructor */
64	public XMLConverter() {
65	try {
66	this.parser = new DOMParser();
67	this.parser.setFeature("http://xml.org/sax/features/validation", false);
68	// don't try and load external DTD - no need if we are not validating, and may cause connection errors if a proxy is not set up.
69	this.parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
70	// a performance test showed that having this on lead to increased
71	// memory use for small-medium docs, and not much gain for large
72	// docs.
73	// http://www.sosnoski.com/opensrc/xmlbench/conclusions.html
74	this.parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
75	} catch (Exception e) {
76	logger.error(e.getMessage());
77	}
78	}
79
80	/** sets the entity resolver. pass in null to unset it */
81	public void setEntityResolver(EntityResolver er) {
82	this.parser.setEntityResolver(er);
83	}
84
85
86	public Element nodeToElement(Node node)
87	{
88	short nodeType = node.getNodeType();
89
90	if (nodeType == Node.DOCUMENT_NODE) {
91	Document docNode = (Document)node;
92	return docNode.getDocumentElement() ;
93	}
94	else if (nodeType == Node.ELEMENT_NODE) {
95	return (Element)node;
96	}
97	else {
98	System.err.println("Expecting Document or Element node type but got " + node.getNodeName());
99	System.err.println("Returning null");
100	return null;
101	}
102	}
103
104	/** returns a DOM Document */
105	public Document getDOM(String in) {
106
107	try {
108	Reader reader = new StringReader(in);
109	InputSource xml_source = new InputSource(reader);
110
111	this.parser.parse(xml_source);
112	Document doc = this.parser.getDocument();
113
114	return doc;
115
116	} catch (Exception e) {
117	logger.error(e.getMessage());
118	}
119	return null;
120	}
121
122	/** returns a DOM Document */
123	public Document getDOM(File in) {
124	try {
125	FileReader reader = new FileReader(in);
126	InputSource xml_source = new InputSource(reader);
127	this.parser.parse(xml_source);
128	Document doc = this.parser.getDocument();
129	return doc;
130
131	} catch (Exception e) {
132	logger.error(e.getMessage(), e);
133
134	}
135	return null;
136	}
137
138	/** returns a DOM document */
139	public Document getDOM(File in, String encoding) {
140	try {
141
142	InputStreamReader isr = new InputStreamReader(new FileInputStream(in), encoding);
143	InputSource xml_source = new InputSource(isr);
144
145	this.parser.parse(xml_source);
146	Document doc = this.parser.getDocument();
147
148	return doc;
149
150	} catch (Exception e) {
151	logger.error(e.getMessage());
152	}
153	return null;
154	}
155
156
157	/** creates a new empty DOM Document */
158	public Document newDOM() {
159	Document doc = new DocumentImpl();
160	return doc;
161	}
162
163	/** returns the Node as a String */
164	public String getString(Node xmlNode)
165	{
166	outputEscaping = true;
167	StringBuffer xmlRepresentation = new StringBuffer();
168	getString(xmlNode, xmlRepresentation, 0, false);
169	return xmlRepresentation.toString();
170	}
171
172	/** returns the node as a nicely formatted String - this introduces extra
173	* text nodes if the String is read back in as a DOM, so should only be
174	* used for printing */
175	public String getPrettyString(Node xmlNode) {
176
177	outputEscaping = true;
178	StringBuffer xmlRepresentation = new StringBuffer();
179	getString(xmlNode, xmlRepresentation, 0, true);
180	return xmlRepresentation.toString();
181	}
182
183	private void getString(Node xmlNode, StringBuffer xmlRepresentation,
184	int depth, boolean pretty)
185	{
186
187	if (xmlNode == null) {
188	xmlRepresentation.append("<null>");
189	return;
190	}
191
192	short nodeType = xmlNode.getNodeType();
193	String nodeName = xmlNode.getNodeName();
194
195	if (nodeType == Node.DOCUMENT_NODE) {
196	Document xmlDocNode = (Document)xmlNode;
197
198	if (xmlDocNode.getDoctype() == null) {
199	System.err.println("Doctype is null.");
200	}
201	else {
202
203	DocumentType dt = xmlDocNode.getDoctype();
204
205	String name = dt.getName();
206	String pid = dt.getPublicId();
207	String sid = dt.getSystemId();
208
209	String doctype_str = "<!DOCTYPE " + dt.getName() + " PUBLIC \"" + pid + "\"" + sid + "\">\n";
210
211	xmlRepresentation.append(doctype_str);
212	}
213	getString(xmlDocNode.getDocumentElement(), xmlRepresentation, depth, pretty);
214	return;
215	}
216	// Handle Element nodes
217	if (nodeType == Node.ELEMENT_NODE) {
218	if (pretty) {
219	xmlRepresentation.append("\n");
220	for (int i = 0; i < depth; i++) {
221	xmlRepresentation.append(" ");
222	}
223	}
224
225	// Write opening tag
226	xmlRepresentation.append("<");
227	xmlRepresentation.append(nodeName);
228
229	// Write the node attributes
230	NamedNodeMap nodeAttributes = xmlNode.getAttributes();
231	for (int i = 0; i < nodeAttributes.getLength(); i++) {
232	Node attribute = nodeAttributes.item(i);
233	xmlRepresentation.append(" ");
234	xmlRepresentation.append(attribute.getNodeName());
235	xmlRepresentation.append("=\"");
236	xmlRepresentation.append(attribute.getNodeValue());
237	xmlRepresentation.append("\"");
238	}
239
240	// If the node has no children, close the opening tag and return
241	if (xmlNode.hasChildNodes() == false) {
242	// This produces somewhat ugly output, but it is necessary to compensate
243	// for display bugs in Netscape. Firstly, the space is needed before the
244	// closing bracket otherwise Netscape will ignore some tags (<br/>, for
245	// example). Also, a newline character would be expected after the tag,
246	// but this causes problems with the display of links (the link text
247	// will contain a newline character, which is displayed badly).
248	xmlRepresentation.append(" />");
249	return;
250	}
251
252	// Close the opening tag
253	xmlRepresentation.append(">");
254
255	// Apply recursively to the children of this node
256	// hack for nodes next to text nodes - dont make them pretty
257	// this is needed for text inside a <pre> element - any new lines
258	// or spaces around the span elements show up in the text
259	NodeList children = xmlNode.getChildNodes();
260	boolean do_pretty = pretty;
261	for (int i = 0; i < children.getLength(); i++) {
262	if (children.item(i).getNodeType()==Node.TEXT_NODE) {
263	do_pretty=false; // if there is a text node amongst the children, do teh following nodes in non-pretty mode - hope this doesn't stuff up something else
264	}
265	getString(children.item(i), xmlRepresentation, depth + 1, do_pretty);
266	}
267
268	// Write closing tag
269	if (pretty) {
270	if (xmlRepresentation.charAt(xmlRepresentation.length()-1) == '\n') {
271	for (int i = 0; i < depth; i++)
272	xmlRepresentation.append(" ");
273	}
274	}
275	xmlRepresentation.append("</");
276	xmlRepresentation.append(nodeName);
277	xmlRepresentation.append(">");
278	if (pretty) {
279	xmlRepresentation.append("\n");
280	}
281	}
282
283	// Handle Text nodes
284	else if (nodeType == Node.TEXT_NODE) {
285	String text = xmlNode.getNodeValue();
286
287	// Perform output escaping, if required
288	if (outputEscaping) {
289	text = text.replaceAll("&", "&"); // Must be done first!!
290	text = text.replaceAll("<", "<");
291	text = text.replaceAll(">", ">");
292	text = text.replaceAll("\"", """);
293	text = text.replaceAll("\'", "'");
294	}
295
296	// Remove any control-C characters
297	text = text.replaceAll("" + (char) 3, "");
298	xmlRepresentation.append(text);
299	}
300
301	// Handle Processing Instruction nodes
302	else if (nodeType == Node.PROCESSING_INSTRUCTION_NODE) {
303	if (nodeName.equals("javax.xml.transform.disable-output-escaping")) {
304	outputEscaping = false;
305	}
306	else if (nodeName.equals("javax.xml.transform.enable-output-escaping")) {
307	outputEscaping = true;
308	}
309	else {
310	logger.warn("Unhandled processing instruction " + nodeName);
311	}
312	}
313
314	else if (nodeType == Node.COMMENT_NODE) {
315	String text = xmlNode.getNodeValue();
316	xmlRepresentation.append("<!-- ");
317	xmlRepresentation.append(text);
318	xmlRepresentation.append(" -->");
319	}
320
321
322
323	// A type of node that is not handled yet
324	else {
325	logger.warn("Unknown node type: " + nodeType+" "+getNodeTypeString(nodeType));
326	}
327
328	return;
329	}
330
331	protected static String getNodeTypeString(short node_type) {
332
333	String type = "";
334	switch(node_type) {
335	case Node.ATTRIBUTE_NODE:
336	type="ATTRIBUTE_NODE";
337	break;
338	case Node.CDATA_SECTION_NODE:
339	type="CDATA_SECTION_NODE";
340	break;
341	case Node.COMMENT_NODE:
342	type="COMMENT_NODE";
343	break;
344	case Node.DOCUMENT_FRAGMENT_NODE:
345	type="DOCUMENT_FRAGMENT_NODE";
346	break;
347	case Node.DOCUMENT_NODE:
348	type="DOCUMENT_NODE";
349	break;
350	case Node.DOCUMENT_TYPE_NODE:
351	type="DOCUMENT_TYPE_NODE";
352	break;
353	case Node.ELEMENT_NODE:
354	type="ELEMENT_NODE";
355	break;
356	case Node.ENTITY_NODE:
357	type="ENTITY_NODE";
358	break;
359	case Node.ENTITY_REFERENCE_NODE:
360	type="ENTITY_REFERENCE_NODE";
361	break;
362	case Node.NOTATION_NODE:
363	type="NOTATION_NODE";
364	break;
365	case Node.PROCESSING_INSTRUCTION_NODE:
366	type="PROCESSING_INSTRUCTION_NODE";
367	break;
368	case Node.TEXT_NODE:
369	type="TEXT_NODE";
370	break;
371	default:
372	type="UNKNOWN";
373	}
374
375	return type;
376	}
377	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: