Context Navigation

source: greenstone3/trunk/src/java/org/greenstone/gsdl3/util/XMLConverter.java@ 18434

Last change on this file since 18434 was 18434, checked in by max, 15 years ago
Provides a new DOM method that allows a document to be created with a specified DOCTYPE. Adds a reusable ErrorHandler class for parsing that deals with the warnings, regular errors and fatal errors that may occur when parsing XML. XMLConverter's parser object has an ErrorHandler attached to it when initialised.
Property svn:keywords set to `Author Date Id Revision`
File size: 14.7 KB

Line
1	/*
2	* XMLConverter.java
3	* Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4	*
5	* This program is free software; you can redistribute it and/or modify
6	* it under the terms of the GNU General Public License as published by
7	* the Free Software Foundation; either version 2 of the License, or
8	* (at your option) any later version.
9	*
10	* This program is distributed in the hope that it will be useful,
11	* but WITHOUT ANY WARRANTY; without even the implied warranty of
12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13	* GNU General Public License for more details.
14	*
15	* You should have received a copy of the GNU General Public License
16	* along with this program; if not, write to the Free Software
17	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18	*/
19	package org.greenstone.gsdl3.util;
20
21	// XML classes
22	import org.w3c.dom.DOMImplementation;
23	import org.w3c.dom.Document;
24	import org.w3c.dom.DocumentType;
25	import org.w3c.dom.Element;
26	import org.w3c.dom.Node;
27	import org.w3c.dom.NodeList;
28	import org.w3c.dom.NamedNodeMap;
29	import org.xml.sax.InputSource;
30	import org.xml.sax.EntityResolver;
31	import org.xml.sax.ErrorHandler;
32	import org.xml.sax.SAXParseException;
33	import org.apache.xerces.parsers.DOMParser;
34	import org.apache.xerces.dom.*; // for new Documents
35
36	// other java classes
37	import java.io.Reader;
38	import java.io.InputStreamReader;
39	import java.io.StringReader;
40	import java.io.File;
41	import java.io.FileInputStream;
42	import java.io.FileReader;
43
44	import org.apache.log4j.*;
45
46	/** XMLConverter - utility class for greenstone
47	*
48	* parses XML Strings into Documents, converts Nodes to Strings
49	* different parsers have different behaviour - can experiment in here
50	* now we only use xerces
51	*
52	*/
53	public class XMLConverter {
54
55	static Logger logger = Logger.getLogger(org.greenstone.gsdl3.util.XMLConverter.class.getName());
56
57	/** xerces parser */
58	protected DOMParser parser = null;
59
60	private static boolean outputEscaping = true;
61
62
63	/** the no-args constructor */
64	public XMLConverter() {
65	try {
66	this.parser = new DOMParser();
67	this.parser.setFeature("http://xml.org/sax/features/validation", false);
68	// don't try and load external DTD - no need if we are not validating, and may cause connection errors if a proxy is not set up.
69	this.parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
70	// a performance test showed that having this on lead to increased
71	// memory use for small-medium docs, and not much gain for large
72	// docs.
73	// http://www.sosnoski.com/opensrc/xmlbench/conclusions.html
74	this.parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
75	// add an errorhandler to the parser which will store useful a error message on encountering fatal errors, errors and warnings when parsing
76	// this errormessage can then be converted to xhtml and displayed in a browser.
77	this.parser.setErrorHandler(new ParseErrorHandler());
78	} catch (Exception e) {
79	logger.error(e.getMessage());
80	}
81	}
82
83	/** sets the entity resolver. pass in null to unset it */
84	public void setEntityResolver(EntityResolver er) {
85	this.parser.setEntityResolver(er);
86	}
87
88	/** Given a Node representing an Element or Document, will return the
89	* Element/docroot Element. Returns null if the Node was not an element. */
90	public static Element nodeToElement(Node node)
91	{
92	if(node == null) {
93	return null;
94	}
95	short nodeType = node.getNodeType();
96
97	if (nodeType == Node.DOCUMENT_NODE) {
98	Document docNode = (Document)node;
99	return docNode.getDocumentElement() ;
100	}
101	else if (nodeType == Node.ELEMENT_NODE) {
102	return (Element)node;
103	}
104	else {
105	String message = "Expecting Document or Element node type but got "
106	+ node.getNodeName() + "\nReturning null";
107	System.err.println(message);
108	logger.warn(message);
109	return null;
110	}
111	}
112
113	/** returns a DOM Document */
114	public Document getDOM(String in) {
115
116	try {
117	Reader reader = new StringReader(in);
118	InputSource xml_source = new InputSource(reader);
119
120	this.parser.parse(xml_source);
121	Document doc = this.parser.getDocument();
122
123	return doc;
124
125	} catch (Exception e) {
126	logger.error(e.getMessage());
127	}
128	return null;
129	}
130
131	/** returns a DOM Document */
132	public Document getDOM(File in) {
133	try {
134	FileReader reader = new FileReader(in);
135	InputSource xml_source = new InputSource(reader);
136	this.parser.parse(xml_source);
137	Document doc = this.parser.getDocument();
138	return doc;
139
140	} catch (Exception e) {
141	logger.error(e.getMessage(), e);
142
143	}
144	return null;
145	}
146
147	/** returns a DOM document */
148	public Document getDOM(File in, String encoding) {
149	try {
150
151	InputStreamReader isr = new InputStreamReader(new FileInputStream(in), encoding);
152	InputSource xml_source = new InputSource(isr);
153
154	this.parser.parse(xml_source);
155	Document doc = this.parser.getDocument();
156
157	return doc;
158
159	} catch (Exception e) {
160	logger.error(e.getMessage());
161	}
162	return null;
163	}
164
165
166	/** creates a new empty DOM Document */
167	public static Document newDOM() {
168	Document doc = new DocumentImpl();
169	return doc;
170	}
171
172	/**
173	* This method's parameters represent the parts of the Doctype of this
174	* Document that is to be created.
175	* For more info see
176	* http://xerces.apache.org/xerces-j/apiDocs/org/apache/xerces/dom/DocumentTypeImpl.html#DocumentTypeImpl(org.apache.xerces.dom.CoreDocumentImpl,%20java.lang.String)
177	*
178	* */
179	public static Document newDOM(String qualifiedName, String publicID, String systemID) {
180	// create empty DOM document
181	DocumentImpl docImpl = new DocumentImpl();
182
183	// Need to use the document to create the docType for it
184	DocumentType myDocType = new DocumentTypeImpl(docImpl, qualifiedName, publicID, systemID);
185
186	// Although we have created the docType using the document, we need to still
187	// put it into the empty document we just created
188	try{
189	docImpl.appendChild(myDocType);
190	} catch(Exception e) {
191	System.out.println("Could not append docType because: " + e) ;
192	}
193
194	// return the document containing a DocType
195	return docImpl;
196	}
197
198	/** returns the Node as a String */
199	public static String getString(Node xmlNode)
200	{
201	outputEscaping = true;
202	StringBuffer xmlRepresentation = new StringBuffer();
203	getString(xmlNode, xmlRepresentation, 0, false);
204	return xmlRepresentation.toString();
205	}
206
207	/** returns the node as a nicely formatted String - this introduces extra
208	* text nodes if the String is read back in as a DOM, so should only be
209	* used for printing */
210	public static String getPrettyString(Node xmlNode) {
211
212	outputEscaping = true;
213	StringBuffer xmlRepresentation = new StringBuffer();
214	getString(xmlNode, xmlRepresentation, 0, true);
215	return xmlRepresentation.toString();
216	}
217
218	private static void getString(Node xmlNode, StringBuffer xmlRepresentation,
219	int depth, boolean pretty)
220	{
221
222	if (xmlNode == null) {
223	xmlRepresentation.append("<null>");
224	return;
225	}
226
227	short nodeType = xmlNode.getNodeType();
228	String nodeName = xmlNode.getNodeName();
229
230	if (nodeType == Node.DOCUMENT_NODE) {
231	Document xmlDocNode = (Document)xmlNode;
232
233	//if (xmlDocNode.getDoctype() == null) {
234	//System.err.println("Doctype is null.");
235	//}
236	//else {
237	if (xmlDocNode.getDoctype() != null) {
238	DocumentType dt = xmlDocNode.getDoctype();
239
240	String name = dt.getName();
241	String pid = dt.getPublicId();
242	String sid = dt.getSystemId();
243
244	String doctype_str = "<!DOCTYPE " + dt.getName() + " PUBLIC \"" + pid + "\" \"" + sid + "\">\n";
245
246	xmlRepresentation.append(doctype_str);
247	}
248	getString(xmlDocNode.getDocumentElement(), xmlRepresentation, depth, pretty);
249	return;
250	}
251	// Handle Element nodes
252	if (nodeType == Node.ELEMENT_NODE) {
253	if (pretty) {
254	xmlRepresentation.append("\n");
255	for (int i = 0; i < depth; i++) {
256	xmlRepresentation.append(" ");
257	}
258	}
259
260	// Write opening tag
261	xmlRepresentation.append("<");
262	xmlRepresentation.append(nodeName);
263
264	// Write the node attributes
265	NamedNodeMap nodeAttributes = xmlNode.getAttributes();
266	for (int i = 0; i < nodeAttributes.getLength(); i++) {
267	Node attribute = nodeAttributes.item(i);
268	xmlRepresentation.append(" ");
269	xmlRepresentation.append(attribute.getNodeName());
270	xmlRepresentation.append("=\"");
271	xmlRepresentation.append(attribute.getNodeValue());
272	xmlRepresentation.append("\"");
273	}
274
275	// If the node has no children, close the opening tag and return
276	if (xmlNode.hasChildNodes() == false) {
277	// This produces somewhat ugly output, but it is necessary to compensate
278	// for display bugs in Netscape. Firstly, the space is needed before the
279	// closing bracket otherwise Netscape will ignore some tags (<br/>, for
280	// example). Also, a newline character would be expected after the tag,
281	// but this causes problems with the display of links (the link text
282	// will contain a newline character, which is displayed badly).
283	xmlRepresentation.append(" />");
284	return;
285	}
286
287	// Close the opening tag
288	xmlRepresentation.append(">");
289
290	// Apply recursively to the children of this node
291	// hack for nodes next to text nodes - dont make them pretty
292	// this is needed for text inside a <pre> element - any new lines
293	// or spaces around the span elements show up in the text
294	NodeList children = xmlNode.getChildNodes();
295	boolean do_pretty = pretty;
296	for (int i = 0; i < children.getLength(); i++) {
297	if (children.item(i).getNodeType()==Node.TEXT_NODE) {
298	do_pretty=false; // if there is a text node amongst the children, do teh following nodes in non-pretty mode - hope this doesn't stuff up something else
299	}
300	getString(children.item(i), xmlRepresentation, depth + 1, do_pretty);
301	}
302
303	// Write closing tag
304	if (pretty) {
305	if (xmlRepresentation.charAt(xmlRepresentation.length()-1) == '\n') {
306	for (int i = 0; i < depth; i++)
307	xmlRepresentation.append(" ");
308	}
309	}
310	xmlRepresentation.append("</");
311	xmlRepresentation.append(nodeName);
312	xmlRepresentation.append(">");
313	if (pretty) {
314	xmlRepresentation.append("\n");
315	}
316	}
317
318	// Handle Text nodes
319	else if (nodeType == Node.TEXT_NODE) {
320	String text = xmlNode.getNodeValue();
321
322	// Perform output escaping, if required
323	if (outputEscaping) {
324	text = text.replaceAll("&", "&"); // Must be done first!!
325	text = text.replaceAll("<", "<");
326	text = text.replaceAll(">", ">");
327	text = text.replaceAll("\"", """);
328	text = text.replaceAll("\'", "'");
329	}
330
331	// Remove any control-C characters
332	text = text.replaceAll("" + (char) 3, "");
333	xmlRepresentation.append(text);
334	}
335
336	// Handle Processing Instruction nodes
337	else if (nodeType == Node.PROCESSING_INSTRUCTION_NODE) {
338	if (nodeName.equals("javax.xml.transform.disable-output-escaping")) {
339	outputEscaping = false;
340	}
341	else if (nodeName.equals("javax.xml.transform.enable-output-escaping")) {
342	outputEscaping = true;
343	}
344	else {
345	logger.warn("Unhandled processing instruction " + nodeName);
346	}
347	}
348
349	else if (nodeType == Node.COMMENT_NODE) {
350	String text = xmlNode.getNodeValue();
351	xmlRepresentation.append("<!-- ");
352	xmlRepresentation.append(text);
353	xmlRepresentation.append(" -->");
354	}
355
356
357
358	// A type of node that is not handled yet
359	else {
360	logger.warn("Unknown node type: " + nodeType+" "+getNodeTypeString(nodeType));
361	}
362
363	return;
364	}
365
366	protected static String getNodeTypeString(short node_type) {
367
368	String type = "";
369	switch(node_type) {
370	case Node.ATTRIBUTE_NODE:
371	type="ATTRIBUTE_NODE";
372	break;
373	case Node.CDATA_SECTION_NODE:
374	type="CDATA_SECTION_NODE";
375	break;
376	case Node.COMMENT_NODE:
377	type="COMMENT_NODE";
378	break;
379	case Node.DOCUMENT_FRAGMENT_NODE:
380	type="DOCUMENT_FRAGMENT_NODE";
381	break;
382	case Node.DOCUMENT_NODE:
383	type="DOCUMENT_NODE";
384	break;
385	case Node.DOCUMENT_TYPE_NODE:
386	type="DOCUMENT_TYPE_NODE";
387	break;
388	case Node.ELEMENT_NODE:
389	type="ELEMENT_NODE";
390	break;
391	case Node.ENTITY_NODE:
392	type="ENTITY_NODE";
393	break;
394	case Node.ENTITY_REFERENCE_NODE:
395	type="ENTITY_REFERENCE_NODE";
396	break;
397	case Node.NOTATION_NODE:
398	type="NOTATION_NODE";
399	break;
400	case Node.PROCESSING_INSTRUCTION_NODE:
401	type="PROCESSING_INSTRUCTION_NODE";
402	break;
403	case Node.TEXT_NODE:
404	type="TEXT_NODE";
405	break;
406	default:
407	type="UNKNOWN";
408	}
409
410	return type;
411	}
412
413	// returns null if there no error occurred during parsing, or else returns the error message
414	public String getParseErrorMessage() {
415	ParseErrorHandler errorHandler = (ParseErrorHandler)this.parser.getErrorHandler();
416	return errorHandler.getErrorMessage();
417	}
418
419	// Errorhandler for SAXParseExceptions that are errors, fatal errors or warnings. This class can be used to
420	// register a handler for any fatal errors, errors and warnings that may occur when parsing an xml file. The
421	// errors are printed both to the greenstone.log and to the tomcat console (System.err), and the error message
422	// is stored in the errorMessage variable so that it can be retrieved and be used to generate an xhtml error page.
423	static public class ParseErrorHandler implements ErrorHandler {
424	protected String errorMessage = null;
425
426	// Receive notification of a recoverable error.
427	public void error(SAXParseException exception) {
428	handleError("Error:\n", exception);
429	}
430	// Receive notification of a non-recoverable error.
431	public void fatalError(SAXParseException exception) {
432	handleError("Fatal Error:\n", exception);
433	}
434	// Receive notification of a warning.
435	public void warning(SAXParseException exception) {
436	handleError("Warning:\n", exception);
437	}
438
439	public String toString(SAXParseException e) {
440	String msg = e.getMessage();
441	msg += "\nOn line(column): " + e.getLineNumber() + "(" + e.getColumnNumber() + ")";
442	msg += (e.getPublicId() != null) ? ("\npublic ID: " + e.getPublicId()) : "\nNo public ID";
443	msg += (e.getSystemId() != null) ? ("\nsystem ID: " + e.getSystemId()) : "\nNo system ID";
444
445	return msg;
446	}
447
448	// clears the errorPage variable after first call to this method
449	public String getErrorMessage() {
450	String errMsg = this.errorMessage;
451	if(this.errorMessage != null) {
452	this.errorMessage = null;
453	}
454	return errMsg;
455	}
456
457	// sets the errorMessage member variable to the data stored in the exception
458	// and writes the errorMessage to the logger and tomcat's System.err
459	protected void handleError(String errorType, SAXParseException exception) {
460	this.errorMessage = errorType + toString(exception);
461	System.err.println("\n**Error parsing xml:\n" + this.errorMessage + "\n**\n");
462	logger.error(this.errorMessage);
463	}
464	}
465	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: