Context Navigation

source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/util/XMLConverter.java@ 25128

Last change on this file since 25128 was 24863, checked in by sjm84, 12 years ago
XMLConverter no longer keeps one converter around to reuse, it makes a new one every time, this is to avoid an error where the parser is called whilst it is already parsing
Property svn:keywords set to `Author Date Id Revision`
File size: 16.4 KB

Line
1	/*
2	* XMLConverter.java
3	* Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4	*
5	* This program is free software; you can redistribute it and/or modify
6	* it under the terms of the GNU General Public License as published by
7	* the Free Software Foundation; either version 2 of the License, or
8	* (at your option) any later version.
9	*
10	* This program is distributed in the hope that it will be useful,
11	* but WITHOUT ANY WARRANTY; without even the implied warranty of
12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13	* GNU General Public License for more details.
14	*
15	* You should have received a copy of the GNU General Public License
16	* along with this program; if not, write to the Free Software
17	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18	*/
19	package org.greenstone.gsdl3.util;
20
21	// XML classes
22	import org.w3c.dom.DOMImplementation;
23	import org.w3c.dom.Document;
24	import org.w3c.dom.DocumentType;
25	import org.w3c.dom.Element;
26	import org.w3c.dom.Node;
27	import org.w3c.dom.NodeList;
28	import org.w3c.dom.NamedNodeMap;
29	import org.xml.sax.InputSource;
30	import org.xml.sax.EntityResolver;
31	import org.xml.sax.ErrorHandler;
32	import org.xml.sax.SAXParseException;
33	import org.apache.xerces.parsers.DOMParser;
34	import org.apache.xerces.dom.*; // for new Documents
35
36	// other java classes
37	import java.io.Reader;
38	import java.io.InputStreamReader;
39	import java.io.StringReader;
40	import java.io.File;
41	import java.io.FileInputStream;
42	import java.io.FileReader;
43	import java.util.regex.*;
44
45	import org.apache.log4j.*;
46
47	// Apache Commons
48	import org.apache.commons.lang3.*;
49
50	import java.util.*;
51	import java.lang.reflect.*;
52
53	/**
54	* XMLConverter - utility class for Greenstone.
55	*
56	* Parses XML Strings into Documents and converts Nodes to Strings. Different
57	* parsers have different behaviour; we can experiment in here now that we
58	* only use Xerces.
59	*
60	*/
61	public class XMLConverter
62	{
63
64	static Logger logger = Logger.getLogger(org.greenstone.gsdl3.util.XMLConverter.class.getName());
65
66	protected EntityResolver resolver = null;
67
68	/** xerces parser */
69	protected DOMParser parser = null;
70
71	private static boolean outputEscaping = true;
72
/**
 * The no-args constructor. Creates the Xerces parser held in {@code parser},
 * switches off validation, external DTD loading and deferred node expansion,
 * and registers a {@link ParseErrorHandler} on it. Any exception raised while
 * doing so is logged rather than thrown, in which case {@code parser} may be
 * left null or only partly configured.
 */
public XMLConverter()
{
	try
	{
		this.parser = new DOMParser();
		this.parser.setFeature("http://xml.org/sax/features/validation", false);
		// don't try and load external DTD - no need if we are not validating, and may cause connection errors if a proxy is not set up.
		this.parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
		// a performance test showed that having this on lead to increased
		// memory use for small-medium docs, and not much gain for large
		// docs.
		// http://www.sosnoski.com/opensrc/xmlbench/conclusions.html
		this.parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
		// add an error handler to the parser which will store a useful error message on encountering fatal errors, errors and warnings when parsing
		// this error message can then be converted to xhtml and displayed in a browser.
		this.parser.setErrorHandler(new ParseErrorHandler());
	}
	catch (Exception e)
	{
		// hand the exception itself to the logger so the stack trace is not lost
		logger.error(e.getMessage(), e);
	}
}
96
97	/** sets the entity resolver. pass in null to unset it */
98	public void setEntityResolver(EntityResolver er)
99	{
100	this.resolver = er;
101	this.parser.setEntityResolver(er);
102	}
103
104	/**
105	* Given a Node representing an Element or Document, will return the
106	* Element/docroot Element. Returns null if the Node was not an element.
107	*/
108	public static Element nodeToElement(Node node)
109	{
110	if (node == null)
111	{
112	return null;
113	}
114	short nodeType = node.getNodeType();
115
116	if (nodeType == Node.DOCUMENT_NODE)
117	{
118	Document docNode = (Document) node;
119	return docNode.getDocumentElement();
120	}
121	else if (nodeType == Node.ELEMENT_NODE)
122	{
123	return (Element) node;
124	}
125	else
126	{
127	String message = "Expecting Document or Element node type but got " + node.getNodeName() + "\nReturning null";
128	System.err.println(message);
129	logger.warn(message);
130	return null;
131	}
132	}
133
134	/** returns a DOM Document */
135	public Document getDOM(String in)
136	{
137
138	try
139	{
140	Reader reader = new StringReader(in);
141	InputSource xml_source = new InputSource(reader);
142
143	DOMParser parser = new DOMParser();
144	parser.setFeature("http://xml.org/sax/features/validation", false);
145	parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
146	parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
147	if(resolver != null)
148	{
149	parser.setEntityResolver(this.resolver);
150	}
151	parser.setErrorHandler(new ParseErrorHandler());
152	parser.parse(xml_source);
153
154	Document doc = parser.getDocument();
155
156	return doc;
157
158	}
159	catch (Exception e)
160	{
161	logger.error(e.getMessage());
162	}
163	return null;
164	}
165
166	/** returns a DOM Document */
167	public Document getDOM(File in)
168	{
169	try
170	{
171	FileReader reader = new FileReader(in);
172	InputSource xml_source = new InputSource(reader);
173
174	DOMParser parser = new DOMParser();
175	parser.setFeature("http://xml.org/sax/features/validation", false);
176	parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
177	parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
178	if(resolver != null)
179	{
180	parser.setEntityResolver(this.resolver);
181	}
182	parser.setErrorHandler(new ParseErrorHandler());
183	parser.parse(xml_source);
184
185	Document doc = parser.getDocument();
186	return doc;
187
188	}
189	catch (Exception e)
190	{
191	logger.error(e.getMessage(), e);
192
193	}
194	return null;
195	}
196
197	/** returns a DOM document */
198	public Document getDOM(File in, String encoding)
199	{
200	try
201	{
202
203	InputStreamReader isr = new InputStreamReader(new FileInputStream(in), encoding);
204	InputSource xml_source = new InputSource(isr);
205
206	DOMParser parser = new DOMParser();
207	parser.setFeature("http://xml.org/sax/features/validation", false);
208	parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
209	parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
210	if(resolver != null)
211	{
212	parser.setEntityResolver(this.resolver);
213	}
214	parser.setErrorHandler(new ParseErrorHandler());
215	parser.parse(xml_source);
216
217	Document doc = parser.getDocument();
218
219	return doc;
220
221	}
222	catch (Exception e)
223	{
224	logger.error(e.getMessage());
225	}
226	return null;
227	}
228
229	/** creates a new empty DOM Document */
230	public static Document newDOM()
231	{
232	Document doc = new DocumentImpl();
233	return doc;
234	}
235
236	/**
237	* This method's parameters represent the parts of the Doctype of this
238	* Document that is to be created. For more info see
239	* http://xerces.apache.org
240	* /xerces-j/apiDocs/org/apache/xerces/dom/DocumentTypeImpl
241	* .html#DocumentTypeImpl
242	* (org.apache.xerces.dom.CoreDocumentImpl,%20java.lang.String)
243	*
244	* */
245	public static Document newDOM(String qualifiedName, String publicID, String systemID)
246	{
247	// create empty DOM document
248	DocumentImpl docImpl = new DocumentImpl();
249
250	// Need to use the document to create the docType for it
251	DocumentType myDocType = new DocumentTypeImpl(docImpl, qualifiedName, publicID, systemID);
252
253	// Although we have created the docType using the document, we need to still
254	// put it into the empty document we just created
255	try
256	{
257	docImpl.appendChild(myDocType);
258	}
259	catch (Exception e)
260	{
261	System.out.println("Could not append docType because: " + e);
262	}
263
264	// return the document containing a DocType
265	return docImpl;
266	}
267
268	/** returns the Node as a String */
269	public static String getString(Node xmlNode)
270	{
271	outputEscaping = true;
272	StringBuffer xmlRepresentation = new StringBuffer();
273	getString(xmlNode, xmlRepresentation, 0, false);
274	return xmlRepresentation.toString();
275	}
276
277	/**
278	* returns the node as a nicely formatted String - this introduces extra
279	* text nodes if the String is read back in as a DOM, so should only be used
280	* for printing
281	*/
282	public static String getPrettyString(Node xmlNode)
283	{
284
285	outputEscaping = true;
286	StringBuffer xmlRepresentation = new StringBuffer();
287	getString(xmlNode, xmlRepresentation, 0, true);
288	return xmlRepresentation.toString();
289	}
290
291	/*
292	* For the purposes of logger.debug statements, where this is called and
293	* hence outputted, returns an empty string if debugging is not enabled
294	*/
295	public static String getPrettyStringLogger(Node xmlNode, Logger log)
296	{
297
298	if (log.isDebugEnabled())
299	return getPrettyString(xmlNode);
300
301	return "";
302
303	}
304
305	private static void getString(Node xmlNode, StringBuffer xmlRepresentation, int depth, boolean pretty)
306	{
307
308	if (xmlNode == null)
309	{
310	xmlRepresentation.append("<null>");
311	return;
312	}
313
314	short nodeType = xmlNode.getNodeType();
315	String nodeName = xmlNode.getNodeName();
316
317	if (nodeType == Node.DOCUMENT_NODE)
318	{
319	Document xmlDocNode = (Document) xmlNode;
320
321	//if (xmlDocNode.getDoctype() == null) {
322	//System.err.println("Doctype is null.");
323	//}
324	//else {
325	if (xmlDocNode.getDoctype() != null)
326	{
327	DocumentType dt = xmlDocNode.getDoctype();
328
329	String name = dt.getName();
330	String pid = dt.getPublicId();
331	String sid = dt.getSystemId();
332
333	// Use previously assigned name, not dt.getName() again
334	String doctype_str = "<!DOCTYPE " + name + " PUBLIC \"" + pid + "\" \"" + sid + "\">\n";
335
336	xmlRepresentation.append(doctype_str);
337	}
338	getString(xmlDocNode.getDocumentElement(), xmlRepresentation, depth, pretty);
339	return;
340	}
341	// Handle Element nodes
342	if (nodeType == Node.ELEMENT_NODE)
343	{
344	if (pretty)
345	{
346	xmlRepresentation.append("\n");
347	for (int i = 0; i < depth; i++)
348	{
349	xmlRepresentation.append(" ");
350	}
351	}
352
353	// Write opening tag
354	xmlRepresentation.append("<");
355	xmlRepresentation.append(nodeName);
356
357	// Write the node attributes
358	NamedNodeMap nodeAttributes = xmlNode.getAttributes();
359	for (int i = 0; i < nodeAttributes.getLength(); i++)
360	{
361	Node attribute = nodeAttributes.item(i);
362	xmlRepresentation.append(" ");
363	xmlRepresentation.append(attribute.getNodeName());
364	xmlRepresentation.append("=\"");
365	xmlRepresentation.append(attribute.getNodeValue());
366	xmlRepresentation.append("\"");
367	}
368
369	// If the node has no children, close the opening tag and return
370	if (xmlNode.hasChildNodes() == false)
371	{
372	// This produces somewhat ugly output, but it is necessary to compensate
373	// for display bugs in Netscape. Firstly, the space is needed before the
374	// closing bracket otherwise Netscape will ignore some tags (<br/>, for
375	// example). Also, a newline character would be expected after the tag,
376	// but this causes problems with the display of links (the link text
377	// will contain a newline character, which is displayed badly).
378	xmlRepresentation.append(" />");
379	return;
380	}
381
382	// Close the opening tag
383	xmlRepresentation.append(">");
384
385	// Apply recursively to the children of this node
386	// hack for nodes next to text nodes - dont make them pretty
387	// this is needed for text inside a <pre> element - any new lines
388	// or spaces around the span elements show up in the text
389	NodeList children = xmlNode.getChildNodes();
390	boolean do_pretty = pretty;
391	for (int i = 0; i < children.getLength(); i++)
392	{
393	if (children.item(i).getNodeType() == Node.TEXT_NODE)
394	{
395	do_pretty = false; // if there is a text node amongst the children, do teh following nodes in non-pretty mode - hope this doesn't stuff up something else
396	}
397	getString(children.item(i), xmlRepresentation, depth + 1, do_pretty);
398	}
399
400	// Write closing tag
401	if (pretty)
402	{
403	if (xmlRepresentation.charAt(xmlRepresentation.length() - 1) == '\n')
404	{
405	for (int i = 0; i < depth; i++)
406	xmlRepresentation.append(" ");
407	}
408	}
409	xmlRepresentation.append("</");
410	xmlRepresentation.append(nodeName);
411	xmlRepresentation.append(">");
412	if (pretty)
413	{
414	xmlRepresentation.append("\n");
415	}
416	}
417
418	// Handle Text nodes
419	else if (nodeType == Node.TEXT_NODE)
420	{
421	String text = xmlNode.getNodeValue();
422
423	// Perform output escaping, if required
424	// Apache Commons replace method is far superior to String.replaceAll - very fast!
425	if (outputEscaping)
426	{
427
428	text = StringUtils.replace(text, "&", "&");
429	text = StringUtils.replace(text, "<", "<");
430	text = StringUtils.replace(text, ">", ">");
431	text = StringUtils.replace(text, "'", "'");
432	text = StringUtils.replace(text, "\"", """);
433	}
434
435	// Remove any control-C characters
436	text = StringUtils.replace(text, "" + (char) 3, "");
437
438	xmlRepresentation.append(text);
439	}
440
441	// Handle Processing Instruction nodes
442	else if (nodeType == Node.PROCESSING_INSTRUCTION_NODE)
443	{
444	if (nodeName.equals("javax.xml.transform.disable-output-escaping"))
445	{
446	outputEscaping = false;
447	}
448	else if (nodeName.equals("javax.xml.transform.enable-output-escaping"))
449	{
450	outputEscaping = true;
451	}
452	else
453	{
454	logger.warn("Unhandled processing instruction " + nodeName);
455	}
456	}
457
458	else if (nodeType == Node.COMMENT_NODE)
459	{
460	String text = xmlNode.getNodeValue();
461	xmlRepresentation.append("<!-- ");
462	xmlRepresentation.append(text);
463	xmlRepresentation.append(" -->");
464	}
465
466	// A type of node that is not handled yet
467	else
468	{
469	logger.warn("Unknown node type: " + nodeType + " " + getNodeTypeString(nodeType));
470	}
471
472	return;
473	}
474
475	protected static String getNodeTypeString(short node_type)
476	{
477
478	String type = "";
479	switch (node_type)
480	{
481	case Node.ATTRIBUTE_NODE:
482	type = "ATTRIBUTE_NODE";
483	break;
484	case Node.CDATA_SECTION_NODE:
485	type = "CDATA_SECTION_NODE";
486	break;
487	case Node.COMMENT_NODE:
488	type = "COMMENT_NODE";
489	break;
490	case Node.DOCUMENT_FRAGMENT_NODE:
491	type = "DOCUMENT_FRAGMENT_NODE";
492	break;
493	case Node.DOCUMENT_NODE:
494	type = "DOCUMENT_NODE";
495	break;
496	case Node.DOCUMENT_TYPE_NODE:
497	type = "DOCUMENT_TYPE_NODE";
498	break;
499	case Node.ELEMENT_NODE:
500	type = "ELEMENT_NODE";
501	break;
502	case Node.ENTITY_NODE:
503	type = "ENTITY_NODE";
504	break;
505	case Node.ENTITY_REFERENCE_NODE:
506	type = "ENTITY_REFERENCE_NODE";
507	break;
508	case Node.NOTATION_NODE:
509	type = "NOTATION_NODE";
510	break;
511	case Node.PROCESSING_INSTRUCTION_NODE:
512	type = "PROCESSING_INSTRUCTION_NODE";
513	break;
514	case Node.TEXT_NODE:
515	type = "TEXT_NODE";
516	break;
517	default:
518	type = "UNKNOWN";
519	}
520
521	return type;
522	}
523
524	// returns null if there no error occurred during parsing, or else returns the error message
525	public String getParseErrorMessage()
526	{
527	ParseErrorHandler errorHandler = (ParseErrorHandler) this.parser.getErrorHandler();
528	return errorHandler.getErrorMessage();
529	}
530
531	// Errorhandler for SAXParseExceptions that are errors, fatal errors or warnings. This class can be used to
532	// register a handler for any fatal errors, errors and warnings that may occur when parsing an xml file. The
533	// errors are printed both to the greenstone.log and to the tomcat console (System.err), and the error message
534	// is stored in the errorMessage variable so that it can be retrieved and be used to generate an xhtml error page.
535	static public class ParseErrorHandler implements ErrorHandler
536	{
537	protected String errorMessage = null;
538
539	// Receive notification of a recoverable error.
540	public void error(SAXParseException exception)
541	{
542	handleError("Error:\n", exception);
543	}
544
545	// Receive notification of a non-recoverable error.
546	public void fatalError(SAXParseException exception)
547	{
548	handleError("Fatal Error:\n", exception);
549	}
550
551	// Receive notification of a warning.
552	public void warning(SAXParseException exception)
553	{
554	handleError("Warning:\n", exception);
555	}
556
557	public String toString(SAXParseException e)
558	{
559	String msg = e.getMessage();
560	msg += "\nOn line(column): " + e.getLineNumber() + "(" + e.getColumnNumber() + ")";
561	msg += (e.getPublicId() != null) ? ("\npublic ID: " + e.getPublicId()) : "\nNo public ID";
562	msg += (e.getSystemId() != null) ? ("\nsystem ID: " + e.getSystemId()) : "\nNo system ID";
563
564	return msg;
565	}
566
567	// clears the errorPage variable after first call to this method
568	public String getErrorMessage()
569	{
570	String errMsg = this.errorMessage;
571	if (this.errorMessage != null)
572	{
573	this.errorMessage = null;
574	}
575	return errMsg;
576	}
577
578	// sets the errorMessage member variable to the data stored in the exception
579	// and writes the errorMessage to the logger and tomcat's System.err
580	protected void handleError(String errorType, SAXParseException exception)
581	{
582	this.errorMessage = errorType + toString(exception);
583	System.err.println("\n**Error parsing xml:\n" + this.errorMessage + "\n**\n");
584	logger.error(this.errorMessage);
585	}
586	}
587	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: