Context Navigation

source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/util/XMLConverter.java@ 24393

Last change on this file since 24393 was 23791, checked in by davidb, 13 years ago
Changes to take advantage of the more efficient Apache Commons string manipulation classes.
Property svn:keywords set to `Author Date Id Revision`
File size: 15.3 KB

Rev	Line
[3235]	1	/*
	2	* XMLConverter.java
	3	* Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
	4	*
	5	* This program is free software; you can redistribute it and/or modify
	6	* it under the terms of the GNU General Public License as published by
	7	* the Free Software Foundation; either version 2 of the License, or
	8	* (at your option) any later version.
	9	*
	10	* This program is distributed in the hope that it will be useful,
	11	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	13	* GNU General Public License for more details.
	14	*
	15	* You should have received a copy of the GNU General Public License
	16	* along with this program; if not, write to the Free Software
	17	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	18	*/
[3222]	19	package org.greenstone.gsdl3.util;
	20
	21	// XML classes
[18434]	22	import org.w3c.dom.DOMImplementation;
[3222]	23	import org.w3c.dom.Document;
[16374]	24	import org.w3c.dom.DocumentType;
[16688]	25	import org.w3c.dom.Element;
[3222]	26	import org.w3c.dom.Node;
[3768]	27	import org.w3c.dom.NodeList;
	28	import org.w3c.dom.NamedNodeMap;
[3222]	29	import org.xml.sax.InputSource;
[5187]	30	import org.xml.sax.EntityResolver;
[18434]	31	import org.xml.sax.ErrorHandler;
	32	import org.xml.sax.SAXParseException;
[3222]	33	import org.apache.xerces.parsers.DOMParser;
[18434]	34	import org.apache.xerces.dom.*; // for new Documents
[3222]	35
	36	// other java classes
	37	import java.io.Reader;
[4087]	38	import java.io.InputStreamReader;
[3222]	39	import java.io.StringReader;
	40	import java.io.File;
[4087]	41	import java.io.FileInputStream;
[3222]	42	import java.io.FileReader;
[23791]	43	import java.util.regex.*;
[3222]	44
[13124]	45	import org.apache.log4j.*;
	46
[23791]	47	// Apache Commons
	48	import org.apache.commons.lang3.*;
	49
	50	import java.util.*;
	51	import java.lang.reflect.*;
	52
[5187]	53	/** XMLConverter - utility class for greenstone
[3222]	54	*
	55	* parses XML Strings into Documents, converts Nodes to Strings
	56	* different parsers have different behaviour - can experiment in here
[3941]	57	* now we only use xerces
[3222]	58	*
	59	*/
	60	public class XMLConverter {
	61
[13210]	62	static Logger logger = Logger.getLogger(org.greenstone.gsdl3.util.XMLConverter.class.getName());
[13124]	63
[3222]	64	/** xerces parser */
[5187]	65	protected DOMParser parser = null;
[3222]	66
[18307]	67	private static boolean outputEscaping = true;
[3768]	68
/**
 * The no-args constructor. Creates the xerces parser, turns off validation,
 * external DTD loading and deferred node expansion, and installs a
 * ParseErrorHandler. Any exception raised while doing so is logged (with
 * its stack trace) rather than propagated.
 */
public XMLConverter() {
    try {
        this.parser = new DOMParser();
        this.parser.setFeature("http://xml.org/sax/features/validation", false);
        // don't try and load external DTD - no need if we are not validating,
        // and may cause connection errors if a proxy is not set up.
        this.parser.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
        // a performance test showed that having this on led to increased
        // memory use for small-medium docs, and not much gain for large
        // docs.
        // http://www.sosnoski.com/opensrc/xmlbench/conclusions.html
        this.parser.setFeature("http://apache.org/xml/features/dom/defer-node-expansion", false);
        // add an error handler to the parser which will store a useful error
        // message on encountering fatal errors, errors and warnings when parsing.
        // This error message can then be converted to xhtml and displayed in a browser.
        this.parser.setErrorHandler(new ParseErrorHandler());
    } catch (Exception e) {
        // pass the exception itself so the stack trace is not lost
        logger.error(e.getMessage(), e);
    }
}
	88
[5187]	89	/** sets the entity resolver. pass in null to unset it */
	90	public void setEntityResolver(EntityResolver er) {
	91	this.parser.setEntityResolver(er);
	92	}
[16688]	93
[16780]	94	/** Given a Node representing an Element or Document, will return the
	95	* Element/docroot Element. Returns null if the Node was not an element. */
	96	public static Element nodeToElement(Node node)
[16688]	97	{
[16999]	98	if(node == null) {
	99	return null;
	100	}
[16688]	101	short nodeType = node.getNodeType();
	102
	103	if (nodeType == Node.DOCUMENT_NODE) {
	104	Document docNode = (Document)node;
	105	return docNode.getDocumentElement() ;
	106	}
	107	else if (nodeType == Node.ELEMENT_NODE) {
	108	return (Element)node;
	109	}
	110	else {
[16780]	111	String message = "Expecting Document or Element node type but got "
	112	+ node.getNodeName() + "\nReturning null";
	113	System.err.println(message);
	114	logger.warn(message);
[16688]	115	return null;
	116	}
	117	}
	118
[3222]	119	/** returns a DOM Document */
	120	public Document getDOM(String in) {
	121
	122	try {
	123	Reader reader = new StringReader(in);
	124	InputSource xml_source = new InputSource(reader);
	125
[5187]	126	this.parser.parse(xml_source);
	127	Document doc = this.parser.getDocument();
[10202]	128
[3222]	129	return doc;
[4087]	130
[3222]	131	} catch (Exception e) {
[13210]	132	logger.error(e.getMessage());
[3222]	133	}
	134	return null;
	135	}
[3235]	136
	137	/** returns a DOM Document */
[3222]	138	public Document getDOM(File in) {
	139	try {
[4087]	140	FileReader reader = new FileReader(in);
[3222]	141	InputSource xml_source = new InputSource(reader);
[5187]	142	this.parser.parse(xml_source);
	143	Document doc = this.parser.getDocument();
[4087]	144	return doc;
	145
	146	} catch (Exception e) {
[13210]	147	logger.error(e.getMessage(), e);
	148
[4087]	149	}
	150	return null;
	151	}
	152
	153	/** returns a DOM document */
	154	public Document getDOM(File in, String encoding) {
	155	try {
	156
	157	InputStreamReader isr = new InputStreamReader(new FileInputStream(in), encoding);
	158	InputSource xml_source = new InputSource(isr);
	159
[5187]	160	this.parser.parse(xml_source);
	161	Document doc = this.parser.getDocument();
[3222]	162
	163	return doc;
	164
	165	} catch (Exception e) {
[13210]	166	logger.error(e.getMessage());
[3222]	167	}
	168	return null;
	169	}
[4087]	170
[3222]	171
	172	/** creates a new empty DOM Document */
[18307]	173	public static Document newDOM() {
[3941]	174	Document doc = new DocumentImpl();
[3222]	175	return doc;
	176	}
[18434]	177
	178	/**
	179	* This method's parameters represent the parts of the Doctype of this
	180	* Document that is to be created.
	181	* For more info see
	182	* http://xerces.apache.org/xerces-j/apiDocs/org/apache/xerces/dom/DocumentTypeImpl.html#DocumentTypeImpl(org.apache.xerces.dom.CoreDocumentImpl,%20java.lang.String)
	183	*
	184	* */
	185	public static Document newDOM(String qualifiedName, String publicID, String systemID) {
	186	// create empty DOM document
	187	DocumentImpl docImpl = new DocumentImpl();
	188
	189	// Need to use the document to create the docType for it
	190	DocumentType myDocType = new DocumentTypeImpl(docImpl, qualifiedName, publicID, systemID);
	191
	192	// Although we have created the docType using the document, we need to still
	193	// put it into the empty document we just created
	194	try{
	195	docImpl.appendChild(myDocType);
	196	} catch(Exception e) {
	197	System.out.println("Could not append docType because: " + e) ;
	198	}
	199
	200	// return the document containing a DocType
	201	return docImpl;
	202	}
[3222]	203
	204	/** returns the Node as a String */
[18307]	205	public static String getString(Node xmlNode)
[3768]	206	{
	207	outputEscaping = true;
[10202]	208	StringBuffer xmlRepresentation = new StringBuffer();
	209	getString(xmlNode, xmlRepresentation, 0, false);
	210	return xmlRepresentation.toString();
[3768]	211	}
	212
[3970]	213	/** returns the node as a nicely formatted String - this introduces extra
	214	* text nodes if the String is read back in as a DOM, so should only be
	215	* used for printing */
[18307]	216	public static String getPrettyString(Node xmlNode) {
[3970]	217
	218	outputEscaping = true;
[10202]	219	StringBuffer xmlRepresentation = new StringBuffer();
	220	getString(xmlNode, xmlRepresentation, 0, true);
	221	return xmlRepresentation.toString();
[3970]	222	}
[3768]	223
[23791]	224	/* For the purposes of logger.debug statements, where this is called and hence outputted,
	225	returns an empty string if debugging is not enabled */
	226	public static String getPrettyStringLogger(Node xmlNode, Logger log) {
	227
	228	if(log.isDebugEnabled())
	229	return getPrettyString(xmlNode);
	230
	231	return "";
	232
	233	}
	234
[18307]	235	private static void getString(Node xmlNode, StringBuffer xmlRepresentation,
[10202]	236	int depth, boolean pretty)
[3768]	237	{
[10202]	238
	239	if (xmlNode == null) {
	240	xmlRepresentation.append("<null>");
	241	return;
	242	}
[3768]	243
	244	short nodeType = xmlNode.getNodeType();
	245	String nodeName = xmlNode.getNodeName();
	246
[9874]	247	if (nodeType == Node.DOCUMENT_NODE) {
[16374]	248	Document xmlDocNode = (Document)xmlNode;
	249
[18434]	250	//if (xmlDocNode.getDoctype() == null) {
	251	//System.err.println("Doctype is null.");
	252	//}
	253	//else {
	254	if (xmlDocNode.getDoctype() != null) {
[16374]	255	DocumentType dt = xmlDocNode.getDoctype();
	256
	257	String name = dt.getName();
	258	String pid = dt.getPublicId();
	259	String sid = dt.getSystemId();
	260
[23791]	261	// Use previously assigned name, not dt.getName() again
	262	String doctype_str = "<!DOCTYPE " + name + " PUBLIC \"" + pid + "\" \"" + sid + "\">\n";
[16374]	263
	264	xmlRepresentation.append(doctype_str);
	265	}
	266	getString(xmlDocNode.getDocumentElement(), xmlRepresentation, depth, pretty);
[10202]	267	return;
[9874]	268	}
[3768]	269	// Handle Element nodes
	270	if (nodeType == Node.ELEMENT_NODE) {
[3970]	271	if (pretty) {
[10202]	272	xmlRepresentation.append("\n");
[3970]	273	for (int i = 0; i < depth; i++) {
[10202]	274	xmlRepresentation.append(" ");
[3970]	275	}
	276	}
[3768]	277
	278	// Write opening tag
[10202]	279	xmlRepresentation.append("<");
	280	xmlRepresentation.append(nodeName);
[3970]	281
[3768]	282	// Write the node attributes
	283	NamedNodeMap nodeAttributes = xmlNode.getAttributes();
	284	for (int i = 0; i < nodeAttributes.getLength(); i++) {
	285	Node attribute = nodeAttributes.item(i);
[10202]	286	xmlRepresentation.append(" ");
	287	xmlRepresentation.append(attribute.getNodeName());
	288	xmlRepresentation.append("=\"");
	289	xmlRepresentation.append(attribute.getNodeValue());
	290	xmlRepresentation.append("\"");
[3768]	291	}
	292
	293	// If the node has no children, close the opening tag and return
	294	if (xmlNode.hasChildNodes() == false) {
	295	// This produces somewhat ugly output, but it is necessary to compensate
	296	// for display bugs in Netscape. Firstly, the space is needed before the
	297	// closing bracket otherwise Netscape will ignore some tags (<br/>, for
	298	// example). Also, a newline character would be expected after the tag,
	299	// but this causes problems with the display of links (the link text
	300	// will contain a newline character, which is displayed badly).
[10202]	301	xmlRepresentation.append(" />");
	302	return;
[3768]	303	}
[3970]	304
[3768]	305	// Close the opening tag
[10202]	306	xmlRepresentation.append(">");
[3970]	307
[3768]	308	// Apply recursively to the children of this node
[4285]	309	// hack for nodes next to text nodes - dont make them pretty
	310	// this is needed for text inside a <pre> element - any new lines
	311	// or spaces around the span elements show up in the text
[3768]	312	NodeList children = xmlNode.getChildNodes();
[4285]	313	boolean do_pretty = pretty;
[3768]	314	for (int i = 0; i < children.getLength(); i++) {
[4285]	315	if (children.item(i).getNodeType()==Node.TEXT_NODE) {
	316	do_pretty=false; // if there is a text node amongst the children, do teh following nodes in non-pretty mode - hope this doesn't stuff up something else
	317	}
[10202]	318	getString(children.item(i), xmlRepresentation, depth + 1, do_pretty);
[3768]	319	}
[3970]	320
[3768]	321	// Write closing tag
[3970]	322	if (pretty) {
[10202]	323	if (xmlRepresentation.charAt(xmlRepresentation.length()-1) == '\n') {
[3970]	324	for (int i = 0; i < depth; i++)
[10202]	325	xmlRepresentation.append(" ");
[3970]	326	}
[3768]	327	}
[10202]	328	xmlRepresentation.append("</");
	329	xmlRepresentation.append(nodeName);
	330	xmlRepresentation.append(">");
[3970]	331	if (pretty) {
[10202]	332	xmlRepresentation.append("\n");
[3970]	333	}
[3501]	334	}
[3970]	335
[3768]	336	// Handle Text nodes
	337	else if (nodeType == Node.TEXT_NODE) {
	338	String text = xmlNode.getNodeValue();
	339
	340	// Perform output escaping, if required
[23791]	341	// Apache Commons replace method is far superior to String.replaceAll - very fast!
[3768]	342	if (outputEscaping) {
[23791]	343
	344	text = StringUtils.replace(text, "&", "&");
	345	text = StringUtils.replace(text, "<", "<");
	346	text = StringUtils.replace(text, ">", ">");
	347	text = StringUtils.replace(text, "'", "'");
	348	text = StringUtils.replace(text, "\"", """);
[3768]	349	}
	350
	351	// Remove any control-C characters
[23791]	352	text = StringUtils.replace(text, "" + (char)3, "");
	353
[10202]	354	xmlRepresentation.append(text);
[3768]	355	}
	356
	357	// Handle Processing Instruction nodes
	358	else if (nodeType == Node.PROCESSING_INSTRUCTION_NODE) {
[4013]	359	if (nodeName.equals("javax.xml.transform.disable-output-escaping")) {
[3768]	360	outputEscaping = false;
	361	}
[4013]	362	else if (nodeName.equals("javax.xml.transform.enable-output-escaping")) {
[3768]	363	outputEscaping = true;
	364	}
	365	else {
[13210]	366	logger.warn("Unhandled processing instruction " + nodeName);
[3768]	367	}
	368	}
[8923]	369
	370	else if (nodeType == Node.COMMENT_NODE) {
	371	String text = xmlNode.getNodeValue();
[10202]	372	xmlRepresentation.append("<!-- ");
	373	xmlRepresentation.append(text);
	374	xmlRepresentation.append(" -->");
[8923]	375	}
	376
	377
	378
[3768]	379	// A type of node that is not handled yet
	380	else {
[13210]	381	logger.warn("Unknown node type: " + nodeType+" "+getNodeTypeString(nodeType));
[3768]	382	}
	383
[10202]	384	return;
[3222]	385	}
[3908]	386
	387	protected static String getNodeTypeString(short node_type) {
	388
	389	String type = "";
	390	switch(node_type) {
	391	case Node.ATTRIBUTE_NODE:
	392	type="ATTRIBUTE_NODE";
	393	break;
	394	case Node.CDATA_SECTION_NODE:
	395	type="CDATA_SECTION_NODE";
	396	break;
	397	case Node.COMMENT_NODE:
	398	type="COMMENT_NODE";
	399	break;
	400	case Node.DOCUMENT_FRAGMENT_NODE:
	401	type="DOCUMENT_FRAGMENT_NODE";
	402	break;
	403	case Node.DOCUMENT_NODE:
	404	type="DOCUMENT_NODE";
	405	break;
	406	case Node.DOCUMENT_TYPE_NODE:
	407	type="DOCUMENT_TYPE_NODE";
	408	break;
	409	case Node.ELEMENT_NODE:
	410	type="ELEMENT_NODE";
	411	break;
	412	case Node.ENTITY_NODE:
	413	type="ENTITY_NODE";
	414	break;
	415	case Node.ENTITY_REFERENCE_NODE:
	416	type="ENTITY_REFERENCE_NODE";
	417	break;
	418	case Node.NOTATION_NODE:
	419	type="NOTATION_NODE";
	420	break;
	421	case Node.PROCESSING_INSTRUCTION_NODE:
	422	type="PROCESSING_INSTRUCTION_NODE";
	423	break;
	424	case Node.TEXT_NODE:
	425	type="TEXT_NODE";
	426	break;
	427	default:
	428	type="UNKNOWN";
	429	}
	430
	431	return type;
	432	}
[18434]	433
	434	// returns null if there no error occurred during parsing, or else returns the error message
	435	public String getParseErrorMessage() {
	436	ParseErrorHandler errorHandler = (ParseErrorHandler)this.parser.getErrorHandler();
	437	return errorHandler.getErrorMessage();
	438	}
	439
	440	// Errorhandler for SAXParseExceptions that are errors, fatal errors or warnings. This class can be used to
	441	// register a handler for any fatal errors, errors and warnings that may occur when parsing an xml file. The
	442	// errors are printed both to the greenstone.log and to the tomcat console (System.err), and the error message
	443	// is stored in the errorMessage variable so that it can be retrieved and be used to generate an xhtml error page.
	444	static public class ParseErrorHandler implements ErrorHandler {
	445	protected String errorMessage = null;
	446
	447	// Receive notification of a recoverable error.
	448	public void error(SAXParseException exception) {
	449	handleError("Error:\n", exception);
	450	}
	451	// Receive notification of a non-recoverable error.
	452	public void fatalError(SAXParseException exception) {
	453	handleError("Fatal Error:\n", exception);
	454	}
	455	// Receive notification of a warning.
	456	public void warning(SAXParseException exception) {
	457	handleError("Warning:\n", exception);
	458	}
	459
	460	public String toString(SAXParseException e) {
	461	String msg = e.getMessage();
	462	msg += "\nOn line(column): " + e.getLineNumber() + "(" + e.getColumnNumber() + ")";
	463	msg += (e.getPublicId() != null) ? ("\npublic ID: " + e.getPublicId()) : "\nNo public ID";
	464	msg += (e.getSystemId() != null) ? ("\nsystem ID: " + e.getSystemId()) : "\nNo system ID";
	465
	466	return msg;
	467	}
	468
	469	// clears the errorPage variable after first call to this method
	470	public String getErrorMessage() {
	471	String errMsg = this.errorMessage;
	472	if(this.errorMessage != null) {
	473	this.errorMessage = null;
	474	}
	475	return errMsg;
	476	}
	477
	478	// sets the errorMessage member variable to the data stored in the exception
	479	// and writes the errorMessage to the logger and tomcat's System.err
	480	protected void handleError(String errorType, SAXParseException exception) {
	481	this.errorMessage = errorType + toString(exception);
	482	System.err.println("\n**Error parsing xml:\n" + this.errorMessage + "\n**\n");
	483	logger.error(this.errorMessage);
	484	}
	485	}
[3222]	486	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: